Merge pull request #10 from Zanzarchik/master

Add an implementation of the TokenFilterFactory for the MorphologyFilter
This commit is contained in:
Alexander Kuznetsov 2017-01-25 16:53:23 +03:00 committed by GitHub
commit b180862572
5 changed files with 202 additions and 0 deletions

View File

@ -62,6 +62,21 @@ Also if you need get a list of base forms of word, you can use following example
LuceneMorphology luceneMorph = new EnglishLuceneMorphology();
List<String> wordBaseForms = luceneMorph.getMorphInfo(word);
### Solr
You can use LuceneMorphology as a morphology filter in a Solr _schema.xml_ by means of the **MorphologyFilterFactory**:
```xml
<fieldType name="content" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="org.apache.lucene.analysis.morphology.MorphologyFilterFactory" language="Russian"/>
<filter class="org.apache.lucene.analysis.morphology.MorphologyFilterFactory" language="English"/>
</analyzer>
</fieldType>
```
Just add _morphology-1.3.jar_ to one of your Solr lib directories.
### Restrictions

View File

@ -17,6 +17,8 @@
<properties>
<lucene.version>6.2.0</lucene.version>
<morphology.version>1.3-SNAPSHOT</morphology.version>
<junit.version>4.8.2</junit.version>
</properties>
<distributionManagement>
@ -168,5 +170,6 @@
<module>dictionary-reader</module>
<module>russian</module>
<module>english</module>
<module>solr-morphology-analysis</module>
</modules>
</project>

View File

@ -0,0 +1,40 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven descriptor of the "solr-morphology-analysis" module. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Inherits from the parent "morphology" project. -->
<parent>
<artifactId>morphology</artifactId>
<groupId>org.apache.lucene.morphology</groupId>
<version>1.3-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<!-- This module's own coordinates: the groupId differs from the parent's, the artifactId is the same. -->
<groupId>org.apache.lucene.analysis</groupId>
<artifactId>morphology</artifactId>
<name>solr-morphology-analysis</name>
<!-- Version is taken from the morphology.version property. -->
<version>${morphology.version}</version>
<url>http://maven.apache.org</url>
<dependencies>
<!-- Russian morphology module. -->
<dependency>
<groupId>org.apache.lucene.morphology</groupId>
<artifactId>russian</artifactId>
<version>${morphology.version}</version>
</dependency>
<!-- English morphology module. -->
<dependency>
<groupId>org.apache.lucene.morphology</groupId>
<artifactId>english</artifactId>
<version>${morphology.version}</version>
</dependency>
<!-- JUnit is needed for tests only. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>${junit.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,69 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.morphology;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.morphology.LuceneMorphology;
import org.apache.lucene.morphology.analyzer.MorphologyFilter;
import java.util.Locale;
import java.util.Map;
/**
* Factory for {@link MorphologyFilter}, with configurable language
* <p>
* <b>Note:</b> Two languages are available now: English (default value) and Russian.
* <pre class="prettyprint">
* &lt;fieldType name="content" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
* &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
* &lt;filter class="solr.MorphologyFilterFactory" language="English"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class MorphologyFilterFactory extends TokenFilterFactory implements ResourceLoaderAware{
private static final String LANGUAGE_KEY = "language";
private String language;
private LuceneMorphology luceneMorphology;
public MorphologyFilterFactory(Map<String, String> args) {
super(args);
language = get(args, LANGUAGE_KEY, "English");
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
public TokenStream create(TokenStream input) {
return new MorphologyFilter(input, luceneMorphology);
}
public void inform(ResourceLoader loader) {
String className = "org.apache.lucene.morphology." + language.toLowerCase() + "." + language + "LuceneMorphology";
luceneMorphology = loader.newInstance(className, LuceneMorphology.class);
}
public LuceneMorphology getLuceneMorphology() {
return luceneMorphology;
}
}

View File

@ -0,0 +1,75 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.morphology;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.morphology.LuceneMorphology;
import org.apache.lucene.morphology.english.EnglishLuceneMorphology;
import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import java.util.HashMap;
import java.util.Map;
/**
 * Checks that {@link MorphologyFilterFactory} loads the morphology implementation
 * that corresponds to its {@code language} argument.
 */
public class MorphologyFilterFactoryTest {

    private static final String LANGUAGE_KEY = "language";

    private final ResourceLoader loader = new ClasspathResourceLoader();
    private Map<String, String> args;

    /** Starts every test with an empty argument map. */
    @Before
    public void setUp() {
        args = new HashMap<>();
    }

    @Test
    public void if_RussianLanguageKey_then_CreateRussianMorphologyFilter() {
        args.put(LANGUAGE_KEY, "Russian");

        LuceneMorphology loaded = loadMorphology();

        Assert.assertTrue("Creation the MorphologyFilterFactory with a Russian language key",
                loaded instanceof RussianLuceneMorphology);
    }

    @Test
    public void if_EnglishLanguageKey_then_CreateEnglishMorphologyFilter() {
        args.put(LANGUAGE_KEY, "English");

        LuceneMorphology loaded = loadMorphology();

        Assert.assertTrue("Creation the MorphologyFilterFactory with a English language key",
                loaded instanceof EnglishLuceneMorphology);
    }

    @Test
    public void if_NoLanguageKey_then_CreateEnglishMorphologyFilter() {
        LuceneMorphology loaded = loadMorphology();

        Assert.assertTrue("Creation the MorphologyFilterFactory without any language keys",
                loaded instanceof EnglishLuceneMorphology);
    }

    /** Builds a factory from the current arguments, informs it, and returns its morphology. */
    private LuceneMorphology loadMorphology() {
        MorphologyFilterFactory factory = new MorphologyFilterFactory(args);
        factory.inform(loader);
        return factory.getLuceneMorphology();
    }
}