Merge pull request #1 from imotov/fix-positions-increments-and-keyword-handling
Improve incrementToken implementation in MorphologyFilter
This commit is contained in:
commit
bda49aad18
@ -16,8 +16,15 @@
|
||||
package org.apache.lucene.morphology;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.morphology.analyzer.MorphologyAnalyzer;
|
||||
import org.apache.lucene.morphology.analyzer.MorphologyFilter;
|
||||
import org.apache.lucene.morphology.english.EnglishAnalyzer;
|
||||
@ -31,10 +38,9 @@ import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
import static org.hamcrest.Matchers.equalTo;
|
||||
import static org.junit.Assert.assertThat;
|
||||
|
||||
|
||||
public class AnalyzersTest {
|
||||
public class AnalyzersTest extends BaseTokenStreamTestCase {
|
||||
|
||||
@Test
|
||||
public void shouldGiveCorrectWordsForEnglish() throws IOException {
|
||||
@ -116,4 +122,44 @@ public class AnalyzersTest {
|
||||
|
||||
assertThat(result, equalTo(answer));
|
||||
}
|
||||
|
||||
// Verifies position increments and offsets produced by EnglishAnalyzer when a
// single input word yields more than one output term.
// For "There are tests!" the expected terms are "there", "are", "be", "test":
// "be" is expected with position increment 0 and the same offsets (6-9) as "are",
// i.e. stacked on the position of "are" rather than occupying a position of its own.
@Test
|
||||
public void testPositionIncrement() throws IOException {
|
||||
EnglishAnalyzer englishAnalyzer = new EnglishAnalyzer();
|
||||
assertTokenStreamContents(
|
||||
englishAnalyzer.tokenStream("test", "There are tests!"),
|
||||
// expected terms, in emission order
new String[]{"there", "are", "be", "test"},
|
||||
// expected start offsets ("are" and "be" both start at 6)
new int[]{0, 6, 6, 10},
|
||||
// expected end offsets ("are" and "be" both end at 9)
new int[]{5, 9, 9, 15},
|
||||
// expected token types
new String[]{"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>"},
|
||||
// expected position increments; the 0 marks "be" as sharing the position of "are"
new int[]{1, 1, 0, 1}
|
||||
);
|
||||
}
|
||||
|
||||
// Verifies that a token marked as a keyword is not replaced by its normal form.
// The analyzer under test (EnglishKeywordTestAnalyzer) marks the exact string "Tests"
// as a keyword before lower-casing, so the first word is expected as "tests" while the
// later lower-case "tests" is expected as "test".
@Test
|
||||
public void testKeywordHandling() throws IOException {
|
||||
Analyzer analyzer = new EnglishKeywordTestAnalyzer();
|
||||
assertTokenStreamContents(
|
||||
analyzer.tokenStream("test", "Tests shouldn't be stemmed, but tests should!"),
|
||||
// expected terms: keyword "Tests" is only lower-cased; "stemmed", "tests" and
// "should" are expected as "stem", "test" and "shall"
new String[]{"tests", "shouldn't", "be", "stem", "but", "test", "shall"}
|
||||
);
|
||||
}
|
||||
|
||||
// Test-only analyzer that builds the chain
//   StandardTokenizer -> StandardFilter -> SetKeywordMarkerFilter -> LowerCaseFilter -> MorphologyFilter
// with "Tests" as the single keyword, so that keyword handling in MorphologyFilter
// can be exercised.
private static class EnglishKeywordTestAnalyzer extends Analyzer {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String s) {
|
||||
StandardTokenizer src = new StandardTokenizer();
|
||||
TokenFilter filter = new StandardFilter(src);
|
||||
// keyword set holding only "Tests"; the second constructor argument is false
// (presumably ignoreCase, so matching is case-sensitive — confirm against CharArraySet docs)
CharArraySet dontStem = new CharArraySet(1, false);
|
||||
dontStem.add("Tests");
|
||||
// keywords are marked before lower-casing, so the capitalised "Tests" can still be matched
filter = new SetKeywordMarkerFilter(filter, dontStem);
|
||||
filter = new LowerCaseFilter(filter);
|
||||
try {
|
||||
filter = new MorphologyFilter(filter, new EnglishLuceneMorphology());
|
||||
} catch (IOException ex) {
|
||||
// createComponents declares no checked exception, so the failure is rethrown unchecked
// with the original exception kept as the cause
throw new RuntimeException("cannot create EnglishLuceneMorphology", ex);
|
||||
}
|
||||
return new TokenStreamComponents(src, filter);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -19,11 +19,5 @@
|
||||
<version>1.2-SNAPSHOT</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>4.8.2</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
@ -19,18 +19,22 @@ package org.apache.lucene.morphology.analyzer;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.morphology.LuceneMorphology;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class MorphologyFilter extends TokenFilter {
|
||||
private LuceneMorphology luceneMorph;
|
||||
private Iterator<String> iterator;
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||
private final PositionIncrementAttribute position = addAttribute(PositionIncrementAttribute.class);
|
||||
private State state = null;
|
||||
|
||||
public MorphologyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) {
|
||||
super(tokenStream);
|
||||
@ -39,27 +43,39 @@ public class MorphologyFilter extends TokenFilter {
|
||||
|
||||
|
||||
// NOTE(review): this span is a rendered diff hunk without +/- markers, so lines of the
// previous and of the new incrementToken() implementation are interleaved and the text
// is not compilable as shown. The comments below mark which side each group of lines
// appears to belong to — confirm against the actual commit.
//
// What the apparently-new lines do: if normal forms are still pending from the last
// input token, restore the captured token state, set the position increment to 0, put
// the next form into the term attribute and return true. Otherwise read the next input
// token; tokens flagged as keywords and empty terms are not looked up. A term accepted
// by luceneMorph.checkString is replaced by its normal form; when there are several
// forms the state is captured and the remaining forms are left in the iterator for
// later calls; when there are none, the token is skipped.
final public boolean incrementToken() throws IOException {
|
||||
// appears to be the previous implementation (removed side): flag and outer loop
boolean oldToken = true;
|
||||
while (iterator == null || !iterator.hasNext()) {
|
||||
// appears to be the new implementation: emit a pending normal form, if any
if (iterator != null) {
|
||||
if (iterator.hasNext()) {
|
||||
// bring back the attributes captured when the token was first read
restoreState(state);
|
||||
// additional forms share the position of the first form
position.setPositionIncrement(0);
|
||||
termAtt.setEmpty().append(iterator.next());
|
||||
return true;
|
||||
} else {
|
||||
// all forms emitted: drop the captured state and the exhausted iterator
state = null;
|
||||
iterator = null;
|
||||
}
|
||||
}
|
||||
while (true) {
|
||||
boolean b = input.incrementToken();
|
||||
if (!b) {
|
||||
return false;
|
||||
}
|
||||
// keyword-marked tokens and empty terms skip the morphology lookup
if (!keywordAttr.isKeyword() && termAtt.length() > 0) {
|
||||
String s = new String(termAtt.buffer(), 0, termAtt.length());
|
||||
if (luceneMorph.checkString(s)) {
|
||||
// next two statements appear to be the previous implementation (removed side)
oldToken = false;
|
||||
iterator = luceneMorph.getNormalForms(s).iterator();
|
||||
// appears to be the new implementation: branch on the number of normal forms
List<String> forms = luceneMorph.getNormalForms(s);
|
||||
if (forms.isEmpty()) {
|
||||
// no normal form: skip this token and read the next one
continue;
|
||||
} else if (forms.size() == 1) {
|
||||
termAtt.setEmpty().append(forms.get(0));
|
||||
} else {
|
||||
// this "return true" appears to be the previous implementation (removed side)
return true;
|
||||
// remember the token's attributes so the remaining forms can be emitted with them
state = captureState();
|
||||
iterator = forms.iterator();
|
||||
termAtt.setEmpty().append(iterator.next());
|
||||
}
|
||||
}
|
||||
// the statements down to the oldToken check appear to be the previous implementation (removed side)
String s = iterator.next();
|
||||
termAtt.setEmpty();
|
||||
termAtt.append(s);
|
||||
if (oldToken) {
|
||||
position.setPositionIncrement(0);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
14
pom.xml
14
pom.xml
@ -15,6 +15,10 @@
|
||||
<tag>HEAD</tag>
|
||||
</scm>
|
||||
|
||||
<properties>
|
||||
<lucene.version>5.1.0</lucene.version>
|
||||
</properties>
|
||||
|
||||
<distributionManagement>
|
||||
<repository>
|
||||
<id>bintray</id>
|
||||
@ -24,9 +28,9 @@
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>4.8.2</version>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-test-framework</artifactId>
|
||||
<version>${lucene.version}</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
@ -38,12 +42,12 @@
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-core</artifactId>
|
||||
<version>5.1.0</version>
|
||||
<version>${lucene.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-analyzers-common</artifactId>
|
||||
<version>5.1.0</version>
|
||||
<version>${lucene.version}</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user