Improve incrementToken implementation in MorphologyFilter

The current implementation doesn't preserve the attributes of repeated tokens, and it stems tokens that are marked as keywords.
This commit is contained in:
Igor Motov 2015-07-03 16:48:02 -04:00
parent 6ca2b27781
commit db144bf2ec
4 changed files with 88 additions and 28 deletions

View File

@ -16,8 +16,15 @@
package org.apache.lucene.morphology; package org.apache.lucene.morphology;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.morphology.analyzer.MorphologyAnalyzer; import org.apache.lucene.morphology.analyzer.MorphologyAnalyzer;
import org.apache.lucene.morphology.analyzer.MorphologyFilter; import org.apache.lucene.morphology.analyzer.MorphologyFilter;
import org.apache.lucene.morphology.english.EnglishAnalyzer; import org.apache.lucene.morphology.english.EnglishAnalyzer;
@ -31,10 +38,9 @@ import java.io.*;
import java.util.*; import java.util.*;
import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.equalTo;
import static org.junit.Assert.assertThat;
public class AnalyzersTest { public class AnalyzersTest extends BaseTokenStreamTestCase {
@Test @Test
public void shouldGiveCorrectWordsForEnglish() throws IOException { public void shouldGiveCorrectWordsForEnglish() throws IOException {
@ -116,4 +122,44 @@ public class AnalyzersTest {
assertThat(result, equalTo(answer)); assertThat(result, equalTo(answer));
} }
/**
 * Runs "There are tests!" through {@link EnglishAnalyzer} and checks the
 * emitted terms together with their start/end offsets, token types and
 * position increments. The additional form "be" is expected to share the
 * offsets of "are" and to carry a position increment of 0.
 */
@Test
public void testPositionIncrement() throws IOException {
    EnglishAnalyzer analyzer = new EnglishAnalyzer();

    String[] expectedTerms = {"there", "are", "be", "test"};
    int[] expectedStartOffsets = {0, 6, 6, 10};
    int[] expectedEndOffsets = {5, 9, 9, 15};
    String[] expectedTypes = {"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>"};
    int[] expectedPositionIncrements = {1, 1, 0, 1};

    assertTokenStreamContents(
            analyzer.tokenStream("test", "There are tests!"),
            expectedTerms,
            expectedStartOffsets,
            expectedEndOffsets,
            expectedTypes,
            expectedPositionIncrements);
}
/**
 * Feeds a sentence through {@code EnglishKeywordTestAnalyzer} and checks the
 * resulting terms: the capitalised "Tests" is expected to come out as
 * "tests" (not reduced to a normal form), whereas the lower-case "tests"
 * later in the sentence is expected to come out as "test".
 */
@Test
public void testKeywordHandling() throws IOException {
    Analyzer keywordAwareAnalyzer = new EnglishKeywordTestAnalyzer();
    String sentence = "Tests shouldn't be stemmed, but tests should!";
    String[] expectedTerms = {"tests", "shouldn't", "be", "stem", "but", "test", "shall"};

    assertTokenStreamContents(keywordAwareAnalyzer.tokenStream("test", sentence), expectedTerms);
}
/**
 * Test-only analyzer whose chain is: {@code StandardTokenizer} ->
 * {@code StandardFilter} -> {@code SetKeywordMarkerFilter} (keyword set
 * containing only "Tests") -> {@code LowerCaseFilter} ->
 * {@code MorphologyFilter} backed by {@code EnglishLuceneMorphology}.
 */
private static class EnglishKeywordTestAnalyzer extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String s) {
        StandardTokenizer tokenizer = new StandardTokenizer();

        // Keyword set built with ignoreCase == false, holding the single entry "Tests".
        CharArraySet keywords = new CharArraySet(1, false);
        keywords.add("Tests");

        // Keyword marking is applied before lower-casing.
        TokenFilter chain = new LowerCaseFilter(
                new SetKeywordMarkerFilter(new StandardFilter(tokenizer), keywords));

        try {
            chain = new MorphologyFilter(chain, new EnglishLuceneMorphology());
        } catch (IOException ex) {
            // createComponents declares no checked exceptions, so rethrow unchecked with the cause.
            throw new RuntimeException("cannot create EnglishLuceneMorphology", ex);
        }
        return new TokenStreamComponents(tokenizer, chain);
    }
}
} }

View File

@ -19,11 +19,5 @@
<version>1.2-SNAPSHOT</version> <version>1.2-SNAPSHOT</version>
</dependency> </dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.8.2</version>
<scope>test</scope>
</dependency>
</dependencies> </dependencies>
</project> </project>

View File

@ -19,18 +19,22 @@ package org.apache.lucene.morphology.analyzer;
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.morphology.LuceneMorphology; import org.apache.lucene.morphology.LuceneMorphology;
import java.io.IOException; import java.io.IOException;
import java.util.Iterator; import java.util.Iterator;
import java.util.List;
public class MorphologyFilter extends TokenFilter { public class MorphologyFilter extends TokenFilter {
private LuceneMorphology luceneMorph; private LuceneMorphology luceneMorph;
private Iterator<String> iterator; private Iterator<String> iterator;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
private final PositionIncrementAttribute position = addAttribute(PositionIncrementAttribute.class); private final PositionIncrementAttribute position = addAttribute(PositionIncrementAttribute.class);
private State state = null;
public MorphologyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) { public MorphologyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) {
super(tokenStream); super(tokenStream);
@ -39,27 +43,39 @@ public class MorphologyFilter extends TokenFilter {
final public boolean incrementToken() throws IOException { final public boolean incrementToken() throws IOException {
boolean oldToken = true; if (iterator != null) {
while (iterator == null || !iterator.hasNext()) { if (iterator.hasNext()) {
restoreState(state);
position.setPositionIncrement(0);
termAtt.setEmpty().append(iterator.next());
return true;
} else {
state = null;
iterator = null;
}
}
while (true) {
boolean b = input.incrementToken(); boolean b = input.incrementToken();
if (!b) { if (!b) {
return false; return false;
} }
if (!keywordAttr.isKeyword() && termAtt.length() > 0) {
String s = new String(termAtt.buffer(), 0, termAtt.length()); String s = new String(termAtt.buffer(), 0, termAtt.length());
if (luceneMorph.checkString(s)) { if (luceneMorph.checkString(s)) {
oldToken = false; List<String> forms = luceneMorph.getNormalForms(s);
iterator = luceneMorph.getNormalForms(s).iterator(); if (forms.isEmpty()) {
continue;
} else if (forms.size() == 1) {
termAtt.setEmpty().append(forms.get(0));
} else { } else {
return true; state = captureState();
iterator = forms.iterator();
termAtt.setEmpty().append(iterator.next());
} }
} }
String s = iterator.next();
termAtt.setEmpty();
termAtt.append(s);
if (oldToken) {
position.setPositionIncrement(0);
} }
return true; return true;
} }
}
} }

14
pom.xml
View File

@ -15,6 +15,10 @@
<tag>HEAD</tag> <tag>HEAD</tag>
</scm> </scm>
<properties>
<lucene.version>5.1.0</lucene.version>
</properties>
<distributionManagement> <distributionManagement>
<repository> <repository>
<id>bintray</id> <id>bintray</id>
@ -24,9 +28,9 @@
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>junit</groupId> <groupId>org.apache.lucene</groupId>
<artifactId>junit</artifactId> <artifactId>lucene-test-framework</artifactId>
<version>4.8.2</version> <version>${lucene.version}</version>
<scope>test</scope> <scope>test</scope>
</dependency> </dependency>
<dependency> <dependency>
@ -38,12 +42,12 @@
<dependency> <dependency>
<groupId>org.apache.lucene</groupId> <groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId> <artifactId>lucene-core</artifactId>
<version>5.1.0</version> <version>${lucene.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.lucene</groupId> <groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId> <artifactId>lucene-analyzers-common</artifactId>
<version>5.1.0</version> <version>${lucene.version}</version>
</dependency> </dependency>
</dependencies> </dependencies>