Improve incrementToken implementation in MorphologyFilter
The current implementation has two defects: it does not preserve the attributes of repeated tokens (the additional normal forms emitted for a single input token), and it stems tokens that are marked as keywords.
This commit is contained in:
parent
6ca2b27781
commit
db144bf2ec
@ -16,8 +16,15 @@
|
|||||||
package org.apache.lucene.morphology;
|
package org.apache.lucene.morphology;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.lucene.morphology.analyzer.MorphologyAnalyzer;
|
import org.apache.lucene.morphology.analyzer.MorphologyAnalyzer;
|
||||||
import org.apache.lucene.morphology.analyzer.MorphologyFilter;
|
import org.apache.lucene.morphology.analyzer.MorphologyFilter;
|
||||||
import org.apache.lucene.morphology.english.EnglishAnalyzer;
|
import org.apache.lucene.morphology.english.EnglishAnalyzer;
|
||||||
@ -31,10 +38,9 @@ import java.io.*;
|
|||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
import static org.hamcrest.Matchers.equalTo;
|
import static org.hamcrest.Matchers.equalTo;
|
||||||
import static org.junit.Assert.assertThat;
|
|
||||||
|
|
||||||
|
|
||||||
public class AnalyzersTest {
|
public class AnalyzersTest extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void shouldGiveCorrectWordsForEnglish() throws IOException {
|
public void shouldGiveCorrectWordsForEnglish() throws IOException {
|
||||||
@ -116,4 +122,44 @@ public class AnalyzersTest {
|
|||||||
|
|
||||||
assertThat(result, equalTo(answer));
|
assertThat(result, equalTo(answer));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testPositionIncrement() throws IOException {
|
||||||
|
EnglishAnalyzer englishAnalyzer = new EnglishAnalyzer();
|
||||||
|
assertTokenStreamContents(
|
||||||
|
englishAnalyzer.tokenStream("test", "There are tests!"),
|
||||||
|
new String[]{"there", "are", "be", "test"},
|
||||||
|
new int[]{0, 6, 6, 10},
|
||||||
|
new int[]{5, 9, 9, 15},
|
||||||
|
new String[]{"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>"},
|
||||||
|
new int[]{1, 1, 0, 1}
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testKeywordHandling() throws IOException {
|
||||||
|
Analyzer analyzer = new EnglishKeywordTestAnalyzer();
|
||||||
|
assertTokenStreamContents(
|
||||||
|
analyzer.tokenStream("test", "Tests shouldn't be stemmed, but tests should!"),
|
||||||
|
new String[]{"tests", "shouldn't", "be", "stem", "but", "test", "shall"}
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class EnglishKeywordTestAnalyzer extends Analyzer {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String s) {
|
||||||
|
StandardTokenizer src = new StandardTokenizer();
|
||||||
|
TokenFilter filter = new StandardFilter(src);
|
||||||
|
CharArraySet dontStem = new CharArraySet(1, false);
|
||||||
|
dontStem.add("Tests");
|
||||||
|
filter = new SetKeywordMarkerFilter(filter, dontStem);
|
||||||
|
filter = new LowerCaseFilter(filter);
|
||||||
|
try {
|
||||||
|
filter = new MorphologyFilter(filter, new EnglishLuceneMorphology());
|
||||||
|
} catch (IOException ex) {
|
||||||
|
throw new RuntimeException("cannot create EnglishLuceneMorphology", ex);
|
||||||
|
}
|
||||||
|
return new TokenStreamComponents(src, filter);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -19,11 +19,5 @@
|
|||||||
<version>1.2-SNAPSHOT</version>
|
<version>1.2-SNAPSHOT</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>junit</groupId>
|
|
||||||
<artifactId>junit</artifactId>
|
|
||||||
<version>4.8.2</version>
|
|
||||||
<scope>test</scope>
|
|
||||||
</dependency>
|
|
||||||
</dependencies>
|
</dependencies>
|
||||||
</project>
|
</project>
|
@ -19,18 +19,22 @@ package org.apache.lucene.morphology.analyzer;
|
|||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
import org.apache.lucene.morphology.LuceneMorphology;
|
import org.apache.lucene.morphology.LuceneMorphology;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
|
||||||
public class MorphologyFilter extends TokenFilter {
|
public class MorphologyFilter extends TokenFilter {
|
||||||
private LuceneMorphology luceneMorph;
|
private LuceneMorphology luceneMorph;
|
||||||
private Iterator<String> iterator;
|
private Iterator<String> iterator;
|
||||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||||
private final PositionIncrementAttribute position = addAttribute(PositionIncrementAttribute.class);
|
private final PositionIncrementAttribute position = addAttribute(PositionIncrementAttribute.class);
|
||||||
|
private State state = null;
|
||||||
|
|
||||||
public MorphologyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) {
|
public MorphologyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) {
|
||||||
super(tokenStream);
|
super(tokenStream);
|
||||||
@ -39,27 +43,39 @@ public class MorphologyFilter extends TokenFilter {
|
|||||||
|
|
||||||
|
|
||||||
final public boolean incrementToken() throws IOException {
|
final public boolean incrementToken() throws IOException {
|
||||||
boolean oldToken = true;
|
if (iterator != null) {
|
||||||
while (iterator == null || !iterator.hasNext()) {
|
if (iterator.hasNext()) {
|
||||||
|
restoreState(state);
|
||||||
|
position.setPositionIncrement(0);
|
||||||
|
termAtt.setEmpty().append(iterator.next());
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
state = null;
|
||||||
|
iterator = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
while (true) {
|
||||||
boolean b = input.incrementToken();
|
boolean b = input.incrementToken();
|
||||||
if (!b) {
|
if (!b) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
if (!keywordAttr.isKeyword() && termAtt.length() > 0) {
|
||||||
String s = new String(termAtt.buffer(), 0, termAtt.length());
|
String s = new String(termAtt.buffer(), 0, termAtt.length());
|
||||||
if (luceneMorph.checkString(s)) {
|
if (luceneMorph.checkString(s)) {
|
||||||
oldToken = false;
|
List<String> forms = luceneMorph.getNormalForms(s);
|
||||||
iterator = luceneMorph.getNormalForms(s).iterator();
|
if (forms.isEmpty()) {
|
||||||
|
continue;
|
||||||
|
} else if (forms.size() == 1) {
|
||||||
|
termAtt.setEmpty().append(forms.get(0));
|
||||||
} else {
|
} else {
|
||||||
return true;
|
state = captureState();
|
||||||
|
iterator = forms.iterator();
|
||||||
|
termAtt.setEmpty().append(iterator.next());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
String s = iterator.next();
|
|
||||||
termAtt.setEmpty();
|
|
||||||
termAtt.append(s);
|
|
||||||
if (oldToken) {
|
|
||||||
position.setPositionIncrement(0);
|
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
14
pom.xml
14
pom.xml
@ -15,6 +15,10 @@
|
|||||||
<tag>HEAD</tag>
|
<tag>HEAD</tag>
|
||||||
</scm>
|
</scm>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<lucene.version>5.1.0</lucene.version>
|
||||||
|
</properties>
|
||||||
|
|
||||||
<distributionManagement>
|
<distributionManagement>
|
||||||
<repository>
|
<repository>
|
||||||
<id>bintray</id>
|
<id>bintray</id>
|
||||||
@ -24,9 +28,9 @@
|
|||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>junit</groupId>
|
<groupId>org.apache.lucene</groupId>
|
||||||
<artifactId>junit</artifactId>
|
<artifactId>lucene-test-framework</artifactId>
|
||||||
<version>4.8.2</version>
|
<version>${lucene.version}</version>
|
||||||
<scope>test</scope>
|
<scope>test</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
@ -38,12 +42,12 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.lucene</groupId>
|
<groupId>org.apache.lucene</groupId>
|
||||||
<artifactId>lucene-core</artifactId>
|
<artifactId>lucene-core</artifactId>
|
||||||
<version>5.1.0</version>
|
<version>${lucene.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.lucene</groupId>
|
<groupId>org.apache.lucene</groupId>
|
||||||
<artifactId>lucene-analyzers-common</artifactId>
|
<artifactId>lucene-analyzers-common</artifactId>
|
||||||
<version>5.1.0</version>
|
<version>${lucene.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user