Improve incrementToken implementation in MorphologyFilter
The current implementation does not preserve the attributes of repeated tokens, and it incorrectly stems tokens that are marked as keywords.
This commit is contained in:
@ -19,18 +19,22 @@ package org.apache.lucene.morphology.analyzer;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.morphology.LuceneMorphology;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class MorphologyFilter extends TokenFilter {
|
||||
private LuceneMorphology luceneMorph;
|
||||
private Iterator<String> iterator;
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||
private final PositionIncrementAttribute position = addAttribute(PositionIncrementAttribute.class);
|
||||
private State state = null;
|
||||
|
||||
public MorphologyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) {
|
||||
super(tokenStream);
|
||||
@ -39,27 +43,39 @@ public class MorphologyFilter extends TokenFilter {
|
||||
|
||||
|
||||
final public boolean incrementToken() throws IOException {
|
||||
boolean oldToken = true;
|
||||
while (iterator == null || !iterator.hasNext()) {
|
||||
if (iterator != null) {
|
||||
if (iterator.hasNext()) {
|
||||
restoreState(state);
|
||||
position.setPositionIncrement(0);
|
||||
termAtt.setEmpty().append(iterator.next());
|
||||
return true;
|
||||
} else {
|
||||
state = null;
|
||||
iterator = null;
|
||||
}
|
||||
}
|
||||
while (true) {
|
||||
boolean b = input.incrementToken();
|
||||
if (!b) {
|
||||
return false;
|
||||
}
|
||||
String s = new String(termAtt.buffer(), 0, termAtt.length());
|
||||
if (luceneMorph.checkString(s)) {
|
||||
oldToken = false;
|
||||
iterator = luceneMorph.getNormalForms(s).iterator();
|
||||
} else {
|
||||
return true;
|
||||
if (!keywordAttr.isKeyword() && termAtt.length() > 0) {
|
||||
String s = new String(termAtt.buffer(), 0, termAtt.length());
|
||||
if (luceneMorph.checkString(s)) {
|
||||
List<String> forms = luceneMorph.getNormalForms(s);
|
||||
if (forms.isEmpty()) {
|
||||
continue;
|
||||
} else if (forms.size() == 1) {
|
||||
termAtt.setEmpty().append(forms.get(0));
|
||||
} else {
|
||||
state = captureState();
|
||||
iterator = forms.iterator();
|
||||
termAtt.setEmpty().append(iterator.next());
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
String s = iterator.next();
|
||||
termAtt.setEmpty();
|
||||
termAtt.append(s);
|
||||
if (oldToken) {
|
||||
position.setPositionIncrement(0);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
Reference in New Issue
Block a user