Add support for Lucene 3.5 and start work on the English stemmer
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@123 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
@ -0,0 +1,23 @@
|
||||
package org.apache.lucene.morphology.english.stemmer;
|
||||
|
||||
|
||||
import org.apache.lucene.morphology.english.EnglishLuceneMorphology;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class EnglishStemmer {
|
||||
private EnglishLuceneMorphology englishLuceneMorphology;
|
||||
|
||||
public String getStemmedWord(String word){
|
||||
List<String> normalForms = englishLuceneMorphology.getNormalForms(word);
|
||||
if(normalForms.size() == 1){
|
||||
return normalForms.get(0);
|
||||
}
|
||||
normalForms.remove(word);
|
||||
if(normalForms.size() == 1){
|
||||
return normalForms.get(0);
|
||||
}
|
||||
return word;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,34 @@
|
||||
package org.apache.lucene.morphology.english.stemmer;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.morphology.LuceneMorphology;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
|
||||
public class EnglishStemmerFilter extends TokenFilter {
|
||||
private EnglishStemmer englishStemmer;
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
|
||||
public EnglishStemmerFilter(TokenStream input, EnglishStemmer englishStemmer) {
|
||||
super(input);
|
||||
this.englishStemmer = englishStemmer;
|
||||
}
|
||||
|
||||
|
||||
final public boolean incrementToken() throws IOException {
|
||||
|
||||
boolean b = input.incrementToken();
|
||||
if (!b) {
|
||||
return false;
|
||||
}
|
||||
String s = new String(termAtt.buffer(), 0, termAtt.length());
|
||||
termAtt.setEmpty();
|
||||
termAtt.append(s);
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
Reference in New Issue
Block a user