Adding support for Lucene 3.5 and starting work on the English stemmer
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@123 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
parent
d46651f2ba
commit
466de768ca
@ -0,0 +1,23 @@
|
|||||||
|
package org.apache.lucene.morphology.english.stemmer;
|
||||||
|
|
||||||
|
|
||||||
|
import org.apache.lucene.morphology.english.EnglishLuceneMorphology;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public class EnglishStemmer {
|
||||||
|
private EnglishLuceneMorphology englishLuceneMorphology;
|
||||||
|
|
||||||
|
public String getStemmedWord(String word){
|
||||||
|
List<String> normalForms = englishLuceneMorphology.getNormalForms(word);
|
||||||
|
if(normalForms.size() == 1){
|
||||||
|
return normalForms.get(0);
|
||||||
|
}
|
||||||
|
normalForms.remove(word);
|
||||||
|
if(normalForms.size() == 1){
|
||||||
|
return normalForms.get(0);
|
||||||
|
}
|
||||||
|
return word;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,34 @@
|
|||||||
|
package org.apache.lucene.morphology.english.stemmer;
|
||||||
|
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.morphology.LuceneMorphology;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
public class EnglishStemmerFilter extends TokenFilter {
|
||||||
|
private EnglishStemmer englishStemmer;
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
|
||||||
|
public EnglishStemmerFilter(TokenStream input, EnglishStemmer englishStemmer) {
|
||||||
|
super(input);
|
||||||
|
this.englishStemmer = englishStemmer;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
final public boolean incrementToken() throws IOException {
|
||||||
|
|
||||||
|
boolean b = input.incrementToken();
|
||||||
|
if (!b) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
String s = new String(termAtt.buffer(), 0, termAtt.length());
|
||||||
|
termAtt.setEmpty();
|
||||||
|
termAtt.append(s);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -44,10 +44,15 @@ public class MorphologyAnalyzer extends Analyzer {
|
|||||||
luceneMorph = new LuceneMorphology(inputStream, letterDecoderEncoder);
|
luceneMorph = new LuceneMorphology(inputStream, letterDecoderEncoder);
|
||||||
}
|
}
|
||||||
|
|
||||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
final public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||||
TokenStream result = new StandardTokenizer(Version.LUCENE_30, reader);
|
TokenStream result = new StandardTokenizer(Version.LUCENE_35, reader);
|
||||||
result = new StandardFilter(result);
|
result = new StandardFilter(Version.LUCENE_35,result);
|
||||||
result = new LowerCaseFilter(result);
|
result = new LowerCaseFilter(Version.LUCENE_35,result);
|
||||||
return new MorphologyFilter(result, luceneMorph);
|
return new MorphologyFilter(result, luceneMorph);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
final public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
|
||||||
|
return super.reusableTokenStream(fieldName, reader);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -18,6 +18,7 @@ package org.apache.lucene.morphology.analyzer;
|
|||||||
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
import org.apache.lucene.morphology.LuceneMorphology;
|
import org.apache.lucene.morphology.LuceneMorphology;
|
||||||
|
|
||||||
@ -28,30 +29,31 @@ import java.util.Iterator;
|
|||||||
public class MorphologyFilter extends TokenFilter {
|
public class MorphologyFilter extends TokenFilter {
|
||||||
private LuceneMorphology luceneMorph;
|
private LuceneMorphology luceneMorph;
|
||||||
private Iterator<String> iterator;
|
private Iterator<String> iterator;
|
||||||
private TermAttribute termAtt;
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
|
||||||
|
|
||||||
public MorphologyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) {
|
public MorphologyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) {
|
||||||
super(tokenStream);
|
super(tokenStream);
|
||||||
this.luceneMorph = luceneMorph;
|
this.luceneMorph = luceneMorph;
|
||||||
termAtt = addAttribute(TermAttribute.class);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean incrementToken() throws IOException {
|
final public boolean incrementToken() throws IOException {
|
||||||
while (iterator == null || !iterator.hasNext()) {
|
while (iterator == null || !iterator.hasNext()) {
|
||||||
boolean b = input.incrementToken();
|
boolean b = input.incrementToken();
|
||||||
if (!b) {
|
if (!b) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
String s = termAtt.term();
|
String s = new String(termAtt.buffer(),0,termAtt.length());
|
||||||
if (luceneMorph.checkString(s)) {
|
if (luceneMorph.checkString(s)) {
|
||||||
iterator = luceneMorph.getNormalForms(termAtt.term()).iterator();
|
iterator = luceneMorph.getNormalForms(s).iterator();
|
||||||
} else {
|
} else {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
String s = iterator.next();
|
String s = iterator.next();
|
||||||
termAtt.setTermBuffer(s);
|
termAtt.setEmpty();
|
||||||
|
termAtt.append(s);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
2
pom.xml
2
pom.xml
@ -49,7 +49,7 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.lucene</groupId>
|
<groupId>org.apache.lucene</groupId>
|
||||||
<artifactId>lucene-core</artifactId>
|
<artifactId>lucene-core</artifactId>
|
||||||
<version>3.0.0</version>
|
<version>3.5.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user