diff --git a/english/src/main/java/org/apache/lucene/morphology/english/stemmer/EnglishStemmer.java b/english/src/main/java/org/apache/lucene/morphology/english/stemmer/EnglishStemmer.java
new file mode 100644
index 0000000..346ea45
--- /dev/null
+++ b/english/src/main/java/org/apache/lucene/morphology/english/stemmer/EnglishStemmer.java
@@ -0,0 +1,53 @@
+package org.apache.lucene.morphology.english.stemmer;
+
+
+import org.apache.lucene.morphology.english.EnglishLuceneMorphology;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Reduces an English word to a single normal form using an
+ * {@link EnglishLuceneMorphology} dictionary.
+ */
+public class EnglishStemmer {
+    private final EnglishLuceneMorphology englishLuceneMorphology;
+
+    /**
+     * @param englishLuceneMorphology morphology used to look up normal forms; must not be null
+     * @throws IllegalArgumentException if {@code englishLuceneMorphology} is null
+     */
+    public EnglishStemmer(EnglishLuceneMorphology englishLuceneMorphology) {
+        if (englishLuceneMorphology == null) {
+            throw new IllegalArgumentException("englishLuceneMorphology must not be null");
+        }
+        this.englishLuceneMorphology = englishLuceneMorphology;
+    }
+
+    /**
+     * Returns the stem of {@code word}.
+     *
+     * <p>If the morphology reports exactly one normal form, that form is returned.
+     * Otherwise the word itself is dropped from the candidates and, if exactly one
+     * candidate remains, that one is returned. In every other case the word is
+     * returned unchanged.
+     *
+     * @param word the word to stem
+     * @return the single unambiguous normal form, or {@code word} itself
+     */
+    public String getStemmedWord(String word) {
+        List<String> normalForms = englishLuceneMorphology.getNormalForms(word);
+        if (normalForms.size() == 1) {
+            return normalForms.get(0);
+        }
+        // Work on a copy: the list comes from the morphology, and copying avoids
+        // mutating a list this class does not own.
+        List<String> candidates = new ArrayList<String>(normalForms);
+        candidates.remove(word);
+        if (candidates.size() == 1) {
+            return candidates.get(0);
+        }
+        return word;
+    }
+
+}
diff --git a/english/src/main/java/org/apache/lucene/morphology/english/stemmer/EnglishStemmerFilter.java b/english/src/main/java/org/apache/lucene/morphology/english/stemmer/EnglishStemmerFilter.java
new file mode 100644
index 0000000..c7ff722
--- /dev/null
+++ b/english/src/main/java/org/apache/lucene/morphology/english/stemmer/EnglishStemmerFilter.java
@@ -0,0 +1,48 @@
+package org.apache.lucene.morphology.english.stemmer;
+
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.morphology.LuceneMorphology;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+/**
+ * Token filter that replaces each term of the wrapped stream with the stem
+ * produced by an {@link EnglishStemmer}.
+ */
+public class EnglishStemmerFilter extends TokenFilter {
+    private final EnglishStemmer englishStemmer;
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+    /**
+     * @param input          the token stream whose terms are stemmed
+     * @param englishStemmer stemmer applied to every term
+     */
+    public EnglishStemmerFilter(TokenStream input, EnglishStemmer englishStemmer) {
+        super(input);
+        this.englishStemmer = englishStemmer;
+    }
+
+    /**
+     * Advances the wrapped stream by one token and rewrites its term in place.
+     *
+     * @return {@code false} once the wrapped stream is exhausted, {@code true} otherwise
+     * @throws IOException if the wrapped stream fails
+     */
+    @Override
+    final public boolean incrementToken() throws IOException {
+        if (!input.incrementToken()) {
+            return false;
+        }
+        String term = new String(termAtt.buffer(), 0, termAtt.length());
+        // Stem the term; without this call the filter would pass tokens through unchanged.
+        String stem = englishStemmer.getStemmedWord(term);
+        termAtt.setEmpty();
+        termAtt.append(stem);
+        return true;
+    }
+
+}
\ No newline at end of file
diff --git a/english/src/main/resources/org/apache/lucene/morphology/english/exceptions.txt b/english/src/main/resources/org/apache/lucene/morphology/english/exceptions.txt
new file mode 100644
index 0000000..e69de29
diff --git a/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyAnalyzer.java b/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyAnalyzer.java
index 0be508d..012cfa7 100644
--- a/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyAnalyzer.java
+++ b/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyAnalyzer.java
@@ -44,10 +44,15 @@ public class MorphologyAnalyzer extends Analyzer {
         luceneMorph = new LuceneMorphology(inputStream, letterDecoderEncoder);
     }
 
-    public TokenStream tokenStream(String fieldName, Reader reader) {
-        TokenStream result = new StandardTokenizer(Version.LUCENE_30, reader);
-        result = new StandardFilter(result);
-        result = new LowerCaseFilter(result);
+    final public TokenStream tokenStream(String fieldName, Reader reader) {
+        TokenStream result = new StandardTokenizer(Version.LUCENE_35, reader);
+        result = new StandardFilter(Version.LUCENE_35,result);
+        result = new LowerCaseFilter(Version.LUCENE_35,result);
         return new MorphologyFilter(result, luceneMorph);
     }
+
+    @Override
+    final public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+        return super.reusableTokenStream(fieldName, reader);
+    }
 }
diff --git a/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyFilter.java b/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyFilter.java
index 38bb9f6..33fa031 100644
--- a/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyFilter.java
+++ b/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyFilter.java
@@ -18,6 +18,7 @@ package org.apache.lucene.morphology.analyzer;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.morphology.LuceneMorphology;
 
@@ -28,30 +29,31 @@ import java.util.Iterator;
 public class MorphologyFilter extends TokenFilter {
     private LuceneMorphology luceneMorph;
     private Iterator<String> iterator;
-    private TermAttribute termAtt;
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 
+
     public MorphologyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) {
         super(tokenStream);
         this.luceneMorph = luceneMorph;
-        termAtt = addAttribute(TermAttribute.class);
     }
 
 
-    public boolean incrementToken() throws IOException {
+    final public boolean incrementToken() throws IOException {
         while (iterator == null || !iterator.hasNext()) {
             boolean b = input.incrementToken();
             if (!b) {
                 return false;
             }
-            String s = termAtt.term();
+            String s = new String(termAtt.buffer(),0,termAtt.length());
             if (luceneMorph.checkString(s)) {
-                iterator = luceneMorph.getNormalForms(termAtt.term()).iterator();
+                iterator = luceneMorph.getNormalForms(s).iterator();
             } else {
                 return true;
             }
         }
         String s = iterator.next();
-        termAtt.setTermBuffer(s);
+        termAtt.setEmpty();
+        termAtt.append(s);
         return true;
     }
 
diff --git a/pom.xml b/pom.xml index 121f4ea..9bca627 100644 --- a/pom.xml +++ b/pom.xml @@ -49,7 +49,7 @@ org.apache.lucene lucene-core - 3.0.0 + 3.5.0