diff --git a/english/src/test/java/org/apache/lucene/morphology/english/EnglishAnalayzerTest.java b/english/src/test/java/org/apache/lucene/morphology/english/EnglishAnalayzerTest.java index a95b9f2..c74f9aa 100644 --- a/english/src/test/java/org/apache/lucene/morphology/english/EnglishAnalayzerTest.java +++ b/english/src/test/java/org/apache/lucene/morphology/english/EnglishAnalayzerTest.java @@ -15,8 +15,8 @@ */ package org.apache.lucene.morphology.english; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import static org.hamcrest.Matchers.equalTo; import static org.junit.Assert.assertThat; import org.junit.Test; @@ -43,19 +43,12 @@ public class EnglishAnalayzerTest { stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/englsih-analayzer-data.txt"); InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); - final Token reusableToken = new Token(); - Token nextToken; - TokenStream in = morphlogyAnalayzer.tokenStream(null, reader); + TokenStream tokenStream = morphlogyAnalayzer.tokenStream(null, reader); HashSet result = new HashSet(); - for (; ;) { - nextToken = in.next(reusableToken); - - if (nextToken == null) { - break; - } - - result.add(nextToken.term()); + while (tokenStream.incrementToken()) { + TermAttribute attribute1 = tokenStream.getAttribute(TermAttribute.class); + result.add(attribute1.term()); } stream.close(); diff --git a/morph/pom.xml b/morph/pom.xml index 2c39edc..4709ecf 100644 --- a/morph/pom.xml +++ b/morph/pom.xml @@ -1,5 +1,6 @@ - + morphology org.apache.lucene.morphology @@ -13,11 +14,6 @@ http://maven.apache.org - - junit - junit - 3.8.1 - test - + diff --git a/morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyAnalayzer.java b/morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyAnalayzer.java index 2f35533..47ef528 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyAnalayzer.java +++ b/morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyAnalayzer.java @@ -23,6 +23,7 @@ import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.morphology.LetterDecoderEncoder; import org.apache.lucene.morphology.LuceneMorphology; +import org.apache.lucene.util.Version; import java.io.IOException; import java.io.InputStream; @@ -44,7 +45,7 @@ public class MorphlogyAnalayzer extends Analyzer { } public TokenStream tokenStream(String fieldName, Reader reader) { - TokenStream result = new StandardTokenizer(reader); + TokenStream result = new StandardTokenizer(Version.LUCENE_30, reader); result = new StandardFilter(result); result = new LowerCaseFilter(result); return new MorphlogyFilter(result, luceneMorph); diff --git a/morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyFilter.java b/morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyFilter.java index 40c396a..641bc9b 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyFilter.java +++ b/morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyFilter.java @@ -16,67 +16,43 @@ package org.apache.lucene.morphology.analayzer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.morphology.LuceneMorphology; import java.io.IOException; -import java.util.ArrayList; -import java.util.List; +import java.util.Iterator; public class MorphlogyFilter extends TokenFilter { private LuceneMorphology luceneMorph; + private Iterator iterator; + private TermAttribute termAtt; public MorphlogyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) { super(tokenStream); this.luceneMorph = luceneMorph; + termAtt = addAttribute(TermAttribute.class); } - private List stack = new ArrayList(); - private int index = 0; - private Token current = null; - - /** - * Returns the next token in the stream, or null at EOS. - */ - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - while (index < stack.size()) { // pop from stack - Token nextToken = createToken(stack.get(index++), current, reusableToken); - if (nextToken != null) { - return nextToken; + public boolean incrementToken() throws IOException { + while (iterator == null || !iterator.hasNext()) { + boolean b = input.incrementToken(); + if (!b) { + return false; + } + String s = termAtt.term(); + if (luceneMorph.checkString(s)) { + iterator = luceneMorph.getNormalForms(termAtt.term()).iterator(); + } else { + return true; } } - - Token nextToken = input.next(reusableToken); - if (nextToken == null) return null; // EOS; iterator exhausted - if (!luceneMorph.checkString(nextToken.term())) { - return nextToken; - } - stack = luceneMorph.getNormalForms(nextToken.term()); - index = 0; - current = (Token) nextToken.clone(); - nextToken = createToken(stack.get(index++), current, reusableToken); - return nextToken; + String s = iterator.next(); + termAtt.setTermBuffer(s); + return true; } - /** - * Creates and returns a token for the given synonym of the current input - * token; Override for custom (stateless or stateful) behavior, if desired. - * - * @param synonym a synonym for the current token's term - * @param current the current token from the underlying child stream - * @param reusableToken the token to reuse - * @return a new token, or null to indicate that the given synonym should be - * ignored - */ - protected Token createToken(String synonym, Token current, final Token reusableToken) { - reusableToken.reinit(current, synonym); - reusableToken.setTermBuffer(synonym); - reusableToken.setPositionIncrement(0); - return reusableToken; - } } diff --git a/pom.xml b/pom.xml index 76bc3f0..cd8dcd1 100644 --- a/pom.xml +++ b/pom.xml @@ -1,5 +1,6 @@ - + 4.0.0 org.apache.lucene.morphology morphology @@ -49,7 +50,7 @@ org.apache.lucene lucene-core - 2.4.1 + 3.0.0 diff --git a/russian/src/test/java/org/apache/lucene/morphology/russian/RussianAnalayzerTest.java b/russian/src/test/java/org/apache/lucene/morphology/russian/RussianAnalayzerTest.java index 2982de6..bec1462 100644 --- a/russian/src/test/java/org/apache/lucene/morphology/russian/RussianAnalayzerTest.java +++ b/russian/src/test/java/org/apache/lucene/morphology/russian/RussianAnalayzerTest.java @@ -15,8 +15,8 @@ */ package org.apache.lucene.morphology.russian; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import static org.hamcrest.Matchers.equalTo; import static org.junit.Assert.assertThat; import org.junit.Test; @@ -43,21 +43,12 @@ public class RussianAnalayzerTest { stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/russian-analayzer-data.txt"); InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); - final Token reusableToken = new Token(); - Token nextToken; - TokenStream in = morphlogyAnalayzer.tokenStream(null, reader); + TokenStream tokenStream = morphlogyAnalayzer.tokenStream(null, reader); HashSet result = new HashSet(); - for (; ;) { - nextToken = in.next(reusableToken); - - if (nextToken == null) { - break; - } - - result.add(nextToken.term()); - // - + while (tokenStream.incrementToken()) { + TermAttribute attribute1 = tokenStream.getAttribute(TermAttribute.class); + result.add(attribute1.term()); } stream.close();