diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..34e1547 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +target +.idea +*.iml \ No newline at end of file diff --git a/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalyzersTest.java b/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalyzersTest.java index a37bfa7..073210d 100644 --- a/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalyzersTest.java +++ b/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalyzersTest.java @@ -19,7 +19,6 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.morphology.english.EnglishAnalyzer; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.morphology.russian.RussianAnalyzer; import org.junit.Test; @@ -66,6 +65,7 @@ public class AnalyzersTest { InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader); + tokenStream.reset(); HashSet result = new HashSet(); while (tokenStream.incrementToken()) { CharTermAttribute attribute1 = tokenStream.getAttribute(CharTermAttribute.class); diff --git a/dictionary-reader/src/test/java/org/apache/lucene/morphology/LuceneMorphTest.java b/dictionary-reader/src/test/java/org/apache/lucene/morphology/LuceneMorphTest.java index 2faff21..838a7e7 100644 --- a/dictionary-reader/src/test/java/org/apache/lucene/morphology/LuceneMorphTest.java +++ b/dictionary-reader/src/test/java/org/apache/lucene/morphology/LuceneMorphTest.java @@ -25,6 +25,7 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.util.Arrays; import java.util.HashSet; +import java.util.List; import java.util.Set; import static org.hamcrest.CoreMatchers.equalTo; @@ -43,6 +44,8 @@ public class LuceneMorphTest { @Test public void russianMorphologyShouldGetCorrectNormalForm() throws IOException { LuceneMorphology luceneMorph = new RussianLuceneMorphology(); + List v = luceneMorph.getMorphInfo("вина"); + System.out.println(v); String pathToTestData = "/russian/russian-morphology-test.txt"; testMorphology(luceneMorph, pathToTestData); } diff --git a/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyAnalyzer.java b/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyAnalyzer.java index 012cfa7..6b59a64 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyAnalyzer.java +++ b/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyAnalyzer.java @@ -17,12 +17,17 @@ package org.apache.lucene.morphology.analyzer; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.LowerCaseFilter; -import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.core.LowerCaseFilter; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter; +import org.apache.lucene.analysis.payloads.PayloadEncoder; +import org.apache.lucene.analysis.payloads.PayloadHelper; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.morphology.LetterDecoderEncoder; import org.apache.lucene.morphology.LuceneMorphology; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.Version; import java.io.IOException; @@ -44,15 +49,39 @@ public class MorphologyAnalyzer extends Analyzer { luceneMorph = new LuceneMorphology(inputStream, letterDecoderEncoder); } - final public TokenStream tokenStream(String fieldName, Reader reader) { - TokenStream result = new StandardTokenizer(Version.LUCENE_35, reader); - result = new StandardFilter(Version.LUCENE_35,result); - result = new LowerCaseFilter(Version.LUCENE_35,result); - return new MorphologyFilter(result, luceneMorph); - } @Override - final public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { - return super.reusableTokenStream(fieldName, reader); + protected TokenStreamComponents createComponents(String s) { + + StandardTokenizer src = new StandardTokenizer(); + final PayloadEncoder encoder = new PayloadEncoder() { + @Override + public BytesRef encode(char[] buffer) { + final Float payload = Float.valueOf(new String(buffer)); + System.out.println(payload); + final byte[] bytes = PayloadHelper.encodeFloat(payload); + return new BytesRef(bytes, 0, bytes.length); + } + + @Override + public BytesRef encode(char[] buffer, int offset, int length) { + + final Float payload = Float.valueOf(new String(buffer, offset, length)); + System.out.println(payload); + final byte[] bytes = PayloadHelper.encodeFloat(payload); + + return new BytesRef(bytes, 0, bytes.length); + } + }; + TokenFilter filter = new StandardFilter(src); + filter = new LowerCaseFilter(filter); + filter = new MorphologyFilter(filter, luceneMorph); + + return new TokenStreamComponents(src, filter) { + @Override + protected void setReader(final Reader reader) throws IOException { + super.setReader(reader); + } + }; } } diff --git a/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyFilter.java b/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyFilter.java index 33fa031..9d38c3b 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyFilter.java +++ b/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyFilter.java @@ -19,7 +19,6 @@ package org.apache.lucene.morphology.analyzer; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.morphology.LuceneMorphology; import java.io.IOException; diff --git a/pom.xml b/pom.xml index cb265f7..dcee143 100644 --- a/pom.xml +++ b/pom.xml @@ -49,8 +49,15 @@ org.apache.lucene lucene-core - 3.5.0 + 5.0.0 + + + org.apache.lucene + lucene-analyzers-common + 5.0.0 + +