From 13232282128d34a81b5a48d1b98bc62b9c5e48ce Mon Sep 17 00:00:00 2001 From: Alexander Kuznetsov Date: Sat, 21 Mar 2015 01:28:06 +0300 Subject: [PATCH] Fixing issue https://code.google.com/p/russianmorphology/issues/detail?id=12 --- .../lucene/morphology/AnalyzersTest.java | 20 +++++++++++++++++++ .../lucene/morphology/MorphologyImpl.java | 9 ++++++++- .../analyzer/MorphologyAnalyzer.java | 5 ++--- 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalyzersTest.java b/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalyzersTest.java index 7cd5bd0..f8f182c 100644 --- a/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalyzersTest.java +++ b/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalyzersTest.java @@ -18,9 +18,13 @@ package org.apache.lucene.morphology; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.morphology.analyzer.MorphologyAnalyzer; +import org.apache.lucene.morphology.analyzer.MorphologyFilter; import org.apache.lucene.morphology.english.EnglishAnalyzer; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.morphology.english.EnglishLuceneMorphology; import org.apache.lucene.morphology.russian.RussianAnalyzer; +import org.apache.lucene.morphology.russian.RussianLuceneMorphology; import org.junit.Test; import java.io.*; @@ -50,6 +54,22 @@ public class AnalyzersTest { testAnalayzer(morphlogyAnalyzer, answerPath, testPath); } + @Test + public void emptyStringTest() throws IOException { + LuceneMorphology russianLuceneMorphology = new RussianLuceneMorphology(); + LuceneMorphology englishLuceneMorphology = new EnglishLuceneMorphology(); + + MorphologyAnalyzer russianAnalyzer = new MorphologyAnalyzer(russianLuceneMorphology); + InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("тест пм тест".getBytes()), "UTF-8"); + TokenStream stream = russianAnalyzer.tokenStream(null, reader); + MorphologyFilter englishFilter = new MorphologyFilter(stream, englishLuceneMorphology); + + englishFilter.reset(); + while (englishFilter.incrementToken()) { + System.out.println(englishFilter.toString()); + } + } + @Test public void shouldProvideCorrectIndentForWordWithMelitaForm() throws IOException { Analyzer morphlogyAnalyzer = new RussianAnalyzer(); diff --git a/morph/src/main/java/org/apache/lucene/morphology/MorphologyImpl.java b/morph/src/main/java/org/apache/lucene/morphology/MorphologyImpl.java index b1e7580..9a12d2b 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/MorphologyImpl.java +++ b/morph/src/main/java/org/apache/lucene/morphology/MorphologyImpl.java @@ -50,8 +50,15 @@ public class MorphologyImpl implements Morphology { ArrayList result = new ArrayList(); int[] ints = decoderEncoder.encodeToArray(revertWord(s)); int ruleId = findRuleId(ints); + boolean notSeenEmptyString = true; for (Heuristic h : rules[rulesId[ruleId]]) { - result.add(h.transformWord(s).toString()); + String e = h.transformWord(s).toString(); + if (e.length() > 0) { + result.add(e); + } else if (notSeenEmptyString) { + result.add(s); + notSeenEmptyString = false; + } } return result; } diff --git a/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyAnalyzer.java b/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyAnalyzer.java index 6b59a64..0e9bec1 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyAnalyzer.java +++ b/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyAnalyzer.java @@ -19,8 +19,6 @@ package org.apache.lucene.morphology.analyzer; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.core.LowerCaseFilter; -import org.apache.lucene.analysis.core.WhitespaceTokenizer; -import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter; import org.apache.lucene.analysis.payloads.PayloadEncoder; import org.apache.lucene.analysis.payloads.PayloadHelper; import org.apache.lucene.analysis.standard.StandardFilter; @@ -28,7 +26,6 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.morphology.LetterDecoderEncoder; import org.apache.lucene.morphology.LuceneMorphology; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.Version; import java.io.IOException; import java.io.InputStream; @@ -84,4 +81,6 @@ public class MorphologyAnalyzer extends Analyzer { } }; } + + }