Alexander Kuznetsov 2015-03-21 01:28:06 +03:00
parent 3e69baa332
commit 1323228212
3 changed files with 30 additions and 4 deletions

View File

@ -18,9 +18,13 @@ package org.apache.lucene.morphology;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.morphology.analyzer.MorphologyAnalyzer;
import org.apache.lucene.morphology.analyzer.MorphologyFilter;
import org.apache.lucene.morphology.english.EnglishAnalyzer; import org.apache.lucene.morphology.english.EnglishAnalyzer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.morphology.english.EnglishLuceneMorphology;
import org.apache.lucene.morphology.russian.RussianAnalyzer; import org.apache.lucene.morphology.russian.RussianAnalyzer;
import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
import org.junit.Test; import org.junit.Test;
import java.io.*; import java.io.*;
@ -50,6 +54,22 @@ public class AnalyzersTest {
testAnalayzer(morphlogyAnalyzer, answerPath, testPath); testAnalayzer(morphlogyAnalyzer, answerPath, testPath);
} }
/**
 * Smoke test: feeds a Russian phrase through {@link MorphologyAnalyzer} (Russian
 * morphology) and then through a {@link MorphologyFilter} backed by the English
 * morphology, consuming every token.
 *
 * <p>There are no explicit assertions; the test passes when the whole pipeline
 * can be reset, drained, ended and closed without throwing. Each emitted token
 * state is printed to standard output for manual inspection.
 *
 * <p>NOTE(review): judging by the name, this presumably guards against empty
 * normal forms being produced for short words such as "пм" — confirm and
 * consider asserting on the emitted terms.
 *
 * @throws IOException if the token stream fails while reading the input
 */
@Test
public void emptyStringTest() throws IOException {
    LuceneMorphology russianLuceneMorphology = new RussianLuceneMorphology();
    LuceneMorphology englishLuceneMorphology = new EnglishLuceneMorphology();
    MorphologyAnalyzer russianAnalyzer = new MorphologyAnalyzer(russianLuceneMorphology);
    // Read the text directly as characters. The previous bytes round-trip used
    // getBytes() (platform default charset) but decoded as UTF-8, which corrupts
    // the Cyrillic input on platforms whose default charset is not UTF-8.
    Reader reader = new StringReader("тест пм тест");
    TokenStream stream = russianAnalyzer.tokenStream(null, reader);
    MorphologyFilter englishFilter = new MorphologyFilter(stream, englishLuceneMorphology);
    try {
        englishFilter.reset();
        while (englishFilter.incrementToken()) {
            System.out.println(englishFilter.toString());
        }
        // Complete the TokenStream consumer workflow: end() after the last token.
        englishFilter.end();
    } finally {
        // Closing the filter also closes the wrapped analyzer stream and reader.
        englishFilter.close();
    }
}
@Test @Test
public void shouldProvideCorrectIndentForWordWithMelitaForm() throws IOException { public void shouldProvideCorrectIndentForWordWithMelitaForm() throws IOException {
Analyzer morphlogyAnalyzer = new RussianAnalyzer(); Analyzer morphlogyAnalyzer = new RussianAnalyzer();

View File

@ -50,8 +50,15 @@ public class MorphologyImpl implements Morphology {
ArrayList<String> result = new ArrayList<String>(); ArrayList<String> result = new ArrayList<String>();
int[] ints = decoderEncoder.encodeToArray(revertWord(s)); int[] ints = decoderEncoder.encodeToArray(revertWord(s));
int ruleId = findRuleId(ints); int ruleId = findRuleId(ints);
boolean notSeenEmptyString = true;
for (Heuristic h : rules[rulesId[ruleId]]) { for (Heuristic h : rules[rulesId[ruleId]]) {
result.add(h.transformWord(s).toString()); String e = h.transformWord(s).toString();
if (e.length() > 0) {
result.add(e);
} else if (notSeenEmptyString) {
result.add(s);
notSeenEmptyString = false;
}
} }
return result; return result;
} }

View File

@ -19,8 +19,6 @@ package org.apache.lucene.morphology.analyzer;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
import org.apache.lucene.analysis.payloads.PayloadEncoder; import org.apache.lucene.analysis.payloads.PayloadEncoder;
import org.apache.lucene.analysis.payloads.PayloadHelper; import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardFilter;
@ -28,7 +26,6 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.morphology.LetterDecoderEncoder; import org.apache.lucene.morphology.LetterDecoderEncoder;
import org.apache.lucene.morphology.LuceneMorphology; import org.apache.lucene.morphology.LuceneMorphology;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
@ -84,4 +81,6 @@ public class MorphologyAnalyzer extends Analyzer {
} }
}; };
} }
} }