Alexander Kuznetsov 2015-03-21 01:08:18 +03:00
parent 9b793115f7
commit 3e69baa332
2 changed files with 37 additions and 11 deletions

View File

@ -17,17 +17,14 @@ package org.apache.lucene.morphology;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.morphology.english.EnglishAnalyzer; import org.apache.lucene.morphology.english.EnglishAnalyzer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.morphology.russian.RussianAnalyzer; import org.apache.lucene.morphology.russian.RussianAnalyzer;
import org.junit.Test; import org.junit.Test;
import java.io.BufferedReader; import java.io.*;
import java.io.IOException; import java.util.*;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.HashSet;
import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.equalTo;
import static org.junit.Assert.assertThat; import static org.junit.Assert.assertThat;
@ -36,7 +33,7 @@ import static org.junit.Assert.assertThat;
public class AnalyzersTest { public class AnalyzersTest {
@Test @Test
public void englishAnalyzerShouldGiveCorrectWords() throws IOException { public void shouldGiveCorrectWordsForEnglish() throws IOException {
Analyzer morphlogyAnalyzer = new EnglishAnalyzer(); Analyzer morphlogyAnalyzer = new EnglishAnalyzer();
String answerPath = "/english/english-analyzer-answer.txt"; String answerPath = "/english/english-analyzer-answer.txt";
String testPath = "/english/english-analyzer-data.txt"; String testPath = "/english/english-analyzer-data.txt";
@ -45,7 +42,7 @@ public class AnalyzersTest {
} }
@Test @Test
public void shoudGiveCorretWords() throws IOException { public void shouldGiveCorrectWordsForRussian() throws IOException {
Analyzer morphlogyAnalyzer = new RussianAnalyzer(); Analyzer morphlogyAnalyzer = new RussianAnalyzer();
String answerPath = "/russian/russian-analyzer-answer.txt"; String answerPath = "/russian/russian-analyzer-answer.txt";
String testPath = "/russian/russian-analyzer-data.txt"; String testPath = "/russian/russian-analyzer-data.txt";
@ -53,6 +50,29 @@ public class AnalyzersTest {
testAnalayzer(morphlogyAnalyzer, answerPath, testPath); testAnalayzer(morphlogyAnalyzer, answerPath, testPath);
} }
/**
 * Checks that when the Russian analyzer emits several forms for the word
 * "вина", every form after the first one reports a position increment of 0.
 *
 * The sentence is encoded and decoded with the same explicit charset so the
 * Cyrillic text reaches the analyzer intact regardless of the platform default
 * encoding.
 *
 * @throws IOException if the token stream fails while reading the input
 */
@Test
public void shouldProvideCorrectIndentForWordWithMelitaForm() throws IOException {
    Analyzer morphlogyAnalyzer = new RussianAnalyzer();
    // Encode with the same charset the reader decodes with; a bare getBytes()
    // uses the platform default and would corrupt the Cyrillic input elsewhere.
    byte[] sentence = "принеси мне вина на новый год".getBytes("UTF-8");
    InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(sentence), "UTF-8");
    TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader);
    try {
        // The attribute instances belong to the stream, so they are fetched once.
        CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class);
        tokenStream.reset();

        Set<String> foromsOfWine = new HashSet<String>();
        foromsOfWine.add("вина");
        // NOTE(review): confirm the last letter of the next literal is the Cyrillic
        // letter and not a Latin look-alike; if it is Latin this entry can never
        // match and the assertion below is never reached.
        foromsOfWine.add("винo");

        boolean wordSeen = false;
        while (tokenStream.incrementToken()) {
            if (foromsOfWine.contains(charTerm.toString())) {
                if (wordSeen) {
                    // Any form after the first must share the first form's position.
                    assertThat(position.getPositionIncrement(), equalTo(0));
                }
                wordSeen = true;
            }
        }
        tokenStream.end();
    } finally {
        // Release the stream even when an assertion fails mid-iteration.
        tokenStream.close();
    }
}
private void testAnalayzer(Analyzer morphlogyAnalyzer, String answerPath, String testPath) throws IOException { private void testAnalayzer(Analyzer morphlogyAnalyzer, String answerPath, String testPath) throws IOException {
InputStream stream = this.getClass().getResourceAsStream(answerPath); InputStream stream = this.getClass().getResourceAsStream(answerPath);
BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));

View File

@ -19,6 +19,7 @@ package org.apache.lucene.morphology.analyzer;
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.morphology.LuceneMorphology; import org.apache.lucene.morphology.LuceneMorphology;
import java.io.IOException; import java.io.IOException;
@ -29,7 +30,7 @@ public class MorphologyFilter extends TokenFilter {
private LuceneMorphology luceneMorph; private LuceneMorphology luceneMorph;
private Iterator<String> iterator; private Iterator<String> iterator;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute position = addAttribute(PositionIncrementAttribute.class);
public MorphologyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) { public MorphologyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) {
super(tokenStream); super(tokenStream);
@ -38,6 +39,7 @@ public class MorphologyFilter extends TokenFilter {
final public boolean incrementToken() throws IOException { final public boolean incrementToken() throws IOException {
boolean oldToken = true;
while (iterator == null || !iterator.hasNext()) { while (iterator == null || !iterator.hasNext()) {
boolean b = input.incrementToken(); boolean b = input.incrementToken();
if (!b) { if (!b) {
@ -45,6 +47,7 @@ public class MorphologyFilter extends TokenFilter {
} }
String s = new String(termAtt.buffer(), 0, termAtt.length()); String s = new String(termAtt.buffer(), 0, termAtt.length());
if (luceneMorph.checkString(s)) { if (luceneMorph.checkString(s)) {
oldToken = false;
iterator = luceneMorph.getNormalForms(s).iterator(); iterator = luceneMorph.getNormalForms(s).iterator();
} else { } else {
return true; return true;
@ -53,6 +56,9 @@ public class MorphologyFilter extends TokenFilter {
String s = iterator.next(); String s = iterator.next();
termAtt.setEmpty(); termAtt.setEmpty();
termAtt.append(s); termAtt.append(s);
if (oldToken) {
position.setPositionIncrement(0);
}
return true; return true;
} }