This commit is contained in:
parent
9b793115f7
commit
3e69baa332
@ -17,17 +17,14 @@ package org.apache.lucene.morphology;
|
|||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
import org.apache.lucene.morphology.english.EnglishAnalyzer;
|
import org.apache.lucene.morphology.english.EnglishAnalyzer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.morphology.russian.RussianAnalyzer;
|
import org.apache.lucene.morphology.russian.RussianAnalyzer;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.*;
|
||||||
import java.io.IOException;
|
import java.util.*;
|
||||||
import java.io.InputStream;
|
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.HashSet;
|
|
||||||
|
|
||||||
import static org.hamcrest.Matchers.equalTo;
|
import static org.hamcrest.Matchers.equalTo;
|
||||||
import static org.junit.Assert.assertThat;
|
import static org.junit.Assert.assertThat;
|
||||||
@ -36,7 +33,7 @@ import static org.junit.Assert.assertThat;
|
|||||||
public class AnalyzersTest {
|
public class AnalyzersTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void englishAnalyzerShouldGiveCorrectWords() throws IOException {
|
public void shouldGiveCorrectWordsForEnglish() throws IOException {
|
||||||
Analyzer morphlogyAnalyzer = new EnglishAnalyzer();
|
Analyzer morphlogyAnalyzer = new EnglishAnalyzer();
|
||||||
String answerPath = "/english/english-analyzer-answer.txt";
|
String answerPath = "/english/english-analyzer-answer.txt";
|
||||||
String testPath = "/english/english-analyzer-data.txt";
|
String testPath = "/english/english-analyzer-data.txt";
|
||||||
@ -45,7 +42,7 @@ public class AnalyzersTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void shoudGiveCorretWords() throws IOException {
|
public void shouldGiveCorrectWordsForRussian() throws IOException {
|
||||||
Analyzer morphlogyAnalyzer = new RussianAnalyzer();
|
Analyzer morphlogyAnalyzer = new RussianAnalyzer();
|
||||||
String answerPath = "/russian/russian-analyzer-answer.txt";
|
String answerPath = "/russian/russian-analyzer-answer.txt";
|
||||||
String testPath = "/russian/russian-analyzer-data.txt";
|
String testPath = "/russian/russian-analyzer-data.txt";
|
||||||
@ -53,6 +50,29 @@ public class AnalyzersTest {
|
|||||||
testAnalayzer(morphlogyAnalyzer, answerPath, testPath);
|
testAnalayzer(morphlogyAnalyzer, answerPath, testPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void shouldProvideCorrectIndentForWordWithMelitaForm() throws IOException {
|
||||||
|
Analyzer morphlogyAnalyzer = new RussianAnalyzer();
|
||||||
|
InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год".getBytes()), "UTF-8");
|
||||||
|
|
||||||
|
TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader);
|
||||||
|
tokenStream.reset();
|
||||||
|
Set<String> foromsOfWine = new HashSet<String>();
|
||||||
|
foromsOfWine.add("вина");
|
||||||
|
foromsOfWine.add("винo");
|
||||||
|
boolean wordSeen = false;
|
||||||
|
while (tokenStream.incrementToken()) {
|
||||||
|
CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
|
||||||
|
PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class);
|
||||||
|
if(foromsOfWine.contains(charTerm.toString()) && wordSeen){
|
||||||
|
assertThat(position.getPositionIncrement(),equalTo(0));
|
||||||
|
}
|
||||||
|
if(foromsOfWine.contains(charTerm.toString())){
|
||||||
|
wordSeen = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private void testAnalayzer(Analyzer morphlogyAnalyzer, String answerPath, String testPath) throws IOException {
|
private void testAnalayzer(Analyzer morphlogyAnalyzer, String answerPath, String testPath) throws IOException {
|
||||||
InputStream stream = this.getClass().getResourceAsStream(answerPath);
|
InputStream stream = this.getClass().getResourceAsStream(answerPath);
|
||||||
BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||||
|
@ -19,6 +19,7 @@ package org.apache.lucene.morphology.analyzer;
|
|||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
import org.apache.lucene.morphology.LuceneMorphology;
|
import org.apache.lucene.morphology.LuceneMorphology;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@ -29,7 +30,7 @@ public class MorphologyFilter extends TokenFilter {
|
|||||||
private LuceneMorphology luceneMorph;
|
private LuceneMorphology luceneMorph;
|
||||||
private Iterator<String> iterator;
|
private Iterator<String> iterator;
|
||||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final PositionIncrementAttribute position = addAttribute(PositionIncrementAttribute.class);
|
||||||
|
|
||||||
public MorphologyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) {
|
public MorphologyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) {
|
||||||
super(tokenStream);
|
super(tokenStream);
|
||||||
@ -38,6 +39,7 @@ public class MorphologyFilter extends TokenFilter {
|
|||||||
|
|
||||||
|
|
||||||
final public boolean incrementToken() throws IOException {
|
final public boolean incrementToken() throws IOException {
|
||||||
|
boolean oldToken = true;
|
||||||
while (iterator == null || !iterator.hasNext()) {
|
while (iterator == null || !iterator.hasNext()) {
|
||||||
boolean b = input.incrementToken();
|
boolean b = input.incrementToken();
|
||||||
if (!b) {
|
if (!b) {
|
||||||
@ -45,6 +47,7 @@ public class MorphologyFilter extends TokenFilter {
|
|||||||
}
|
}
|
||||||
String s = new String(termAtt.buffer(), 0, termAtt.length());
|
String s = new String(termAtt.buffer(), 0, termAtt.length());
|
||||||
if (luceneMorph.checkString(s)) {
|
if (luceneMorph.checkString(s)) {
|
||||||
|
oldToken = false;
|
||||||
iterator = luceneMorph.getNormalForms(s).iterator();
|
iterator = luceneMorph.getNormalForms(s).iterator();
|
||||||
} else {
|
} else {
|
||||||
return true;
|
return true;
|
||||||
@ -53,6 +56,9 @@ public class MorphologyFilter extends TokenFilter {
|
|||||||
String s = iterator.next();
|
String s = iterator.next();
|
||||||
termAtt.setEmpty();
|
termAtt.setEmpty();
|
||||||
termAtt.append(s);
|
termAtt.append(s);
|
||||||
|
if (oldToken) {
|
||||||
|
position.setPositionIncrement(0);
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user