From 3e69baa3320372d49c64f917dacaf7e51ebd4a9f Mon Sep 17 00:00:00 2001 From: Alexander Kuznetsov Date: Sat, 21 Mar 2015 01:08:18 +0300 Subject: [PATCH] Fixing issue https://code.google.com/p/russianmorphology/issues/detail?id=11 --- .../lucene/morphology/AnalyzersTest.java | 36 ++++++++++++++----- .../morphology/analyzer/MorphologyFilter.java | 12 +++++-- 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalyzersTest.java b/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalyzersTest.java index 073210d..7cd5bd0 100644 --- a/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalyzersTest.java +++ b/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalyzersTest.java @@ -17,17 +17,14 @@ package org.apache.lucene.morphology; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.morphology.english.EnglishAnalyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.morphology.russian.RussianAnalyzer; import org.junit.Test; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.util.Arrays; -import java.util.HashSet; +import java.io.*; +import java.util.*; import static org.hamcrest.Matchers.equalTo; import static org.junit.Assert.assertThat; @@ -36,7 +33,7 @@ import static org.junit.Assert.assertThat; public class AnalyzersTest { @Test - public void englishAnalyzerShouldGiveCorrectWords() throws IOException { + public void shouldGiveCorrectWordsForEnglish() throws IOException { Analyzer morphlogyAnalyzer = new EnglishAnalyzer(); String answerPath = "/english/english-analyzer-answer.txt"; String testPath = "/english/english-analyzer-data.txt"; @@ -45,7 +42,7 @@ public class AnalyzersTest { } @Test - public void shoudGiveCorretWords() throws IOException { + public void shouldGiveCorrectWordsForRussian() throws IOException { Analyzer morphlogyAnalyzer = new RussianAnalyzer(); String answerPath = "/russian/russian-analyzer-answer.txt"; String testPath = "/russian/russian-analyzer-data.txt"; @@ -53,6 +50,29 @@ public class AnalyzersTest { testAnalayzer(morphlogyAnalyzer, answerPath, testPath); } + @Test + public void shouldProvideCorrectIndentForWordWithMelitaForm() throws IOException { + Analyzer morphlogyAnalyzer = new RussianAnalyzer(); + InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год".getBytes()), "UTF-8"); + + TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader); + tokenStream.reset(); + Set foromsOfWine = new HashSet(); + foromsOfWine.add("вина"); + foromsOfWine.add("винo"); + boolean wordSeen = false; + while (tokenStream.incrementToken()) { + CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class); + PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class); + if(foromsOfWine.contains(charTerm.toString()) && wordSeen){ + assertThat(position.getPositionIncrement(),equalTo(0)); + } + if(foromsOfWine.contains(charTerm.toString())){ + wordSeen = true; + } + } + } + private void testAnalayzer(Analyzer morphlogyAnalyzer, String answerPath, String testPath) throws IOException { InputStream stream = this.getClass().getResourceAsStream(answerPath); BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); diff --git a/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyFilter.java b/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyFilter.java index 9d38c3b..5af6c1d 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyFilter.java +++ b/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyFilter.java @@ -19,6 +19,7 @@ package org.apache.lucene.morphology.analyzer; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.morphology.LuceneMorphology; import java.io.IOException; @@ -29,7 +30,7 @@ public class MorphologyFilter extends TokenFilter { private LuceneMorphology luceneMorph; private Iterator iterator; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - + private final PositionIncrementAttribute position = addAttribute(PositionIncrementAttribute.class); public MorphologyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) { super(tokenStream); @@ -37,14 +38,16 @@ public class MorphologyFilter extends TokenFilter { } - final public boolean incrementToken() throws IOException { + final public boolean incrementToken() throws IOException { + boolean oldToken = true; while (iterator == null || !iterator.hasNext()) { boolean b = input.incrementToken(); if (!b) { return false; } - String s = new String(termAtt.buffer(),0,termAtt.length()); + String s = new String(termAtt.buffer(), 0, termAtt.length()); if (luceneMorph.checkString(s)) { + oldToken = false; iterator = luceneMorph.getNormalForms(s).iterator(); } else { return true; @@ -53,6 +56,9 @@ public class MorphologyFilter extends TokenFilter { String s = iterator.next(); termAtt.setEmpty(); termAtt.append(s); + if (oldToken) { + position.setPositionIncrement(0); + } return true; }