diff --git a/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalyzersTest.java b/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalyzersTest.java index f8f182c..4b702df 100644 --- a/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalyzersTest.java +++ b/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalyzersTest.java @@ -16,8 +16,15 @@ package org.apache.lucene.morphology; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.core.LowerCaseFilter; +import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.morphology.analyzer.MorphologyAnalyzer; import org.apache.lucene.morphology.analyzer.MorphologyFilter; import org.apache.lucene.morphology.english.EnglishAnalyzer; @@ -31,10 +38,9 @@ import java.io.*; import java.util.*; import static org.hamcrest.Matchers.equalTo; -import static org.junit.Assert.assertThat; -public class AnalyzersTest { +public class AnalyzersTest extends BaseTokenStreamTestCase { @Test public void shouldGiveCorrectWordsForEnglish() throws IOException { @@ -116,4 +122,44 @@ public class AnalyzersTest { assertThat(result, equalTo(answer)); } + + @Test + public void testPositionIncrement() throws IOException { + EnglishAnalyzer englishAnalyzer = new EnglishAnalyzer(); + assertTokenStreamContents( + englishAnalyzer.tokenStream("test", "There are tests!"), + new String[]{"there", "are", "be", "test"}, + new int[]{0, 6, 6, 10}, + new int[]{5, 9, 9, 15}, + new String[]{"", "", "", ""}, + new int[]{1, 1, 0, 1} + ); + } + + @Test + public void testKeywordHandling() throws IOException { + Analyzer analyzer = new EnglishKeywordTestAnalyzer(); + assertTokenStreamContents( + analyzer.tokenStream("test", "Tests shouldn't be stemmed, but tests should!"), + new String[]{"tests", "shouldn't", "be", "stem", "but", "test", "shall"} + ); + } + + private static class EnglishKeywordTestAnalyzer extends Analyzer { + @Override + protected TokenStreamComponents createComponents(String s) { + StandardTokenizer src = new StandardTokenizer(); + TokenFilter filter = new StandardFilter(src); + CharArraySet dontStem = new CharArraySet(1, false); + dontStem.add("Tests"); + filter = new SetKeywordMarkerFilter(filter, dontStem); + filter = new LowerCaseFilter(filter); + try { + filter = new MorphologyFilter(filter, new EnglishLuceneMorphology()); + } catch (IOException ex) { + throw new RuntimeException("cannot create EnglishLuceneMorphology", ex); + } + return new TokenStreamComponents(src, filter); + } + } } diff --git a/english/pom.xml b/english/pom.xml index 602c030..1daa52e 100644 --- a/english/pom.xml +++ b/english/pom.xml @@ -19,11 +19,5 @@ 1.2-SNAPSHOT - - junit - junit - 4.8.2 - test - \ No newline at end of file diff --git a/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyFilter.java b/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyFilter.java index 5af6c1d..fa84cd6 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyFilter.java +++ b/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyFilter.java @@ -19,18 +19,22 @@ package org.apache.lucene.morphology.analyzer; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.morphology.LuceneMorphology; import java.io.IOException; import java.util.Iterator; +import java.util.List; public class MorphologyFilter extends TokenFilter { private LuceneMorphology luceneMorph; private Iterator iterator; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); private final PositionIncrementAttribute position = addAttribute(PositionIncrementAttribute.class); + private State state = null; public MorphologyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) { super(tokenStream); @@ -39,27 +43,39 @@ public class MorphologyFilter extends TokenFilter { final public boolean incrementToken() throws IOException { - boolean oldToken = true; - while (iterator == null || !iterator.hasNext()) { + if (iterator != null) { + if (iterator.hasNext()) { + restoreState(state); + position.setPositionIncrement(0); + termAtt.setEmpty().append(iterator.next()); + return true; + } else { + state = null; + iterator = null; + } + } + while (true) { boolean b = input.incrementToken(); if (!b) { return false; } - String s = new String(termAtt.buffer(), 0, termAtt.length()); - if (luceneMorph.checkString(s)) { - oldToken = false; - iterator = luceneMorph.getNormalForms(s).iterator(); - } else { - return true; + if (!keywordAttr.isKeyword() && termAtt.length() > 0) { + String s = new String(termAtt.buffer(), 0, termAtt.length()); + if (luceneMorph.checkString(s)) { + List forms = luceneMorph.getNormalForms(s); + if (forms.isEmpty()) { + continue; + } else if (forms.size() == 1) { + termAtt.setEmpty().append(forms.get(0)); + } else { + state = captureState(); + iterator = forms.iterator(); + termAtt.setEmpty().append(iterator.next()); + } + } } + return true; } - String s = iterator.next(); - termAtt.setEmpty(); - termAtt.append(s); - if (oldToken) { - position.setPositionIncrement(0); - } - return true; } } diff --git a/pom.xml b/pom.xml index e6f2f0f..fdd1236 100644 --- a/pom.xml +++ b/pom.xml @@ -15,6 +15,10 @@ HEAD + + 5.1.0 + + bintray @@ -24,9 +28,9 @@ - junit - junit - 4.8.2 + org.apache.lucene + lucene-test-framework + ${lucene.version} test @@ -38,12 +42,12 @@ org.apache.lucene lucene-core - 5.1.0 + ${lucene.version} org.apache.lucene lucene-analyzers-common - 5.1.0 + ${lucene.version}