From 595b25ab6572de647422c669465508f331267d9e Mon Sep 17 00:00:00 2001
From: "alexander.a.kuznetsov"
Date: Sat, 11 Apr 2009 21:03:18 +0000
Subject: [PATCH] fixed filter

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@5 d817d54c-26ab-11de-abc9-2f7d1455ff7a
---
 .../morphology/analayzer/RussianMorphlogyFilter.java | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java b/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java
index 972467b..21ea5be 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java
@@ -10,14 +10,20 @@ import java.io.IOException;
 public class RussianMorphlogyFilter extends TokenFilter {
     private ArrayEvristics arrayEvristics;
 
-    protected RussianMorphlogyFilter(TokenStream tokenStream, ArrayEvristics arrayEvristics) {
+    public RussianMorphlogyFilter(TokenStream tokenStream, ArrayEvristics arrayEvristics) throws IOException {
         super(tokenStream);
         this.arrayEvristics = arrayEvristics;
     }
 
     public Token next(final Token reusableToken) throws IOException {
-        assert reusableToken != null;
-        return createToken(arrayEvristics.getCanonicalForm(reusableToken.term()), reusableToken, reusableToken);
+        Token nextToken = input.next(reusableToken);
+        if(nextToken == null || nextToken.term().length() == 0) return nextToken;
+        Character testC = nextToken.term().charAt(0);
+        if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC){
+            return nextToken;
+        }
+        Token current = (Token) nextToken.clone();
+        return createToken(arrayEvristics.getCanonicalForm(nextToken.term()), current, reusableToken);
     }
 
     protected Token createToken(String synonym, Token current, final Token reusableToken) {