From 5c7c6297460b78f523966a7a80d105a5372f8efc Mon Sep 17 00:00:00 2001
From: "alexander.a.kuznetsov"
Date: Sun, 12 Apr 2009 19:16:05 +0000
Subject: [PATCH] working on analayzer and test

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@9 d817d54c-26ab-11de-abc9-2f7d1455ff7a
---
 .../morphology/analayzer/ArrayEvristics.java |  2 +-
 .../analayzer/RussianMorphlogyAnalayzer.java | 19 +++++++++++++++++++
 .../analayzer/RussianMorphlogyFilter.java    |  5 +++--
 .../morphology}/russianSuffixesEvristics.txt |  0
 .../RussianSuffixDecoderEncoderTest.java     |  2 +-
 .../analayzer/ArrayEvristicsTest.java        | 12 ++++++++++++
 .../analayzer}/decoder-test-data.txt         |  0
 7 files changed, 36 insertions(+), 4 deletions(-)
 create mode 100644 src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzer.java
 rename src/main/resources/{ => org/apache/lucene/russian/morphology}/russianSuffixesEvristics.txt (100%)
 create mode 100644 src/test/java/org/apache/lucene/russian/morphology/analayzer/ArrayEvristicsTest.java
 rename src/test/resources/{ => org/apache/lucene/russian/morphology/analayzer}/decoder-test-data.txt (100%)

diff --git a/src/main/java/org/apache/lucene/russian/morphology/analayzer/ArrayEvristics.java b/src/main/java/org/apache/lucene/russian/morphology/analayzer/ArrayEvristics.java
index 28ee763..565607a 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/analayzer/ArrayEvristics.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/analayzer/ArrayEvristics.java
@@ -25,7 +25,7 @@ public class ArrayEvristics {
     }
 
     public void readFromResource() throws IOException {
-        InputStream stream = this.getClass().getResourceAsStream("/russianSuffixesEvristics.txt");
+        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/russianSuffixesEvristics.txt");
         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream));
         readFromBufferedRreader(bufferedReader);
     }
diff --git a/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzer.java b/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzer.java
new file mode 100644
index 0000000..c5f3e15
--- /dev/null
+++ b/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzer.java
@@ -0,0 +1,19 @@
+package org.apache.lucene.russian.morphology.analayzer;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+
+import java.io.Reader;
+import java.io.IOException;
+
+public class RussianMorphlogyAnalayzer extends Analyzer {
+    private ArrayEvristics arrayEvristics;
+
+    public RussianMorphlogyAnalayzer() throws IOException {
+        arrayEvristics = new ArrayEvristics();
+    }
+
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+        return null; //To change body of implemented methods use File | Settings | File Templates.
+    }
+}
diff --git a/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java b/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java
index 21ea5be..fd63b0e 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java
@@ -18,12 +18,13 @@ public class RussianMorphlogyFilter extends TokenFilter {
     public Token next(final Token reusableToken) throws IOException {
         Token nextToken = input.next(reusableToken);
         if(nextToken == null || nextToken.term().length() == 0) return nextToken;
-        Character testC = nextToken.term().charAt(0);
+        String word = nextToken.term().toLowerCase();
+        Character testC = word.charAt(0);
         if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC){
             return nextToken;
         }
         Token current = (Token) nextToken.clone();
-        return createToken(arrayEvristics.getCanonicalForm(nextToken.term()), current, reusableToken);
+        return createToken(arrayEvristics.getCanonicalForm(word), current, reusableToken);
     }
 
     protected Token createToken(String synonym, Token current, final Token reusableToken) {
diff --git a/src/main/resources/russianSuffixesEvristics.txt b/src/main/resources/org/apache/lucene/russian/morphology/russianSuffixesEvristics.txt
similarity index 100%
rename from src/main/resources/russianSuffixesEvristics.txt
rename to src/main/resources/org/apache/lucene/russian/morphology/russianSuffixesEvristics.txt
diff --git a/src/test/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoderTest.java b/src/test/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoderTest.java
index d6be88f..ce7aca4 100644
--- a/src/test/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoderTest.java
+++ b/src/test/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoderTest.java
@@ -15,7 +15,7 @@ public class RussianSuffixDecoderEncoderTest {
 
     @Test
     public void testShouldCorretDecodeEncode() throws IOException {
-        InputStream stream = this.getClass().getResourceAsStream("/decoder-test-data.txt");
+        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/decoder-test-data.txt");
         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream));
         String s = bufferedReader.readLine();
         while(s != null){
diff --git a/src/test/java/org/apache/lucene/russian/morphology/analayzer/ArrayEvristicsTest.java b/src/test/java/org/apache/lucene/russian/morphology/analayzer/ArrayEvristicsTest.java
new file mode 100644
index 0000000..b9ade50
--- /dev/null
+++ b/src/test/java/org/apache/lucene/russian/morphology/analayzer/ArrayEvristicsTest.java
@@ -0,0 +1,12 @@
+package org.apache.lucene.russian.morphology.analayzer;
+
+import org.junit.Test;
+
+
+public class ArrayEvristicsTest {
+
+    @Test
+    public void testShouldDefineCorretCononicalWordForm(){
+
+    }
+}
diff --git a/src/test/resources/decoder-test-data.txt b/src/test/resources/org/apache/lucene/russian/morphology/analayzer/decoder-test-data.txt
similarity index 100%
rename from src/test/resources/decoder-test-data.txt
rename to src/test/resources/org/apache/lucene/russian/morphology/analayzer/decoder-test-data.txt