From 48ae7d4cb6de9270c09e1d224993b54762552b4c Mon Sep 17 00:00:00 2001 From: "alexander.a.kuznetsov" Date: Mon, 13 Apr 2009 14:04:53 +0000 Subject: [PATCH] test for suffix evristics git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@11 d817d54c-26ab-11de-abc9-2f7d1455ff7a --- .../RussianSuffixDecoderEncoder.java | 19 ++++++++++---- .../morphology/analayzer/SuffixEvristics.java | 17 ++++++++++++- .../RussianSuffixDecoderEncoderTest.java | 2 +- .../analayzer/ArrayEvristicsTest.java | 12 --------- .../analayzer/SuffixEvristicsTest.java | 25 +++++++++++++++++++ .../analayzer/suffix-evristics-test-data.txt | 7 ++++++ .../russian/morphology/decoder-test-data.txt | 8 +++--- 7 files changed, 67 insertions(+), 23 deletions(-) delete mode 100644 src/test/java/org/apache/lucene/russian/morphology/analayzer/ArrayEvristicsTest.java create mode 100644 src/test/java/org/apache/lucene/russian/morphology/analayzer/SuffixEvristicsTest.java create mode 100644 src/test/resources/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt diff --git a/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java b/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java index 936c24b..0ff57af 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java +++ b/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java @@ -4,15 +4,15 @@ package org.apache.lucene.russian.morphology; * This helper class allow encode suffix of russian word * to long value and decode from it. * Assumed that suffix contains only small russian letters and dash. - * Also assumed that letter å and ¸ coinsed. + * Also assumed that letter � and � coinsed. */ public class RussianSuffixDecoderEncoder { public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071; public static final int SUFFIX_LENGTH = 7; - private static final int EE_CHAR = 34; - private static final int E_CHAR = 6; - private static final int DASH_CHAR = 45; - private static final int DASH_CODE = 33; + public static final int EE_CHAR = 34; + public static final int E_CHAR = 6; + public static final int DASH_CHAR = 45; + public static final int DASH_CODE = 33; static public Long encode(String string) { @@ -43,4 +43,13 @@ public class RussianSuffixDecoderEncoder { result = (char) c + result; return result; } + + static public boolean checkCharacter(char c){ + int code = 0 + c; + if(code == 45) return true; + code -= RUSSIAN_SMALL_LETTER_OFFSET; + if(code == 34) return true; + if(code > 0 && code < 33) return true; + return false; + } } diff --git a/src/main/java/org/apache/lucene/russian/morphology/analayzer/SuffixEvristics.java b/src/main/java/org/apache/lucene/russian/morphology/analayzer/SuffixEvristics.java index a4de266..e593511 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/analayzer/SuffixEvristics.java +++ b/src/main/java/org/apache/lucene/russian/morphology/analayzer/SuffixEvristics.java @@ -4,6 +4,7 @@ import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder; import java.io.*; import java.util.Arrays; +import java.util.HashSet; public class SuffixEvristics { @@ -43,7 +44,11 @@ public class SuffixEvristics { public String getCanonicalForm(String form) { int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0; - Long suffix = RussianSuffixDecoderEncoder.encode(form.substring(startSymbol)); + String suffixS = form.substring(startSymbol); + + if(!chechSuffix(suffixS)) return form; + + Long suffix = RussianSuffixDecoderEncoder.encode(suffixS); int index = Arrays.binarySearch(keys,suffix); if(index < -1){ @@ -54,4 +59,14 @@ public class SuffixEvristics { return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix; } } + + + private boolean chechSuffix(String suffix){ + for(int i = 0; i < suffix.length(); i++){ + if (!RussianSuffixDecoderEncoder.checkCharacter(suffix.charAt(i))) return false; + } + return true; + } + + } diff --git a/src/test/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoderTest.java b/src/test/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoderTest.java index 6cfb985..bac6fc7 100644 --- a/src/test/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoderTest.java +++ b/src/test/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoderTest.java @@ -16,7 +16,7 @@ public class RussianSuffixDecoderEncoderTest { @Test public void testShouldCorretDecodeEncode() throws IOException { InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/decoder-test-data.txt"); - BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream)); + BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream,"UTF-8")); String s = bufferedReader.readLine(); while(s != null){ String[] qa = s.trim().split(" "); diff --git a/src/test/java/org/apache/lucene/russian/morphology/analayzer/ArrayEvristicsTest.java b/src/test/java/org/apache/lucene/russian/morphology/analayzer/ArrayEvristicsTest.java deleted file mode 100644 index b9ade50..0000000 --- a/src/test/java/org/apache/lucene/russian/morphology/analayzer/ArrayEvristicsTest.java +++ /dev/null @@ -1,12 +0,0 @@ -package org.apache.lucene.russian.morphology.analayzer; - -import org.junit.Test; - - -public class ArrayEvristicsTest { - - @Test - public void testShouldDefineCorretCononicalWordForm(){ - - } -} diff --git a/src/test/java/org/apache/lucene/russian/morphology/analayzer/SuffixEvristicsTest.java b/src/test/java/org/apache/lucene/russian/morphology/analayzer/SuffixEvristicsTest.java new file mode 100644 index 0000000..0d6e367 --- /dev/null +++ b/src/test/java/org/apache/lucene/russian/morphology/analayzer/SuffixEvristicsTest.java @@ -0,0 +1,25 @@ +package org.apache.lucene.russian.morphology.analayzer; + +import org.junit.Test; +import static org.junit.Assert.assertThat; +import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder; +import static org.hamcrest.core.IsEqual.equalTo; + +import java.io.*; + + +public class SuffixEvristicsTest { + + @Test + public void testShouldDefineCorretCononicalWordForm() throws IOException { + SuffixEvristics suffixEvristics = new SuffixEvristics(); + InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt"); + BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream,"UTF-8")); + String s = bufferedReader.readLine(); + while(s != null){ + String[] qa = s.trim().split(" "); + assertThat(suffixEvristics.getCanonicalForm(qa[0]),equalTo(qa[1])); + s = bufferedReader.readLine(); + } + } +} diff --git a/src/test/resources/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt b/src/test/resources/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt new file mode 100644 index 0000000..7f38d03 --- /dev/null +++ b/src/test/resources/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt @@ -0,0 +1,7 @@ +шел идти +туреÑтичеÑких туреÑтичеÑкий +отзывы отзыв +победы победа +поÑтичеÑÐºÐ°Ñ Ð¿Ð¾ÑтичеÑкий +произошло произойти +test test \ No newline at end of file diff --git a/src/test/resources/org/apache/lucene/russian/morphology/decoder-test-data.txt b/src/test/resources/org/apache/lucene/russian/morphology/decoder-test-data.txt index 53157c3..a7381ab 100644 --- a/src/test/resources/org/apache/lucene/russian/morphology/decoder-test-data.txt +++ b/src/test/resources/org/apache/lucene/russian/morphology/decoder-test-data.txt @@ -1,4 +1,4 @@ -òåñò òåñò -¸æ åæ -òåñòåðà òåñòåðà -÷òî-òî ÷òî-òî \ No newline at end of file +теÑÑ‚ теÑÑ‚ +ёж еж +теÑтера теÑтера +что-то что-то \ No newline at end of file