Fix the faulty search for the rule id

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@84 d817d54c-26ab-11de-abc9-2f7d1455ff7a
2009-10-21 13:43:50 +00:00
parent 1c1e5d6354
commit e478d86fe0
11 changed files with 669430 additions and 397714 deletions
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/EnglishHeuristicBuilder.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/EnglishHeuristicBuilder.java
@@ -28,7 +28,7 @@ import java.util.HashSet;
 public class EnglishHeuristicBuilder {
    public static void main(String[] args) throws IOException {
-        GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morphology/egramtab.tab");
+        GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/egramtab.tab");
        DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>());
        EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianHeuristicBuilder.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianHeuristicBuilder.java
@@ -27,7 +27,7 @@ import java.util.HashSet;
 public class RussianHeuristicBuilder {
    public static void main(String[] args) throws IOException {
-        GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morphology/rgramtab.tab");
+        GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
        DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());
        RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
--- a/english/src/test/resources/org/apache/lucene/morphology/english/english-morphology-test.txt
+++ b/english/src/test/resources/org/apache/lucene/morphology/english/english-morphology-test.txt
@@ -1,4 +1,4 @@
-purchases purchas
+purchases purchase
 existing exist
 was be
 men man
--- a/morph/src/main/java/org/apache/lucene/morphology/Morphology.java
+++ b/morph/src/main/java/org/apache/lucene/morphology/Morphology.java
@@ -18,6 +18,7 @@ package org.apache.lucene.morphology;
 import java.io.*;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
@@ -102,7 +103,7 @@ public class Morphology {
            int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1);
            if (i3 != 0) return i3;
        }
-        return i2.length - i1.length;
+        return i1.length - i2.length;
    }
    public void writeToFile(String fileName) throws IOException {
@@ -186,6 +187,7 @@ public class Morphology {
    }
    private void readSeparators(BufferedReader bufferedReader, Integer amount) throws IOException {
        HashSet intetger = new HashSet<Integer>();
        separators = new int[amount][];
        for (int i = 0; i < amount; i++) {
            String s1 = bufferedReader.readLine();
@@ -194,6 +196,7 @@ public class Morphology {
            for (int j = 0; j < wordLenght; j++) {
                separators[i][j] = Integer.valueOf(bufferedReader.readLine());
            }
            intetger.add(separators[i][0]);
        }
    }
--- a/russian/src/main/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoder.java
+++ b/russian/src/main/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoder.java
@@ -30,14 +30,15 @@ import java.util.ArrayList;
 */
 public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
    public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
-    static public int SUFFIX_LENGTH = 6;
+    public static final int WORD_PART_LENGHT = 6;
    public static final int EE_CHAR = 34;
    public static final int E_CHAR = 6;
    public static final int DASH_CHAR = 45;
    public static final int DASH_CODE = 33;
    public Integer encode(String string) {
-        if (string.length() > 6) throw new SuffixToLongException("Suffix length should not be greater then " + 12);
+        if (string.length() > WORD_PART_LENGHT)
            throw new SuffixToLongException("Suffix length should not be greater then " + WORD_PART_LENGHT + " " + string);
        int result = 0;
        for (int i = 0; i < string.length(); i++) {
            int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET;
@@ -49,7 +50,7 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
                throw new WrongCharaterException("Symblo " + string.charAt(i) + " is not small cirillic letter");
            result = result * 34 + c;
        }
-        for (int i = string.length(); i < 6; i++) {
+        for (int i = string.length(); i < WORD_PART_LENGHT; i++) {
            result *= 34;
        }
        return result;
@@ -57,9 +58,9 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
    public int[] encodeToArray(String s) {
        ArrayList<Integer> integers = new ArrayList<Integer>();
-        while (s.length() > 6) {
+        while (s.length() > WORD_PART_LENGHT) {
-            integers.add(encode(s.substring(0, 6)));
+            integers.add(encode(s.substring(0, WORD_PART_LENGHT)));
-            s = s.substring(6);
+            s = s.substring(WORD_PART_LENGHT);
        }
        integers.add(encode(s));
        int[] ints = new int[integers.size()];
@@ -116,6 +117,6 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
    }
    public String cleanString(String s) {
-        return s.replace((char) (34 + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (6 + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET));
+        return s.replace((char) (EE_CHAR + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (E_CHAR + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET));
    }
 }
--- a/russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info
+++ b/russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info
--- a/russian/src/test/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoderTest.java
+++ b/russian/src/test/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoderTest.java
@@ -35,6 +35,22 @@ public class RussianLetterDecoderEncoderTest {
        decoderEncoder = new RussianLetterDecoderEncoder();
    }
    /**
     * Checks that {@code encode} keeps the ordering of the word pairs listed in
     * {@code decoder-test-monotonic.txt}: for every line of the form
     * "first second", the code of the second word must be strictly greater
     * than the code of the first word.
     * Pairs in which either word is longer than
     * {@code RussianLetterDecoderEncoder.WORD_PART_LENGHT} are skipped.
     *
     * @throws IOException if reading the test resource fails
     */
    @Test
    public void testShouldPreserStringComporision() throws IOException {
        // The test data is loaded from the classpath and decoded as UTF-8.
        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-monotonic.txt");
        // NOTE(review): neither the reader nor the stream is closed in this method.
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
        String s = bufferedReader.readLine();
        // readLine() returns null at end of input, which ends the loop.
        while (s != null) {
            // qa[0] is the word expected to encode lower, qa[1] the one expected to encode higher.
            // NOTE(review): a line without a space yields a one-element array, so qa[1] would fail - confirm the data file has no blank lines.
            String[] qa = s.trim().split(" ");
            // Only pairs where both words fit into a single encoded part are compared.
            if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT && qa[1].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) {
                assertThat(decoderEncoder.encode(qa[1]) > decoderEncoder.encode(qa[0]), equalTo(true));
            }
            s = bufferedReader.readLine();
        }
    }
    @Test
    public void testShouldCorretDecodeEncode() throws IOException {
        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data.txt");
@@ -42,8 +58,10 @@ public class RussianLetterDecoderEncoderTest {
        String s = bufferedReader.readLine();
        while (s != null) {
            String[] qa = s.trim().split(" ");
            if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) {
                Integer ecodedSuffix = decoderEncoder.encode(qa[0]);
                assertThat(decoderEncoder.decode(ecodedSuffix), equalTo(qa[1]));
            }
            s = bufferedReader.readLine();
        }
    }
--- a/russian/src/test/resources/org/apache/lucene/morphology/russian/decoder-test-data-for-array.txt
+++ b/russian/src/test/resources/org/apache/lucene/morphology/russian/decoder-test-data-for-array.txt
@@ -11,3 +11,5 @@
 аааааааааааааааааа аааааааааааааааааа
 ааааааааааааааааа ааааааааааааааааа
 йфячыцувс йфячыцувс
 ёёё еее
 ёёёе ееее
--- a/russian/src/test/resources/org/apache/lucene/morphology/russian/decoder-test-data.txt
+++ b/russian/src/test/resources/org/apache/lucene/morphology/russian/decoder-test-data.txt
@@ -1,3 +1,5 @@
 яяя яяя
 юяю юяю
 тест тест
 ёж еж
 естера естера
--- a/russian/src/test/resources/org/apache/lucene/morphology/russian/decoder-test-monotonic.txt
+++ b/russian/src/test/resources/org/apache/lucene/morphology/russian/decoder-test-monotonic.txt
@@ -0,0 +1,7 @@
 а аа
 ааа ббб
 ммм нннн
 ммм ммн
 аа ба
 ииа к
 удд уде
--- a/russian/src/test/resources/org/apache/lucene/morphology/russian/russian-morphology-test.txt
+++ b/russian/src/test/resources/org/apache/lucene/morphology/russian/russian-morphology-test.txt
@@ -11,7 +11,7 @@
 пушек пушка
 козлов козлов козловый козел
 жуков жуков жук
-красив красить
+красив красить красивый
 красивая красивый
 тосклив тоскливый
 лучший хороший