Fix incorrect lookup of rule id

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@84 d817d54c-26ab-11de-abc9-2f7d1455ff7a
2009-10-21 13:43:50 +00:00
parent 1c1e5d6354
commit e478d86fe0
11 changed files with 669429 additions and 397713 deletions
@@ -28,7 +28,7 @@ import java.util.HashSet;
 public class EnglishHeuristicBuilder {
    public static void main(String[] args) throws IOException {

-        GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morphology/egramtab.tab");
+        GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/egramtab.tab");
        DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>());

        EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
@@ -27,7 +27,7 @@ import java.util.HashSet;

 public class RussianHeuristicBuilder {
    public static void main(String[] args) throws IOException {
-        GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morphology/rgramtab.tab");
+        GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
        DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());

        RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
@@ -1,4 +1,4 @@
-purchases purchas
+purchases purchase
 existing exist
 was be
 men man
@@ -18,6 +18,7 @@ package org.apache.lucene.morphology;

 import java.io.*;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;


@@ -102,7 +103,7 @@ public class Morphology {
            int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1);
            if (i3 != 0) return i3;
        }
-        return i2.length - i1.length;
+        return i1.length - i2.length;
    }

    public void writeToFile(String fileName) throws IOException {
@@ -186,6 +187,7 @@ public class Morphology {
    }

    private void readSeparators(BufferedReader bufferedReader, Integer amount) throws IOException {
+        HashSet intetger = new HashSet<Integer>();
        separators = new int[amount][];
        for (int i = 0; i < amount; i++) {
            String s1 = bufferedReader.readLine();
@@ -194,6 +196,7 @@ public class Morphology {
            for (int j = 0; j < wordLenght; j++) {
                separators[i][j] = Integer.valueOf(bufferedReader.readLine());
            }
+            intetger.add(separators[i][0]);
        }
    }

@@ -30,14 +30,15 @@ import java.util.ArrayList;
 */
 public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
    public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
-    static public int SUFFIX_LENGTH = 6;
+    public static final int WORD_PART_LENGHT = 6;
    public static final int EE_CHAR = 34;
    public static final int E_CHAR = 6;
    public static final int DASH_CHAR = 45;
    public static final int DASH_CODE = 33;

    public Integer encode(String string) {
-        if (string.length() > 6) throw new SuffixToLongException("Suffix length should not be greater then " + 12);
+        if (string.length() > WORD_PART_LENGHT)
+            throw new SuffixToLongException("Suffix length should not be greater then " + WORD_PART_LENGHT + " " + string);
        int result = 0;
        for (int i = 0; i < string.length(); i++) {
            int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET;
@@ -49,7 +50,7 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
                throw new WrongCharaterException("Symblo " + string.charAt(i) + " is not small cirillic letter");
            result = result * 34 + c;
        }
-        for (int i = string.length(); i < 6; i++) {
+        for (int i = string.length(); i < WORD_PART_LENGHT; i++) {
            result *= 34;
        }
        return result;
@@ -57,9 +58,9 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {

    public int[] encodeToArray(String s) {
        ArrayList<Integer> integers = new ArrayList<Integer>();
-        while (s.length() > 6) {
-            integers.add(encode(s.substring(0, 6)));
-            s = s.substring(6);
+        while (s.length() > WORD_PART_LENGHT) {
+            integers.add(encode(s.substring(0, WORD_PART_LENGHT)));
+            s = s.substring(WORD_PART_LENGHT);
        }
        integers.add(encode(s));
        int[] ints = new int[integers.size()];
@@ -116,6 +117,6 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
    }

    public String cleanString(String s) {
-        return s.replace((char) (34 + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (6 + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET));
+        return s.replace((char) (EE_CHAR + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (E_CHAR + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET));
    }
 }
@@ -35,6 +35,22 @@ public class RussianLetterDecoderEncoderTest {
        decoderEncoder = new RussianLetterDecoderEncoder();
    }

+
+    @Test
+    public void testShouldPreserStringComporision() throws IOException {
+        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-monotonic.txt");
+        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
+        String s = bufferedReader.readLine();
+        while (s != null) {
+            String[] qa = s.trim().split(" ");
+            if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT && qa[1].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) {
+                assertThat(decoderEncoder.encode(qa[1]) > decoderEncoder.encode(qa[0]), equalTo(true));
+            }
+            s = bufferedReader.readLine();
+        }
+    }
+
+
    @Test
    public void testShouldCorretDecodeEncode() throws IOException {
        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data.txt");
@@ -42,8 +58,10 @@ public class RussianLetterDecoderEncoderTest {
        String s = bufferedReader.readLine();
        while (s != null) {
            String[] qa = s.trim().split(" ");
-            Integer ecodedSuffix = decoderEncoder.encode(qa[0]);
-            assertThat(decoderEncoder.decode(ecodedSuffix), equalTo(qa[1]));
+            if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) {
+                Integer ecodedSuffix = decoderEncoder.encode(qa[0]);
+                assertThat(decoderEncoder.decode(ecodedSuffix), equalTo(qa[1]));
+            }
            s = bufferedReader.readLine();
        }
    }
@@ -11,3 +11,5 @@
 аааааааааааааааааа аааааааааааааааааа
 ааааааааааааааааа ааааааааааааааааа
 йфячыцувс йфячыцувс
+ёёё еее
+ёёёе ееее
@@ -1,3 +1,5 @@
+яяя яяя
+юяю юяю
 тест тест
 ёж еж
 естера естера
@@ -0,0 +1,7 @@
+а аа
+ааа ббб
+ммм нннн
+ммм ммн
+аа ба
+ииа к
+удд уде
@@ -11,7 +11,7 @@
 пушек пушка
 козлов козлов козловый козел
 жуков жуков жук
-красив красить
+красив красить красивый
 красивая красивый
 тосклив тоскливый
 лучший хороший