Fix incorrect search for rule id

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@84 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
alexander.a.kuznetsov 2009-10-21 13:43:50 +00:00
parent 1c1e5d6354
commit e478d86fe0
11 changed files with 669430 additions and 397714 deletions

View File

@ -28,7 +28,7 @@ import java.util.HashSet;
public class EnglishHeuristicBuilder { public class EnglishHeuristicBuilder {
public static void main(String[] args) throws IOException { public static void main(String[] args) throws IOException {
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morphology/egramtab.tab"); GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/egramtab.tab");
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>()); DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>());
EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder(); EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();

View File

@ -27,7 +27,7 @@ import java.util.HashSet;
public class RussianHeuristicBuilder { public class RussianHeuristicBuilder {
public static void main(String[] args) throws IOException { public static void main(String[] args) throws IOException {
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morphology/rgramtab.tab"); GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>()); DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());
RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder(); RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();

View File

@ -1,4 +1,4 @@
purchases purchas purchases purchase
existing exist existing exist
was be was be
men man men man

View File

@ -18,6 +18,7 @@ package org.apache.lucene.morphology;
import java.io.*; import java.io.*;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet;
import java.util.List; import java.util.List;
@ -102,7 +103,7 @@ public class Morphology {
int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1); int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1);
if (i3 != 0) return i3; if (i3 != 0) return i3;
} }
return i2.length - i1.length; return i1.length - i2.length;
} }
public void writeToFile(String fileName) throws IOException { public void writeToFile(String fileName) throws IOException {
@ -186,6 +187,7 @@ public class Morphology {
} }
private void readSeparators(BufferedReader bufferedReader, Integer amount) throws IOException { private void readSeparators(BufferedReader bufferedReader, Integer amount) throws IOException {
HashSet intetger = new HashSet<Integer>();
separators = new int[amount][]; separators = new int[amount][];
for (int i = 0; i < amount; i++) { for (int i = 0; i < amount; i++) {
String s1 = bufferedReader.readLine(); String s1 = bufferedReader.readLine();
@ -194,6 +196,7 @@ public class Morphology {
for (int j = 0; j < wordLenght; j++) { for (int j = 0; j < wordLenght; j++) {
separators[i][j] = Integer.valueOf(bufferedReader.readLine()); separators[i][j] = Integer.valueOf(bufferedReader.readLine());
} }
intetger.add(separators[i][0]);
} }
} }

View File

@ -30,14 +30,15 @@ import java.util.ArrayList;
*/ */
public class RussianLetterDecoderEncoder implements LetterDecoderEncoder { public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071; public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
static public int SUFFIX_LENGTH = 6; public static final int WORD_PART_LENGHT = 6;
public static final int EE_CHAR = 34; public static final int EE_CHAR = 34;
public static final int E_CHAR = 6; public static final int E_CHAR = 6;
public static final int DASH_CHAR = 45; public static final int DASH_CHAR = 45;
public static final int DASH_CODE = 33; public static final int DASH_CODE = 33;
public Integer encode(String string) { public Integer encode(String string) {
if (string.length() > 6) throw new SuffixToLongException("Suffix length should not be greater then " + 12); if (string.length() > WORD_PART_LENGHT)
throw new SuffixToLongException("Suffix length should not be greater then " + WORD_PART_LENGHT + " " + string);
int result = 0; int result = 0;
for (int i = 0; i < string.length(); i++) { for (int i = 0; i < string.length(); i++) {
int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET; int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET;
@ -49,7 +50,7 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
throw new WrongCharaterException("Symblo " + string.charAt(i) + " is not small cirillic letter"); throw new WrongCharaterException("Symblo " + string.charAt(i) + " is not small cirillic letter");
result = result * 34 + c; result = result * 34 + c;
} }
for (int i = string.length(); i < 6; i++) { for (int i = string.length(); i < WORD_PART_LENGHT; i++) {
result *= 34; result *= 34;
} }
return result; return result;
@ -57,9 +58,9 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
public int[] encodeToArray(String s) { public int[] encodeToArray(String s) {
ArrayList<Integer> integers = new ArrayList<Integer>(); ArrayList<Integer> integers = new ArrayList<Integer>();
while (s.length() > 6) { while (s.length() > WORD_PART_LENGHT) {
integers.add(encode(s.substring(0, 6))); integers.add(encode(s.substring(0, WORD_PART_LENGHT)));
s = s.substring(6); s = s.substring(WORD_PART_LENGHT);
} }
integers.add(encode(s)); integers.add(encode(s));
int[] ints = new int[integers.size()]; int[] ints = new int[integers.size()];
@ -116,6 +117,6 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
} }
public String cleanString(String s) { public String cleanString(String s) {
return s.replace((char) (34 + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (6 + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET)); return s.replace((char) (EE_CHAR + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (E_CHAR + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET));
} }
} }

View File

@ -35,6 +35,22 @@ public class RussianLetterDecoderEncoderTest {
decoderEncoder = new RussianLetterDecoderEncoder(); decoderEncoder = new RussianLetterDecoderEncoder();
} }
@Test
public void testShouldPreserStringComporision() throws IOException {
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-monotonic.txt");
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
String s = bufferedReader.readLine();
while (s != null) {
String[] qa = s.trim().split(" ");
if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT && qa[1].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) {
assertThat(decoderEncoder.encode(qa[1]) > decoderEncoder.encode(qa[0]), equalTo(true));
}
s = bufferedReader.readLine();
}
}
@Test @Test
public void testShouldCorretDecodeEncode() throws IOException { public void testShouldCorretDecodeEncode() throws IOException {
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data.txt"); InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data.txt");
@ -42,8 +58,10 @@ public class RussianLetterDecoderEncoderTest {
String s = bufferedReader.readLine(); String s = bufferedReader.readLine();
while (s != null) { while (s != null) {
String[] qa = s.trim().split(" "); String[] qa = s.trim().split(" ");
if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) {
Integer ecodedSuffix = decoderEncoder.encode(qa[0]); Integer ecodedSuffix = decoderEncoder.encode(qa[0]);
assertThat(decoderEncoder.decode(ecodedSuffix), equalTo(qa[1])); assertThat(decoderEncoder.decode(ecodedSuffix), equalTo(qa[1]));
}
s = bufferedReader.readLine(); s = bufferedReader.readLine();
} }
} }

View File

@ -11,3 +11,5 @@
аааааааааааааааааа аааааааааааааааааа аааааааааааааааааа аааааааааааааааааа
ааааааааааааааааа ааааааааааааааааа ааааааааааааааааа ааааааааааааааааа
йфячыцувс йфячыцувс йфячыцувс йфячыцувс
ёёё еее
ёёёе ееее

View File

@ -1,3 +1,5 @@
яяя яяя
юяю юяю
тест тест тест тест
ёж еж ёж еж
естера естера естера естера

View File

@ -0,0 +1,7 @@
а аа
ааа ббб
ммм нннн
ммм ммн
аа ба
ииа к
удд уде

View File

@ -11,7 +11,7 @@
пушек пушка пушек пушка
козлов козлов козловый козел козлов козлов козловый козел
жуков жуков жук жуков жуков жук
красив красить красив красить красивый
красивая красивый красивая красивый
тосклив тоскливый тосклив тоскливый
лучший хороший лучший хороший