Fix incorrect search for rule id
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@84 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
parent
1c1e5d6354
commit
e478d86fe0
@ -28,7 +28,7 @@ import java.util.HashSet;
|
|||||||
public class EnglishHeuristicBuilder {
|
public class EnglishHeuristicBuilder {
|
||||||
public static void main(String[] args) throws IOException {
|
public static void main(String[] args) throws IOException {
|
||||||
|
|
||||||
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morphology/egramtab.tab");
|
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/egramtab.tab");
|
||||||
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>());
|
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>());
|
||||||
|
|
||||||
EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
|
EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
|
||||||
|
@ -27,7 +27,7 @@ import java.util.HashSet;
|
|||||||
|
|
||||||
public class RussianHeuristicBuilder {
|
public class RussianHeuristicBuilder {
|
||||||
public static void main(String[] args) throws IOException {
|
public static void main(String[] args) throws IOException {
|
||||||
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morphology/rgramtab.tab");
|
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
|
||||||
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());
|
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());
|
||||||
|
|
||||||
RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
|
RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
purchases purchas
|
purchases purchase
|
||||||
existing exist
|
existing exist
|
||||||
was be
|
was be
|
||||||
men man
|
men man
|
||||||
|
@ -18,6 +18,7 @@ package org.apache.lucene.morphology;
|
|||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
|
||||||
@ -102,7 +103,7 @@ public class Morphology {
|
|||||||
int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1);
|
int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1);
|
||||||
if (i3 != 0) return i3;
|
if (i3 != 0) return i3;
|
||||||
}
|
}
|
||||||
return i2.length - i1.length;
|
return i1.length - i2.length;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void writeToFile(String fileName) throws IOException {
|
public void writeToFile(String fileName) throws IOException {
|
||||||
@ -186,6 +187,7 @@ public class Morphology {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private void readSeparators(BufferedReader bufferedReader, Integer amount) throws IOException {
|
private void readSeparators(BufferedReader bufferedReader, Integer amount) throws IOException {
|
||||||
|
HashSet intetger = new HashSet<Integer>();
|
||||||
separators = new int[amount][];
|
separators = new int[amount][];
|
||||||
for (int i = 0; i < amount; i++) {
|
for (int i = 0; i < amount; i++) {
|
||||||
String s1 = bufferedReader.readLine();
|
String s1 = bufferedReader.readLine();
|
||||||
@ -194,6 +196,7 @@ public class Morphology {
|
|||||||
for (int j = 0; j < wordLenght; j++) {
|
for (int j = 0; j < wordLenght; j++) {
|
||||||
separators[i][j] = Integer.valueOf(bufferedReader.readLine());
|
separators[i][j] = Integer.valueOf(bufferedReader.readLine());
|
||||||
}
|
}
|
||||||
|
intetger.add(separators[i][0]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -30,14 +30,15 @@ import java.util.ArrayList;
|
|||||||
*/
|
*/
|
||||||
public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
|
public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
|
||||||
public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
|
public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
|
||||||
static public int SUFFIX_LENGTH = 6;
|
public static final int WORD_PART_LENGHT = 6;
|
||||||
public static final int EE_CHAR = 34;
|
public static final int EE_CHAR = 34;
|
||||||
public static final int E_CHAR = 6;
|
public static final int E_CHAR = 6;
|
||||||
public static final int DASH_CHAR = 45;
|
public static final int DASH_CHAR = 45;
|
||||||
public static final int DASH_CODE = 33;
|
public static final int DASH_CODE = 33;
|
||||||
|
|
||||||
public Integer encode(String string) {
|
public Integer encode(String string) {
|
||||||
if (string.length() > 6) throw new SuffixToLongException("Suffix length should not be greater then " + 12);
|
if (string.length() > WORD_PART_LENGHT)
|
||||||
|
throw new SuffixToLongException("Suffix length should not be greater then " + WORD_PART_LENGHT + " " + string);
|
||||||
int result = 0;
|
int result = 0;
|
||||||
for (int i = 0; i < string.length(); i++) {
|
for (int i = 0; i < string.length(); i++) {
|
||||||
int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET;
|
int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET;
|
||||||
@ -49,7 +50,7 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
|
|||||||
throw new WrongCharaterException("Symblo " + string.charAt(i) + " is not small cirillic letter");
|
throw new WrongCharaterException("Symblo " + string.charAt(i) + " is not small cirillic letter");
|
||||||
result = result * 34 + c;
|
result = result * 34 + c;
|
||||||
}
|
}
|
||||||
for (int i = string.length(); i < 6; i++) {
|
for (int i = string.length(); i < WORD_PART_LENGHT; i++) {
|
||||||
result *= 34;
|
result *= 34;
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
@ -57,9 +58,9 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
|
|||||||
|
|
||||||
public int[] encodeToArray(String s) {
|
public int[] encodeToArray(String s) {
|
||||||
ArrayList<Integer> integers = new ArrayList<Integer>();
|
ArrayList<Integer> integers = new ArrayList<Integer>();
|
||||||
while (s.length() > 6) {
|
while (s.length() > WORD_PART_LENGHT) {
|
||||||
integers.add(encode(s.substring(0, 6)));
|
integers.add(encode(s.substring(0, WORD_PART_LENGHT)));
|
||||||
s = s.substring(6);
|
s = s.substring(WORD_PART_LENGHT);
|
||||||
}
|
}
|
||||||
integers.add(encode(s));
|
integers.add(encode(s));
|
||||||
int[] ints = new int[integers.size()];
|
int[] ints = new int[integers.size()];
|
||||||
@ -116,6 +117,6 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public String cleanString(String s) {
|
public String cleanString(String s) {
|
||||||
return s.replace((char) (34 + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (6 + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET));
|
return s.replace((char) (EE_CHAR + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (E_CHAR + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -35,6 +35,22 @@ public class RussianLetterDecoderEncoderTest {
|
|||||||
decoderEncoder = new RussianLetterDecoderEncoder();
|
decoderEncoder = new RussianLetterDecoderEncoder();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testShouldPreserStringComporision() throws IOException {
|
||||||
|
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-monotonic.txt");
|
||||||
|
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||||
|
String s = bufferedReader.readLine();
|
||||||
|
while (s != null) {
|
||||||
|
String[] qa = s.trim().split(" ");
|
||||||
|
if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT && qa[1].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) {
|
||||||
|
assertThat(decoderEncoder.encode(qa[1]) > decoderEncoder.encode(qa[0]), equalTo(true));
|
||||||
|
}
|
||||||
|
s = bufferedReader.readLine();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testShouldCorretDecodeEncode() throws IOException {
|
public void testShouldCorretDecodeEncode() throws IOException {
|
||||||
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data.txt");
|
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data.txt");
|
||||||
@ -42,8 +58,10 @@ public class RussianLetterDecoderEncoderTest {
|
|||||||
String s = bufferedReader.readLine();
|
String s = bufferedReader.readLine();
|
||||||
while (s != null) {
|
while (s != null) {
|
||||||
String[] qa = s.trim().split(" ");
|
String[] qa = s.trim().split(" ");
|
||||||
|
if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) {
|
||||||
Integer ecodedSuffix = decoderEncoder.encode(qa[0]);
|
Integer ecodedSuffix = decoderEncoder.encode(qa[0]);
|
||||||
assertThat(decoderEncoder.decode(ecodedSuffix), equalTo(qa[1]));
|
assertThat(decoderEncoder.decode(ecodedSuffix), equalTo(qa[1]));
|
||||||
|
}
|
||||||
s = bufferedReader.readLine();
|
s = bufferedReader.readLine();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -11,3 +11,5 @@
|
|||||||
аааааааааааааааааа аааааааааааааааааа
|
аааааааааааааааааа аааааааааааааааааа
|
||||||
ааааааааааааааааа ааааааааааааааааа
|
ааааааааааааааааа ааааааааааааааааа
|
||||||
йфячыцувс йфячыцувс
|
йфячыцувс йфячыцувс
|
||||||
|
ёёё еее
|
||||||
|
ёёёе ееее
|
@ -1,3 +1,5 @@
|
|||||||
|
яяя яяя
|
||||||
|
юяю юяю
|
||||||
тест тест
|
тест тест
|
||||||
ёж еж
|
ёж еж
|
||||||
естера естера
|
естера естера
|
||||||
|
@ -0,0 +1,7 @@
|
|||||||
|
а аа
|
||||||
|
ааа ббб
|
||||||
|
ммм нннн
|
||||||
|
ммм ммн
|
||||||
|
аа ба
|
||||||
|
ииа к
|
||||||
|
удд уде
|
@ -11,7 +11,7 @@
|
|||||||
пушек пушка
|
пушек пушка
|
||||||
козлов козлов козловый козел
|
козлов козлов козловый козел
|
||||||
жуков жуков жук
|
жуков жуков жук
|
||||||
красив красить
|
красив красить красивый
|
||||||
красивая красивый
|
красивая красивый
|
||||||
тосклив тоскливый
|
тосклив тоскливый
|
||||||
лучший хороший
|
лучший хороший
|
||||||
|
Loading…
x
Reference in New Issue
Block a user