Fix incorrect search for rule id
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@84 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
		| @@ -28,7 +28,7 @@ import java.util.HashSet; | |||||||
| public class EnglishHeuristicBuilder { | public class EnglishHeuristicBuilder { | ||||||
|     public static void main(String[] args) throws IOException { |     public static void main(String[] args) throws IOException { | ||||||
|  |  | ||||||
|         GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morphology/egramtab.tab"); |         GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/egramtab.tab"); | ||||||
|         DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>()); |         DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>()); | ||||||
|  |  | ||||||
|         EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder(); |         EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder(); | ||||||
|   | |||||||
| @@ -27,7 +27,7 @@ import java.util.HashSet; | |||||||
|  |  | ||||||
| public class RussianHeuristicBuilder { | public class RussianHeuristicBuilder { | ||||||
|     public static void main(String[] args) throws IOException { |     public static void main(String[] args) throws IOException { | ||||||
|         GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morphology/rgramtab.tab"); |         GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab"); | ||||||
|         DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>()); |         DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>()); | ||||||
|  |  | ||||||
|         RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder(); |         RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder(); | ||||||
|   | |||||||
| @@ -1,4 +1,4 @@ | |||||||
| purchases purchas | purchases purchase | ||||||
| existing exist | existing exist | ||||||
| was be | was be | ||||||
| men man | men man | ||||||
|   | |||||||
| @@ -18,6 +18,7 @@ package org.apache.lucene.morphology; | |||||||
|  |  | ||||||
| import java.io.*; | import java.io.*; | ||||||
| import java.util.ArrayList; | import java.util.ArrayList; | ||||||
|  | import java.util.HashSet; | ||||||
| import java.util.List; | import java.util.List; | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -102,7 +103,7 @@ public class Morphology { | |||||||
|             int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1); |             int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1); | ||||||
|             if (i3 != 0) return i3; |             if (i3 != 0) return i3; | ||||||
|         } |         } | ||||||
|         return i2.length - i1.length; |         return i1.length - i2.length; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     public void writeToFile(String fileName) throws IOException { |     public void writeToFile(String fileName) throws IOException { | ||||||
| @@ -186,6 +187,7 @@ public class Morphology { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     private void readSeparators(BufferedReader bufferedReader, Integer amount) throws IOException { |     private void readSeparators(BufferedReader bufferedReader, Integer amount) throws IOException { | ||||||
|  |         HashSet intetger = new HashSet<Integer>(); | ||||||
|         separators = new int[amount][]; |         separators = new int[amount][]; | ||||||
|         for (int i = 0; i < amount; i++) { |         for (int i = 0; i < amount; i++) { | ||||||
|             String s1 = bufferedReader.readLine(); |             String s1 = bufferedReader.readLine(); | ||||||
| @@ -194,6 +196,7 @@ public class Morphology { | |||||||
|             for (int j = 0; j < wordLenght; j++) { |             for (int j = 0; j < wordLenght; j++) { | ||||||
|                 separators[i][j] = Integer.valueOf(bufferedReader.readLine()); |                 separators[i][j] = Integer.valueOf(bufferedReader.readLine()); | ||||||
|             } |             } | ||||||
|  |             intetger.add(separators[i][0]); | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -30,14 +30,15 @@ import java.util.ArrayList; | |||||||
|  */ |  */ | ||||||
| public class RussianLetterDecoderEncoder implements LetterDecoderEncoder { | public class RussianLetterDecoderEncoder implements LetterDecoderEncoder { | ||||||
|     public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071; |     public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071; | ||||||
|     static public int SUFFIX_LENGTH = 6; |     public static final int WORD_PART_LENGHT = 6; | ||||||
|     public static final int EE_CHAR = 34; |     public static final int EE_CHAR = 34; | ||||||
|     public static final int E_CHAR = 6; |     public static final int E_CHAR = 6; | ||||||
|     public static final int DASH_CHAR = 45; |     public static final int DASH_CHAR = 45; | ||||||
|     public static final int DASH_CODE = 33; |     public static final int DASH_CODE = 33; | ||||||
|  |  | ||||||
|     public Integer encode(String string) { |     public Integer encode(String string) { | ||||||
|         if (string.length() > 6) throw new SuffixToLongException("Suffix length should not be greater then " + 12); |         if (string.length() > WORD_PART_LENGHT) | ||||||
|  |             throw new SuffixToLongException("Suffix length should not be greater then " + WORD_PART_LENGHT + " " + string); | ||||||
|         int result = 0; |         int result = 0; | ||||||
|         for (int i = 0; i < string.length(); i++) { |         for (int i = 0; i < string.length(); i++) { | ||||||
|             int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET; |             int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET; | ||||||
| @@ -49,7 +50,7 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder { | |||||||
|                 throw new WrongCharaterException("Symblo " + string.charAt(i) + " is not small cirillic letter"); |                 throw new WrongCharaterException("Symblo " + string.charAt(i) + " is not small cirillic letter"); | ||||||
|             result = result * 34 + c; |             result = result * 34 + c; | ||||||
|         } |         } | ||||||
|         for (int i = string.length(); i < 6; i++) { |         for (int i = string.length(); i < WORD_PART_LENGHT; i++) { | ||||||
|             result *= 34; |             result *= 34; | ||||||
|         } |         } | ||||||
|         return result; |         return result; | ||||||
| @@ -57,9 +58,9 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder { | |||||||
|  |  | ||||||
|     public int[] encodeToArray(String s) { |     public int[] encodeToArray(String s) { | ||||||
|         ArrayList<Integer> integers = new ArrayList<Integer>(); |         ArrayList<Integer> integers = new ArrayList<Integer>(); | ||||||
|         while (s.length() > 6) { |         while (s.length() > WORD_PART_LENGHT) { | ||||||
|             integers.add(encode(s.substring(0, 6))); |             integers.add(encode(s.substring(0, WORD_PART_LENGHT))); | ||||||
|             s = s.substring(6); |             s = s.substring(WORD_PART_LENGHT); | ||||||
|         } |         } | ||||||
|         integers.add(encode(s)); |         integers.add(encode(s)); | ||||||
|         int[] ints = new int[integers.size()]; |         int[] ints = new int[integers.size()]; | ||||||
| @@ -116,6 +117,6 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     public String cleanString(String s) { |     public String cleanString(String s) { | ||||||
|         return s.replace((char) (34 + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (6 + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET)); |         return s.replace((char) (EE_CHAR + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (E_CHAR + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET)); | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -35,6 +35,22 @@ public class RussianLetterDecoderEncoderTest { | |||||||
|         decoderEncoder = new RussianLetterDecoderEncoder(); |         decoderEncoder = new RussianLetterDecoderEncoder(); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     @Test | ||||||
|  |     public void testShouldPreserStringComporision() throws IOException { | ||||||
|  |         InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-monotonic.txt"); | ||||||
|  |         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); | ||||||
|  |         String s = bufferedReader.readLine(); | ||||||
|  |         while (s != null) { | ||||||
|  |             String[] qa = s.trim().split(" "); | ||||||
|  |             if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT && qa[1].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) { | ||||||
|  |                 assertThat(decoderEncoder.encode(qa[1]) > decoderEncoder.encode(qa[0]), equalTo(true)); | ||||||
|  |             } | ||||||
|  |             s = bufferedReader.readLine(); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|     @Test |     @Test | ||||||
|     public void testShouldCorretDecodeEncode() throws IOException { |     public void testShouldCorretDecodeEncode() throws IOException { | ||||||
|         InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data.txt"); |         InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data.txt"); | ||||||
| @@ -42,8 +58,10 @@ public class RussianLetterDecoderEncoderTest { | |||||||
|         String s = bufferedReader.readLine(); |         String s = bufferedReader.readLine(); | ||||||
|         while (s != null) { |         while (s != null) { | ||||||
|             String[] qa = s.trim().split(" "); |             String[] qa = s.trim().split(" "); | ||||||
|             Integer ecodedSuffix = decoderEncoder.encode(qa[0]); |             if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) { | ||||||
|             assertThat(decoderEncoder.decode(ecodedSuffix), equalTo(qa[1])); |                 Integer ecodedSuffix = decoderEncoder.encode(qa[0]); | ||||||
|  |                 assertThat(decoderEncoder.decode(ecodedSuffix), equalTo(qa[1])); | ||||||
|  |             } | ||||||
|             s = bufferedReader.readLine(); |             s = bufferedReader.readLine(); | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -11,3 +11,5 @@ | |||||||
| аааааааааааааааааа аааааааааааааааааа | аааааааааааааааааа аааааааааааааааааа | ||||||
| ааааааааааааааааа ааааааааааааааааа | ааааааааааааааааа ааааааааааааааааа | ||||||
| йфячыцувс йфячыцувс | йфячыцувс йфячыцувс | ||||||
|  | ёёё еее | ||||||
|  | ёёёе ееее | ||||||
| @@ -1,3 +1,5 @@ | |||||||
|  | яяя яяя | ||||||
|  | юяю юяю | ||||||
| тест тест | тест тест | ||||||
| ёж еж | ёж еж | ||||||
| естера естера | естера естера | ||||||
|   | |||||||
| @@ -0,0 +1,7 @@ | |||||||
|  | а аа | ||||||
|  | ааа ббб | ||||||
|  | ммм нннн | ||||||
|  | ммм ммн | ||||||
|  | аа ба | ||||||
|  | ииа к | ||||||
|  | удд уде | ||||||
| @@ -11,7 +11,7 @@ | |||||||
| пушек пушка | пушек пушка | ||||||
| козлов козлов козловый козел | козлов козлов козловый козел | ||||||
| жуков жуков жук | жуков жуков жук | ||||||
| красив красить | красив красить красивый | ||||||
| красивая красивый | красивая красивый | ||||||
| тосклив тоскливый | тосклив тоскливый | ||||||
| лучший хороший | лучший хороший | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 alexander.a.kuznetsov
					alexander.a.kuznetsov