Test for suffix heuristics (SuffixEvristics)
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@11 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
		| @@ -4,15 +4,15 @@ package org.apache.lucene.russian.morphology; | ||||
|  * This helper class encodes the suffix of a Russian word | ||||
|  * into a long value and decodes it back. | ||||
|  * It is assumed that the suffix contains only lowercase Russian letters and the dash. | ||||
|  * Also assumed that the letters е and ё coincide. | ||||
|  * Also assumed that the letters е and ё coincide. | ||||
|  */ | ||||
| public class RussianSuffixDecoderEncoder { | ||||
|     public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071; | ||||
|     public static final int SUFFIX_LENGTH = 7; | ||||
|     private static final int EE_CHAR = 34; | ||||
|     private static final int E_CHAR = 6; | ||||
|     private static final int DASH_CHAR = 45; | ||||
|     private static final int DASH_CODE = 33; | ||||
|     public static final int EE_CHAR = 34; | ||||
|     public static final int E_CHAR = 6; | ||||
|     public static final int DASH_CHAR = 45; | ||||
|     public static final int DASH_CODE = 33; | ||||
|  | ||||
|  | ||||
|     static public Long encode(String string) { | ||||
| @@ -43,4 +43,13 @@ public class RussianSuffixDecoderEncoder { | ||||
|         result = (char) c + result; | ||||
|         return result; | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Tells whether a character is acceptable inside an encodable suffix. | ||||
|      * <p> | ||||
|      * A character is accepted when it is the dash (code 45), when its code | ||||
|      * shifted down by {@link #RUSSIAN_SMALL_LETTER_OFFSET} equals 34, or when | ||||
|      * that shifted code lies in the range 1..32 inclusive. | ||||
|      * | ||||
|      * @param c character to test | ||||
|      * @return {@code true} if the character is accepted, {@code false} otherwise | ||||
|      */ | ||||
|     static public boolean checkCharacter(char c) { | ||||
|         // The dash is compared on its raw code, before any shifting. | ||||
|         if (c == DASH_CHAR) { | ||||
|             return true; | ||||
|         } | ||||
|         int shifted = c - RUSSIAN_SMALL_LETTER_OFFSET; | ||||
|         return shifted == EE_CHAR || (shifted >= 1 && shifted <= 32); | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -4,6 +4,7 @@ import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder; | ||||
|  | ||||
| import java.io.*; | ||||
| import java.util.Arrays; | ||||
| import java.util.HashSet; | ||||
|  | ||||
|  | ||||
| public class SuffixEvristics { | ||||
| @@ -43,7 +44,11 @@ public class SuffixEvristics { | ||||
|  | ||||
|     public String getCanonicalForm(String form) { | ||||
|         int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0; | ||||
|         Long suffix = RussianSuffixDecoderEncoder.encode(form.substring(startSymbol)); | ||||
|         String suffixS = form.substring(startSymbol); | ||||
|  | ||||
|         if(!chechSuffix(suffixS)) return form; | ||||
|  | ||||
|         Long suffix = RussianSuffixDecoderEncoder.encode(suffixS); | ||||
|  | ||||
|         int index = Arrays.binarySearch(keys,suffix); | ||||
|         if(index < -1){ | ||||
| @@ -54,4 +59,14 @@ public class SuffixEvristics { | ||||
|             return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|  | ||||
|     /** | ||||
|      * Verifies that every character of the given suffix is accepted by | ||||
|      * {@link RussianSuffixDecoderEncoder#checkCharacter(char)}. | ||||
|      * | ||||
|      * @param suffix text to inspect | ||||
|      * @return {@code false} as soon as one character is rejected, | ||||
|      *         {@code true} otherwise (including for an empty string) | ||||
|      */ | ||||
|     private boolean chechSuffix(String suffix) { | ||||
|         for (char symbol : suffix.toCharArray()) { | ||||
|             boolean accepted = RussianSuffixDecoderEncoder.checkCharacter(symbol); | ||||
|             if (!accepted) { | ||||
|                 return false; | ||||
|             } | ||||
|         } | ||||
|         return true; | ||||
|     } | ||||
|  | ||||
|  | ||||
| } | ||||
|   | ||||
| @@ -16,7 +16,7 @@ public class RussianSuffixDecoderEncoderTest { | ||||
|     @Test | ||||
|     public void testShouldCorretDecodeEncode() throws IOException { | ||||
|        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/decoder-test-data.txt"); | ||||
|        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream)); | ||||
|        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream,"UTF-8")); | ||||
|        String s = bufferedReader.readLine(); | ||||
|         while(s != null){ | ||||
|             String[] qa = s.trim().split(" "); | ||||
|   | ||||
| @@ -1,12 +0,0 @@ | ||||
| package org.apache.lucene.russian.morphology.analayzer; | ||||
|  | ||||
| import org.junit.Test; | ||||
|  | ||||
|  | ||||
| /** | ||||
|  * Test class whose single test method has an empty body and verifies nothing. | ||||
|  */ | ||||
| public class ArrayEvristicsTest { | ||||
|  | ||||
|     /** Empty test body: no code is executed and no assertion is made. */ | ||||
|     @Test | ||||
|     public void testShouldDefineCorretCononicalWordForm() { | ||||
|         // nothing to check | ||||
|     } | ||||
| } | ||||
| @@ -0,0 +1,25 @@ | ||||
| package org.apache.lucene.russian.morphology.analayzer; | ||||
|  | ||||
| import org.junit.Test; | ||||
| import static org.junit.Assert.assertThat; | ||||
| import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder; | ||||
| import static org.hamcrest.core.IsEqual.equalTo; | ||||
|  | ||||
| import java.io.*; | ||||
|  | ||||
|  | ||||
| public class SuffixEvristicsTest { | ||||
|  | ||||
|     @Test | ||||
|     public void testShouldDefineCorretCononicalWordForm() throws IOException { | ||||
|        SuffixEvristics suffixEvristics = new SuffixEvristics(); | ||||
|        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt"); | ||||
|        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream,"UTF-8")); | ||||
|        String s = bufferedReader.readLine(); | ||||
|         while(s != null){ | ||||
|             String[] qa = s.trim().split(" "); | ||||
|             assertThat(suffixEvristics.getCanonicalForm(qa[0]),equalTo(qa[1])); | ||||
|             s = bufferedReader.readLine(); | ||||
|         } | ||||
|     } | ||||
| } | ||||
| @@ -0,0 +1,7 @@ | ||||
| шел идти | ||||
| турестических турестический | ||||
| отзывы отзыв | ||||
| победы победа | ||||
| поэтическая поэтический | ||||
| произошло произойти | ||||
| test test | ||||
| @@ -1,4 +1,4 @@ | ||||
| тест тест | ||||
| ёж еж | ||||
| тестера тестера | ||||
| что-то что-то | ||||
| тест тест | ||||
| ёж еж | ||||
| тестера тестера | ||||
| что-то что-то | ||||
		Reference in New Issue
	
	Block a user
	 alexander.a.kuznetsov
					alexander.a.kuznetsov