test for suffix evristics

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@11 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
alexander.a.kuznetsov 2009-04-13 14:04:53 +00:00
parent 585d43877a
commit 48ae7d4cb6
7 changed files with 67 additions and 23 deletions

View File

@ -4,15 +4,15 @@ package org.apache.lucene.russian.morphology;
* This helper class encodes the suffix of a Russian word * This helper class encodes the suffix of a Russian word
* into a long value and decodes it back. * into a long value and decodes it back.
* It is assumed that the suffix contains only lowercase Russian letters and dashes. * It is assumed that the suffix contains only lowercase Russian letters and dashes.
* It is also assumed that the letters е and ё coincide. * It is also assumed that the letters е and ё coincide.
*/ */
public class RussianSuffixDecoderEncoder { public class RussianSuffixDecoderEncoder {
public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071; public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
public static final int SUFFIX_LENGTH = 7; public static final int SUFFIX_LENGTH = 7;
private static final int EE_CHAR = 34; public static final int EE_CHAR = 34;
private static final int E_CHAR = 6; public static final int E_CHAR = 6;
private static final int DASH_CHAR = 45; public static final int DASH_CHAR = 45;
private static final int DASH_CODE = 33; public static final int DASH_CODE = 33;
static public Long encode(String string) { static public Long encode(String string) {
@ -43,4 +43,13 @@ public class RussianSuffixDecoderEncoder {
result = (char) c + result; result = (char) c + result;
return result; return result;
} }
/**
 * Reports whether a single character passes the character check of this class.
 *
 * @param c character to test
 * @return {@code true} when the character code is 45 (the value declared for DASH_CHAR),
 *         or when the code reduced by RUSSIAN_SMALL_LETTER_OFFSET equals 34 (the value
 *         declared for EE_CHAR) or lies in the range 1..32; {@code false} otherwise
 */
static public boolean checkCharacter(char c){
    // Code 45 is tested on the raw character code, before any offset is applied.
    if (c == 45) {
        return true;
    }
    final int shifted = c - RUSSIAN_SMALL_LETTER_OFFSET;
    // 34 is tested separately because it lies outside the contiguous 1..32 range.
    return shifted == 34 || (shifted >= 1 && shifted <= 32);
}
} }

View File

@ -4,6 +4,7 @@ import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import java.io.*; import java.io.*;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashSet;
public class SuffixEvristics { public class SuffixEvristics {
@ -43,7 +44,11 @@ public class SuffixEvristics {
public String getCanonicalForm(String form) { public String getCanonicalForm(String form) {
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0; int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
Long suffix = RussianSuffixDecoderEncoder.encode(form.substring(startSymbol)); String suffixS = form.substring(startSymbol);
if(!chechSuffix(suffixS)) return form;
Long suffix = RussianSuffixDecoderEncoder.encode(suffixS);
int index = Arrays.binarySearch(keys,suffix); int index = Arrays.binarySearch(keys,suffix);
if(index < -1){ if(index < -1){
@ -54,4 +59,14 @@ public class SuffixEvristics {
return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix; return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
} }
} }
/**
 * Checks every character of the given suffix with
 * {@code RussianSuffixDecoderEncoder.checkCharacter}.
 *
 * @param suffix string whose characters are checked one by one
 * @return {@code false} as soon as one character is rejected; {@code true} when
 *         no character is rejected (which includes the empty string)
 */
private boolean chechSuffix(String suffix){
    for (char symbol : suffix.toCharArray()) {
        if (!RussianSuffixDecoderEncoder.checkCharacter(symbol)) {
            return false;
        }
    }
    return true;
}
} }

View File

@ -16,7 +16,7 @@ public class RussianSuffixDecoderEncoderTest {
@Test @Test
public void testShouldCorretDecodeEncode() throws IOException { public void testShouldCorretDecodeEncode() throws IOException {
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/decoder-test-data.txt"); InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/decoder-test-data.txt");
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream)); BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream,"UTF-8"));
String s = bufferedReader.readLine(); String s = bufferedReader.readLine();
while(s != null){ while(s != null){
String[] qa = s.trim().split(" "); String[] qa = s.trim().split(" ");

View File

@ -1,12 +0,0 @@
package org.apache.lucene.russian.morphology.analayzer;
import org.junit.Test;
/**
 * Placeholder test class: its only test method has an empty body and performs no checks.
 */
public class ArrayEvristicsTest {
// Empty test body: nothing is exercised or asserted here.
@Test
public void testShouldDefineCorretCononicalWordForm(){
}
}

View File

@ -0,0 +1,25 @@
package org.apache.lucene.russian.morphology.analayzer;
import org.junit.Test;
import static org.junit.Assert.assertThat;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import static org.hamcrest.core.IsEqual.equalTo;
import java.io.*;
/**
 * Data-driven test for {@code SuffixEvristics.getCanonicalForm}.
 *
 * Each line of the test-data resource is trimmed and split on a single space;
 * the first token is passed to {@code getCanonicalForm} and the result is
 * compared with the second token.
 */
public class SuffixEvristicsTest {
    /** Classpath location of the test data, read as UTF-8. */
    private static final String TEST_DATA =
            "/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt";

    /**
     * Runs every line of the test data through {@code getCanonicalForm}.
     *
     * @throws IOException if the test data is missing from the classpath or cannot be read
     */
    @Test
    public void testShouldDefineCorretCononicalWordForm() throws IOException {
        SuffixEvristics suffixEvristics = new SuffixEvristics();
        InputStream stream = this.getClass().getResourceAsStream(TEST_DATA);
        // getResourceAsStream returns null for a missing resource; fail with a clear
        // message instead of a NullPointerException inside InputStreamReader.
        if (stream == null) {
            throw new FileNotFoundException("Test data resource not found on classpath: " + TEST_DATA);
        }
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
        try {
            String s = bufferedReader.readLine();
            while (s != null) {
                String[] qa = s.trim().split(" ");
                assertThat(suffixEvristics.getCanonicalForm(qa[0]), equalTo(qa[1]));
                s = bufferedReader.readLine();
            }
        } finally {
            // Closing the reader also closes the underlying resource stream,
            // even when an assertion fails part-way through the data.
            bufferedReader.close();
        }
    }
}

View File

@ -0,0 +1,7 @@
шел идти
турестических турестический
отзывы отзыв
победы победа
поэтическая поэтический
произошло произойти
test test

View File

@ -1,4 +1,4 @@
тест тест тест тест
ёж еж ёж еж
тестера тестера тестера тестера
что-то что-то что-то что-то