test for suffix evristics
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@11 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
parent
585d43877a
commit
48ae7d4cb6
@ -4,15 +4,15 @@ package org.apache.lucene.russian.morphology;
|
||||
* This helper class encodes the suffix of a Russian word
|
||||
* into a long value and decodes it back.
|
||||
* It is assumed that the suffix contains only lowercase Russian letters and the dash.
|
||||
* It is also assumed that the letters е and ё coincide.
|
||||
* It is also assumed that the letters е and ё coincide.
|
||||
*/
|
||||
public class RussianSuffixDecoderEncoder {
|
||||
public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
|
||||
public static final int SUFFIX_LENGTH = 7;
|
||||
private static final int EE_CHAR = 34;
|
||||
private static final int E_CHAR = 6;
|
||||
private static final int DASH_CHAR = 45;
|
||||
private static final int DASH_CODE = 33;
|
||||
public static final int EE_CHAR = 34;
|
||||
public static final int E_CHAR = 6;
|
||||
public static final int DASH_CHAR = 45;
|
||||
public static final int DASH_CODE = 33;
|
||||
|
||||
|
||||
static public Long encode(String string) {
|
||||
@ -43,4 +43,13 @@ public class RussianSuffixDecoderEncoder {
|
||||
result = (char) c + result;
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 * Tests whether a single character passes the checks below: it is accepted
 * when it is the dash (code 45), or when its code minus
 * RUSSIAN_SMALL_LETTER_OFFSET is 34 or lies in the range 1..32 inclusive.
 *
 * @param c the character to test
 * @return true if one of the checks below matches, false otherwise
 */
static public boolean checkCharacter(char c){
|
||||
// Widen the char to an int so it can be compared and shifted numerically.
int code = 0 + c;
|
||||
// 45 is the same value as the DASH_CHAR constant of this class ('-').
if(code == 45) return true;
|
||||
// From here on, code is relative to RUSSIAN_SMALL_LETTER_OFFSET (1071).
code -= RUSSIAN_SMALL_LETTER_OFFSET;
|
||||
// 34 is the same value as the EE_CHAR constant (1071 + 34 = U+0451).
if(code == 34) return true;
|
||||
// Relative codes 1..32 correspond to U+0430..U+044F.
if(code > 0 && code < 33) return true;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -4,6 +4,7 @@ import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
|
||||
|
||||
public class SuffixEvristics {
|
||||
@ -43,7 +44,11 @@ public class SuffixEvristics {
|
||||
|
||||
public String getCanonicalForm(String form) {
|
||||
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
|
||||
Long suffix = RussianSuffixDecoderEncoder.encode(form.substring(startSymbol));
|
||||
String suffixS = form.substring(startSymbol);
|
||||
|
||||
if(!chechSuffix(suffixS)) return form;
|
||||
|
||||
Long suffix = RussianSuffixDecoderEncoder.encode(suffixS);
|
||||
|
||||
int index = Arrays.binarySearch(keys,suffix);
|
||||
if(index < -1){
|
||||
@ -54,4 +59,14 @@ public class SuffixEvristics {
|
||||
return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns true only if every character of the given suffix is accepted by
 * RussianSuffixDecoderEncoder.checkCharacter. An empty string yields true
 * because the loop body never runs.
 *
 * NOTE(review): the method name looks like a typo for "checkSuffix"; it is
 * left unchanged here because getCanonicalForm calls it by this name.
 *
 * @param suffix the suffix to validate
 * @return false as soon as one character is rejected, true otherwise
 */
private boolean chechSuffix(String suffix){
|
||||
for(int i = 0; i < suffix.length(); i++){
|
||||
// Stop at the first character that checkCharacter rejects.
if (!RussianSuffixDecoderEncoder.checkCharacter(suffix.charAt(i))) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@ -16,7 +16,7 @@ public class RussianSuffixDecoderEncoderTest {
|
||||
@Test
|
||||
public void testShouldCorretDecodeEncode() throws IOException {
|
||||
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/decoder-test-data.txt");
|
||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream));
|
||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream,"UTF-8"));
|
||||
String s = bufferedReader.readLine();
|
||||
while(s != null){
|
||||
String[] qa = s.trim().split(" ");
|
||||
|
@ -1,12 +0,0 @@
|
||||
package org.apache.lucene.russian.morphology.analayzer;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
|
||||
/**
 * Placeholder test class: its only test method has an empty body and
 * therefore asserts nothing.
 */
public class ArrayEvristicsTest {
|
||||
|
||||
@Test
|
||||
// Empty test body; contains no assertions.
public void testShouldDefineCorretCononicalWordForm(){
|
||||
|
||||
}
|
||||
}
|
@ -0,0 +1,25 @@
|
||||
package org.apache.lucene.russian.morphology.analayzer;
|
||||
|
||||
import org.junit.Test;
|
||||
import static org.junit.Assert.assertThat;
|
||||
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
||||
import static org.hamcrest.core.IsEqual.equalTo;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
|
||||
/**
 * Data-driven test for SuffixEvristics.getCanonicalForm: expectations are read
 * line by line from a classpath resource file.
 */
public class SuffixEvristicsTest {
|
||||
|
||||
@Test
|
||||
// For each line of the resource: split on a single space, pass the first
// token to getCanonicalForm and require the result to equal the second token.
public void testShouldDefineCorretCononicalWordForm() throws IOException {
|
||||
SuffixEvristics suffixEvristics = new SuffixEvristics();
|
||||
// NOTE(review): getResourceAsStream returns null when the resource is
// missing, and the stream/reader is never closed in this method.
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt");
|
||||
// The charset is given explicitly, so decoding does not depend on the platform default.
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream,"UTF-8"));
|
||||
String s = bufferedReader.readLine();
|
||||
// readLine() returns null at end of input, which ends the loop.
while(s != null){
|
||||
// qa[0] is the input word form, qa[1] the expected result.
// NOTE(review): a line without a space (e.g. a blank line) makes qa[1] throw
// ArrayIndexOutOfBoundsException.
String[] qa = s.trim().split(" ");
|
||||
assertThat(suffixEvristics.getCanonicalForm(qa[0]),equalTo(qa[1]));
|
||||
s = bufferedReader.readLine();
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,7 @@
|
||||
шел идти
|
||||
турестических турестический
|
||||
отзывы отзыв
|
||||
победы победа
|
||||
поэтическая поэтический
|
||||
произошло произойти
|
||||
test test
|
@ -1,4 +1,4 @@
|
||||
тест тест
|
||||
ёж еж
|
||||
тестера тестера
|
||||
что-то что-то
|
||||
тест тест
|
||||
ёж еж
|
||||
тестера тестера
|
||||
что-то что-то
|
Loading…
x
Reference in New Issue
Block a user