test for suffix evristics
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@11 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
parent
585d43877a
commit
48ae7d4cb6
@ -4,15 +4,15 @@ package org.apache.lucene.russian.morphology;
|
|||||||
* This helper class allow encode suffix of russian word
|
* This helper class allow encode suffix of russian word
|
||||||
* to long value and decode from it.
|
* to long value and decode from it.
|
||||||
* Assumed that suffix contains only small russian letters and dash.
|
* Assumed that suffix contains only small russian letters and dash.
|
||||||
* Also assumed that the letters е and ё coincide.
|
* Also assumed that the letters е and ё coincide.
|
||||||
*/
|
*/
|
||||||
public class RussianSuffixDecoderEncoder {
|
public class RussianSuffixDecoderEncoder {
|
||||||
public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
|
public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
|
||||||
public static final int SUFFIX_LENGTH = 7;
|
public static final int SUFFIX_LENGTH = 7;
|
||||||
private static final int EE_CHAR = 34;
|
public static final int EE_CHAR = 34;
|
||||||
private static final int E_CHAR = 6;
|
public static final int E_CHAR = 6;
|
||||||
private static final int DASH_CHAR = 45;
|
public static final int DASH_CHAR = 45;
|
||||||
private static final int DASH_CODE = 33;
|
public static final int DASH_CODE = 33;
|
||||||
|
|
||||||
|
|
||||||
static public Long encode(String string) {
|
static public Long encode(String string) {
|
||||||
@ -43,4 +43,13 @@ public class RussianSuffixDecoderEncoder {
|
|||||||
result = (char) c + result;
|
result = (char) c + result;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static public boolean checkCharacter(char c){
|
||||||
|
int code = 0 + c;
|
||||||
|
if(code == 45) return true;
|
||||||
|
code -= RUSSIAN_SMALL_LETTER_OFFSET;
|
||||||
|
if(code == 34) return true;
|
||||||
|
if(code > 0 && code < 33) return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -4,6 +4,7 @@ import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
|||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.util.HashSet;
|
||||||
|
|
||||||
|
|
||||||
public class SuffixEvristics {
|
public class SuffixEvristics {
|
||||||
@ -43,7 +44,11 @@ public class SuffixEvristics {
|
|||||||
|
|
||||||
public String getCanonicalForm(String form) {
|
public String getCanonicalForm(String form) {
|
||||||
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
|
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
|
||||||
Long suffix = RussianSuffixDecoderEncoder.encode(form.substring(startSymbol));
|
String suffixS = form.substring(startSymbol);
|
||||||
|
|
||||||
|
if(!chechSuffix(suffixS)) return form;
|
||||||
|
|
||||||
|
Long suffix = RussianSuffixDecoderEncoder.encode(suffixS);
|
||||||
|
|
||||||
int index = Arrays.binarySearch(keys,suffix);
|
int index = Arrays.binarySearch(keys,suffix);
|
||||||
if(index < -1){
|
if(index < -1){
|
||||||
@ -54,4 +59,14 @@ public class SuffixEvristics {
|
|||||||
return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
|
return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean chechSuffix(String suffix){
|
||||||
|
for(int i = 0; i < suffix.length(); i++){
|
||||||
|
if (!RussianSuffixDecoderEncoder.checkCharacter(suffix.charAt(i))) return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -16,7 +16,7 @@ public class RussianSuffixDecoderEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
public void testShouldCorretDecodeEncode() throws IOException {
|
public void testShouldCorretDecodeEncode() throws IOException {
|
||||||
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/decoder-test-data.txt");
|
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/decoder-test-data.txt");
|
||||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream));
|
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream,"UTF-8"));
|
||||||
String s = bufferedReader.readLine();
|
String s = bufferedReader.readLine();
|
||||||
while(s != null){
|
while(s != null){
|
||||||
String[] qa = s.trim().split(" ");
|
String[] qa = s.trim().split(" ");
|
||||||
|
@ -1,12 +0,0 @@
|
|||||||
package org.apache.lucene.russian.morphology.analayzer;
|
|
||||||
|
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
|
|
||||||
public class ArrayEvristicsTest {
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testShouldDefineCorretCononicalWordForm(){
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
@ -0,0 +1,25 @@
|
|||||||
|
package org.apache.lucene.russian.morphology.analayzer;
|
||||||
|
|
||||||
|
import org.junit.Test;
|
||||||
|
import static org.junit.Assert.assertThat;
|
||||||
|
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
||||||
|
import static org.hamcrest.core.IsEqual.equalTo;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
|
||||||
|
|
||||||
|
public class SuffixEvristicsTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testShouldDefineCorretCononicalWordForm() throws IOException {
|
||||||
|
SuffixEvristics suffixEvristics = new SuffixEvristics();
|
||||||
|
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt");
|
||||||
|
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream,"UTF-8"));
|
||||||
|
String s = bufferedReader.readLine();
|
||||||
|
while(s != null){
|
||||||
|
String[] qa = s.trim().split(" ");
|
||||||
|
assertThat(suffixEvristics.getCanonicalForm(qa[0]),equalTo(qa[1]));
|
||||||
|
s = bufferedReader.readLine();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,7 @@
|
|||||||
|
шел идти
|
||||||
|
турестических турестический
|
||||||
|
отзывы отзыв
|
||||||
|
победы победа
|
||||||
|
поэтическая поэтический
|
||||||
|
произошло произойти
|
||||||
|
test test
|
@ -1,4 +1,4 @@
|
|||||||
тест тест
|
тест тест
|
||||||
ёж еж
|
ёж еж
|
||||||
тестера тестера
|
тестера тестера
|
||||||
что-то что-то
|
что-то что-то
|
Loading…
x
Reference in New Issue
Block a user