adding tests for lucene analyzer

fixed problem with string checking

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@62 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
alexander.a.kuznetsov 2009-10-17 17:06:55 +00:00
parent 26ca704ec0
commit 8c833132a8
11 changed files with 196 additions and 35 deletions

View File

@ -1,26 +1,27 @@
/** /**
* Copyright 2009 Alexander Kuznetsov * Copyright 2009 Alexander Kuznetsov
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.apache.lucene.morphology.russian; package org.apache.lucene.morphology.english;
import org.junit.Test; import org.apache.lucene.morphology.analayzer.MorphlogyAnalayzer;
public class AnalayzerTest { import java.io.IOException;
@Test
public void shoudGetCorrentTokens() { public class EnglishAnalayzer extends MorphlogyAnalayzer {
public EnglishAnalayzer() throws IOException {
} super(EnglishAnalayzer.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder());
} }
}

View File

@ -0,0 +1,65 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.english;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import static org.hamcrest.Matchers.equalTo;
import static org.junit.Assert.assertThat;
import org.junit.Test;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.HashSet;
/**
 * Verifies that {@link EnglishAnalayzer} turns a sample English sentence into
 * the expected set of terms.
 *
 * The expected terms are read from the first line of the answer resource
 * (space-separated); the input text is read from the data resource. Both are
 * compared as sets, so term order and duplicates do not matter.
 */
public class EnglishAnalayzerTest {
    private static final String ANSWER_RESOURCE = "/org/apache/lucene/morphology/english/englsih-analayzer-answer.txt";
    private static final String DATA_RESOURCE = "/org/apache/lucene/morphology/english/englsih-analayzer-data.txt";

    /**
     * Runs the analyzer over the data resource and compares the produced terms
     * with the terms listed in the answer resource.
     *
     * @throws IOException if a resource is missing, empty or cannot be read
     */
    @Test
    public void shoudGiveCorretWords() throws IOException {
        HashSet<String> answer = readExpectedTerms(ANSWER_RESOURCE);
        EnglishAnalayzer morphlogyAnalayzer = new EnglishAnalayzer();

        HashSet<String> result = new HashSet<String>();
        InputStream stream = openResource(DATA_RESOURCE);
        try {
            InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
            TokenStream in = morphlogyAnalayzer.tokenStream(null, reader);
            final Token reusableToken = new Token();
            // next(...) returns null once the token stream is exhausted.
            for (Token nextToken = in.next(reusableToken); nextToken != null; nextToken = in.next(reusableToken)) {
                result.add(nextToken.term());
            }
        } finally {
            // Close even when the analyzer throws, so the stream is never leaked.
            stream.close();
        }

        assertThat(result, equalTo(answer));
    }

    /**
     * Reads the first line of the given classpath resource and splits it into
     * a set of terms on runs of spaces.
     *
     * @param path absolute classpath location of the answer file
     * @return the distinct terms found on the first line
     * @throws IOException if the resource is missing, empty or cannot be read
     */
    private HashSet<String> readExpectedTerms(String path) throws IOException {
        InputStream stream = openResource(path);
        try {
            BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
            String line = breader.readLine();
            if (line == null) {
                // readLine() returns null at end of stream; fail with a clear cause instead of an NPE.
                throw new IOException("Test resource is empty: " + path);
            }
            String[] strings = line.replaceAll(" +", " ").trim().split(" ");
            return new HashSet<String>(Arrays.asList(strings));
        } finally {
            stream.close();
        }
    }

    /**
     * Opens a classpath resource, failing with a descriptive error when it is absent.
     *
     * @param path absolute classpath location of the resource
     * @return an open stream the caller must close
     * @throws IOException if the resource is not on the classpath
     */
    private InputStream openResource(String path) throws IOException {
        InputStream stream = this.getClass().getResourceAsStream(path);
        if (stream == null) {
            // getResourceAsStream signals "not found" with null rather than an exception.
            throw new IOException("Test resource not found on classpath: " + path);
        }
        return stream;
    }
}

View File

@ -16,10 +16,17 @@
package org.apache.lucene.morphology.english; package org.apache.lucene.morphology.english;
import org.apache.lucene.morphology.LuceneMorph; import org.apache.lucene.morphology.LuceneMorph;
import static org.hamcrest.Matchers.equalTo;
import static org.junit.Assert.assertThat;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
import java.io.BufferedReader;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Set;
public class EnglishLuceneMorphTest { public class EnglishLuceneMorphTest {
private LuceneMorph luceneMorph; private LuceneMorph luceneMorph;
@ -31,11 +38,18 @@ public class EnglishLuceneMorphTest {
@Test @Test
public void shoudGetCorrentMorphInfo() throws IOException { public void shoudGetCorrentMorphInfo() throws IOException {
System.out.println(luceneMorph.getMorhInfo("purchases")); InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/english-morphology-test.txt");
System.out.println(luceneMorph.getMorhInfo("existing")); BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
System.out.println(luceneMorph.getMorhInfo("was")); String s = bufferedReader.readLine();
System.out.println(luceneMorph.getMorhInfo("men")); while (s != null) {
System.out.println(luceneMorph.getMorhInfo("bore")); String[] qa = s.trim().split(" ");
System.out.println(luceneMorph.getMorhInfo("came")); Set<String> result = new HashSet<String>();
for (int i = 1; i < qa.length; i++) {
result.add(qa[i]);
}
Set<String> stringList = new HashSet<String>(luceneMorph.getMorhInfo(qa[0]));
assertThat(stringList, equalTo(result));
s = bufferedReader.readLine();
}
} }
} }

View File

@ -0,0 +1,7 @@
purchases purchas
existing exist
was be
men man
bore bore bear
grown grow grown
came come

View File

@ -0,0 +1 @@
following follow the instruction exactly will be help ensure the best well good result

View File

@ -0,0 +1 @@
Following the instructions exactly will help ensure the best results

View File

@ -75,4 +75,8 @@ public class LuceneMorph extends Morph {
} }
return result.toArray(new Heuristic[result.size()]); return result.toArray(new Heuristic[result.size()]);
} }
/**
 * Asks the underlying decoder/encoder whether it accepts the given string.
 *
 * NOTE(review): presumably this reports whether {@code s} contains only
 * characters of the alphabet this morphology supports - confirm against the
 * decoder/encoder implementation.
 *
 * @param s the string to check
 * @return whatever {@code decoderEncoder.checkString(s)} returns
 */
public boolean checkString(String s) {
    final boolean accepted = decoderEncoder.checkString(s);
    return accepted;
}
} }

View File

@ -53,9 +53,7 @@ public class MorphlogyFilter extends TokenFilter {
Token nextToken = input.next(reusableToken); Token nextToken = input.next(reusableToken);
if (nextToken == null) return null; // EOS; iterator exhausted if (nextToken == null) return null; // EOS; iterator exhausted
Character testC = nextToken.term().charAt(0); if (!luceneMorph.checkString(nextToken.term())) {
//todo check here for decoder endocoder
if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC) {
return nextToken; return nextToken;
} }
stack = luceneMorph.getMorhInfo(nextToken.term()); stack = luceneMorph.getMorhInfo(nextToken.term());

View File

@ -0,0 +1,68 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.russian;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import static org.hamcrest.Matchers.equalTo;
import static org.junit.Assert.assertThat;
import org.junit.Test;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.HashSet;
/**
 * Verifies that {@link RussianAnalayzer} turns a sample Russian sentence into
 * the expected set of terms.
 *
 * The expected terms are read from the first line of the answer resource
 * (space-separated); the input text is read from the data resource. Both are
 * compared as sets, so term order and duplicates do not matter.
 */
public class RussianAnalayzerTest {
    private static final String ANSWER_RESOURCE = "/org/apache/lucene/morphology/russian/russian-analayzer-answer.txt";
    private static final String DATA_RESOURCE = "/org/apache/lucene/morphology/russian/russian-analayzer-data.txt";

    /**
     * Runs the analyzer over the data resource and compares the produced terms
     * with the terms listed in the answer resource.
     *
     * @throws IOException if a resource is missing, empty or cannot be read
     */
    @Test
    public void shoudGiveCorretWords() throws IOException {
        HashSet<String> answer = readExpectedTerms(ANSWER_RESOURCE);
        RussianAnalayzer morphlogyAnalayzer = new RussianAnalayzer();

        HashSet<String> result = new HashSet<String>();
        InputStream stream = openResource(DATA_RESOURCE);
        try {
            InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
            TokenStream in = morphlogyAnalayzer.tokenStream(null, reader);
            final Token reusableToken = new Token();
            // next(...) returns null once the token stream is exhausted.
            for (Token nextToken = in.next(reusableToken); nextToken != null; nextToken = in.next(reusableToken)) {
                result.add(nextToken.term());
            }
        } finally {
            // Close even when the analyzer throws, so the stream is never leaked.
            stream.close();
        }

        assertThat(result, equalTo(answer));
    }

    /**
     * Reads the first line of the given classpath resource and splits it into
     * a set of terms on runs of spaces.
     *
     * @param path absolute classpath location of the answer file
     * @return the distinct terms found on the first line
     * @throws IOException if the resource is missing, empty or cannot be read
     */
    private HashSet<String> readExpectedTerms(String path) throws IOException {
        InputStream stream = openResource(path);
        try {
            BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
            String line = breader.readLine();
            if (line == null) {
                // readLine() returns null at end of stream; fail with a clear cause instead of an NPE.
                throw new IOException("Test resource is empty: " + path);
            }
            String[] strings = line.replaceAll(" +", " ").trim().split(" ");
            return new HashSet<String>(Arrays.asList(strings));
        } finally {
            stream.close();
        }
    }

    /**
     * Opens a classpath resource, failing with a descriptive error when it is absent.
     *
     * @param path absolute classpath location of the resource
     * @return an open stream the caller must close
     * @throws IOException if the resource is not on the classpath
     */
    private InputStream openResource(String path) throws IOException {
        InputStream stream = this.getClass().getResourceAsStream(path);
        if (stream == null) {
            // getResourceAsStream signals "not found" with null rather than an exception.
            throw new IOException("Test resource not found on classpath: " + path);
        }
        return stream;
    }
}

View File

@ -0,0 +1 @@
в результат крушение погибнуть командир отряд специальный назначение пря при переть гувд ростовский область полковник милиция михаил перов и предприниматель

View File

@ -0,0 +1 @@
В результате крушения погибли командир отряда специального назначения при ГУВД Ростовской области полковник милиции Михаил Перов и предприниматель