Adding test for Lucene analyzer
Fixed problem with string checking.
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@62 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
parent
26ca704ec0
commit
8c833132a8
@ -13,14 +13,15 @@
|
|||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.morphology.russian;
|
package org.apache.lucene.morphology.english;
|
||||||
|
|
||||||
import org.junit.Test;
|
import org.apache.lucene.morphology.analayzer.MorphlogyAnalayzer;
|
||||||
|
|
||||||
public class AnalayzerTest {
|
import java.io.IOException;
|
||||||
|
|
||||||
@Test
|
|
||||||
public void shoudGetCorrentTokens() {
|
|
||||||
|
|
||||||
|
public class EnglishAnalayzer extends MorphlogyAnalayzer {
|
||||||
|
public EnglishAnalayzer() throws IOException {
|
||||||
|
super(EnglishAnalayzer.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder());
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -0,0 +1,65 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.morphology.english;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import static org.hamcrest.Matchers.equalTo;
|
||||||
|
import static org.junit.Assert.assertThat;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.HashSet;
|
||||||
|
|
||||||
|
|
||||||
|
public class EnglishAnalayzerTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void shoudGiveCorretWords() throws IOException {
|
||||||
|
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/englsih-analayzer-answer.txt");
|
||||||
|
BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||||
|
String[] strings = breader.readLine().replaceAll(" +", " ").trim().split(" ");
|
||||||
|
HashSet<String> answer = new HashSet<String>(Arrays.asList(strings));
|
||||||
|
stream.close();
|
||||||
|
|
||||||
|
EnglishAnalayzer morphlogyAnalayzer = new EnglishAnalayzer();
|
||||||
|
stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/englsih-analayzer-data.txt");
|
||||||
|
|
||||||
|
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
|
||||||
|
final Token reusableToken = new Token();
|
||||||
|
|
||||||
|
Token nextToken;
|
||||||
|
TokenStream in = morphlogyAnalayzer.tokenStream(null, reader);
|
||||||
|
HashSet<String> result = new HashSet<String>();
|
||||||
|
for (; ;) {
|
||||||
|
nextToken = in.next(reusableToken);
|
||||||
|
|
||||||
|
if (nextToken == null) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
result.add(nextToken.term());
|
||||||
|
}
|
||||||
|
|
||||||
|
stream.close();
|
||||||
|
|
||||||
|
assertThat(result, equalTo(answer));
|
||||||
|
}
|
||||||
|
}
|
@ -16,10 +16,17 @@
|
|||||||
package org.apache.lucene.morphology.english;
|
package org.apache.lucene.morphology.english;
|
||||||
|
|
||||||
import org.apache.lucene.morphology.LuceneMorph;
|
import org.apache.lucene.morphology.LuceneMorph;
|
||||||
|
import static org.hamcrest.Matchers.equalTo;
|
||||||
|
import static org.junit.Assert.assertThat;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
public class EnglishLuceneMorphTest {
|
public class EnglishLuceneMorphTest {
|
||||||
private LuceneMorph luceneMorph;
|
private LuceneMorph luceneMorph;
|
||||||
@ -31,11 +38,18 @@ public class EnglishLuceneMorphTest {
|
|||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void shoudGetCorrentMorphInfo() throws IOException {
|
public void shoudGetCorrentMorphInfo() throws IOException {
|
||||||
System.out.println(luceneMorph.getMorhInfo("purchases"));
|
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/english-morphology-test.txt");
|
||||||
System.out.println(luceneMorph.getMorhInfo("existing"));
|
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||||
System.out.println(luceneMorph.getMorhInfo("was"));
|
String s = bufferedReader.readLine();
|
||||||
System.out.println(luceneMorph.getMorhInfo("men"));
|
while (s != null) {
|
||||||
System.out.println(luceneMorph.getMorhInfo("bore"));
|
String[] qa = s.trim().split(" ");
|
||||||
System.out.println(luceneMorph.getMorhInfo("came"));
|
Set<String> result = new HashSet<String>();
|
||||||
|
for (int i = 1; i < qa.length; i++) {
|
||||||
|
result.add(qa[i]);
|
||||||
|
}
|
||||||
|
Set<String> stringList = new HashSet<String>(luceneMorph.getMorhInfo(qa[0]));
|
||||||
|
assertThat(stringList, equalTo(result));
|
||||||
|
s = bufferedReader.readLine();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -0,0 +1,7 @@
|
|||||||
|
purchases purchas
|
||||||
|
existing exist
|
||||||
|
was be
|
||||||
|
men man
|
||||||
|
bore bore bear
|
||||||
|
grown grow grown
|
||||||
|
came come
|
@ -0,0 +1 @@
|
|||||||
|
following follow the instruction exactly will be help ensure the best well good result
|
@ -0,0 +1 @@
|
|||||||
|
Following the instructions exactly will help ensure the best results
|
@ -75,4 +75,8 @@ public class LuceneMorph extends Morph {
|
|||||||
}
|
}
|
||||||
return result.toArray(new Heuristic[result.size()]);
|
return result.toArray(new Heuristic[result.size()]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean checkString(String s) {
|
||||||
|
return decoderEncoder.checkString(s);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -53,9 +53,7 @@ public class MorphlogyFilter extends TokenFilter {
|
|||||||
|
|
||||||
Token nextToken = input.next(reusableToken);
|
Token nextToken = input.next(reusableToken);
|
||||||
if (nextToken == null) return null; // EOS; iterator exhausted
|
if (nextToken == null) return null; // EOS; iterator exhausted
|
||||||
Character testC = nextToken.term().charAt(0);
|
if (!luceneMorph.checkString(nextToken.term())) {
|
||||||
//todo check here for decoder endocoder
|
|
||||||
if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC) {
|
|
||||||
return nextToken;
|
return nextToken;
|
||||||
}
|
}
|
||||||
stack = luceneMorph.getMorhInfo(nextToken.term());
|
stack = luceneMorph.getMorhInfo(nextToken.term());
|
||||||
|
@ -0,0 +1,68 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.morphology.russian;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import static org.hamcrest.Matchers.equalTo;
|
||||||
|
import static org.junit.Assert.assertThat;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.HashSet;
|
||||||
|
|
||||||
|
|
||||||
|
public class RussianAnalayzerTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void shoudGiveCorretWords() throws IOException {
|
||||||
|
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/russian-analayzer-answer.txt");
|
||||||
|
BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||||
|
String[] strings = breader.readLine().replaceAll(" +", " ").trim().split(" ");
|
||||||
|
HashSet<String> answer = new HashSet<String>(Arrays.asList(strings));
|
||||||
|
stream.close();
|
||||||
|
|
||||||
|
RussianAnalayzer morphlogyAnalayzer = new RussianAnalayzer();
|
||||||
|
stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/russian-analayzer-data.txt");
|
||||||
|
|
||||||
|
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
|
||||||
|
final Token reusableToken = new Token();
|
||||||
|
|
||||||
|
Token nextToken;
|
||||||
|
TokenStream in = morphlogyAnalayzer.tokenStream(null, reader);
|
||||||
|
HashSet<String> result = new HashSet<String>();
|
||||||
|
for (; ;) {
|
||||||
|
nextToken = in.next(reusableToken);
|
||||||
|
|
||||||
|
if (nextToken == null) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
result.add(nextToken.term());
|
||||||
|
//
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
stream.close();
|
||||||
|
|
||||||
|
assertThat(result, equalTo(answer));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1 @@
|
|||||||
|
в результат крушение погибнуть командир отряд специальный назначение пря при переть гувд ростовский область полковник милиция михаил перов и предприниматель
|
@ -0,0 +1 @@
|
|||||||
|
В результате крушения погибли командир отряда специального назначения при ГУВД Ростовской области полковник милиции Михаил Перов и предприниматель
|
Loading…
x
Reference in New Issue
Block a user