From 8c833132a87cdad75557bfefca6677f3f78268cf Mon Sep 17 00:00:00 2001 From: "alexander.a.kuznetsov" Date: Sat, 17 Oct 2009 17:06:55 +0000 Subject: [PATCH] adding test for lucene analayzer fixed problem with string checking git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@62 d817d54c-26ab-11de-abc9-2f7d1455ff7a --- .../morphology/english/EnglishAnalayzer.java | 53 ++++++++------- .../english/EnglishAnalayzerTest.java | 65 ++++++++++++++++++ .../english/EnglishLuceneMorphTest.java | 26 +++++-- .../english/english-morphology-test.txt | 7 ++ .../english/englsih-analayzer-answer.txt | 1 + .../english/englsih-analayzer-data.txt | 1 + .../apache/lucene/morphology/LuceneMorph.java | 4 ++ .../morphology/analayzer/MorphlogyFilter.java | 4 +- .../russian/RussianAnalayzerTest.java | 68 +++++++++++++++++++ .../russian/russian-analayzer-answer.txt | 1 + .../russian/russian-analayzer-data.txt | 1 + 11 files changed, 196 insertions(+), 35 deletions(-) rename russian/src/test/java/org/apache/lucene/morphology/russian/AnalayzerTest.java => english/src/main/java/org/apache/lucene/morphology/english/EnglishAnalayzer.java (59%) create mode 100644 english/src/test/java/org/apache/lucene/morphology/english/EnglishAnalayzerTest.java create mode 100644 english/src/test/resources/org/apache/lucene/morphology/english/english-morphology-test.txt create mode 100644 english/src/test/resources/org/apache/lucene/morphology/english/englsih-analayzer-answer.txt create mode 100644 english/src/test/resources/org/apache/lucene/morphology/english/englsih-analayzer-data.txt create mode 100644 russian/src/test/java/org/apache/lucene/morphology/russian/RussianAnalayzerTest.java create mode 100644 russian/src/test/resources/org/apache/lucene/morphology/russian/russian-analayzer-answer.txt create mode 100644 russian/src/test/resources/org/apache/lucene/morphology/russian/russian-analayzer-data.txt diff --git a/russian/src/test/java/org/apache/lucene/morphology/russian/AnalayzerTest.java b/english/src/main/java/org/apache/lucene/morphology/english/EnglishAnalayzer.java similarity index 59% rename from russian/src/test/java/org/apache/lucene/morphology/russian/AnalayzerTest.java rename to english/src/main/java/org/apache/lucene/morphology/english/EnglishAnalayzer.java index a68ead1..bbf5330 100644 --- a/russian/src/test/java/org/apache/lucene/morphology/russian/AnalayzerTest.java +++ b/english/src/main/java/org/apache/lucene/morphology/english/EnglishAnalayzer.java @@ -1,26 +1,27 @@ -/** - * Copyright 2009 Alexander Kuznetsov - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.morphology.russian; - -import org.junit.Test; - -public class AnalayzerTest { - - @Test - public void shoudGetCorrentTokens() { - - } -} +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.morphology.english; + +import org.apache.lucene.morphology.analayzer.MorphlogyAnalayzer; + +import java.io.IOException; + + +public class EnglishAnalayzer extends MorphlogyAnalayzer { + public EnglishAnalayzer() throws IOException { + super(EnglishAnalayzer.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder()); + } +} \ No newline at end of file diff --git a/english/src/test/java/org/apache/lucene/morphology/english/EnglishAnalayzerTest.java b/english/src/test/java/org/apache/lucene/morphology/english/EnglishAnalayzerTest.java new file mode 100644 index 0000000..a95b9f2 --- /dev/null +++ b/english/src/test/java/org/apache/lucene/morphology/english/EnglishAnalayzerTest.java @@ -0,0 +1,65 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.morphology.english; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import static org.hamcrest.Matchers.equalTo; +import static org.junit.Assert.assertThat; +import org.junit.Test; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.Arrays; +import java.util.HashSet; + + +public class EnglishAnalayzerTest { + + @Test + public void shoudGiveCorretWords() throws IOException { + InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/englsih-analayzer-answer.txt"); + BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); + String[] strings = breader.readLine().replaceAll(" +", " ").trim().split(" "); + HashSet answer = new HashSet(Arrays.asList(strings)); + stream.close(); + + EnglishAnalayzer morphlogyAnalayzer = new EnglishAnalayzer(); + stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/englsih-analayzer-data.txt"); + + InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); + final Token reusableToken = new Token(); + + Token nextToken; + TokenStream in = morphlogyAnalayzer.tokenStream(null, reader); + HashSet result = new HashSet(); + for (; ;) { + nextToken = in.next(reusableToken); + + if (nextToken == null) { + break; + } + + result.add(nextToken.term()); + } + + stream.close(); + + assertThat(result, equalTo(answer)); + } +} \ No newline at end of file diff --git a/english/src/test/java/org/apache/lucene/morphology/english/EnglishLuceneMorphTest.java b/english/src/test/java/org/apache/lucene/morphology/english/EnglishLuceneMorphTest.java index 2bc9ec7..d5c9601 100644 --- a/english/src/test/java/org/apache/lucene/morphology/english/EnglishLuceneMorphTest.java +++ b/english/src/test/java/org/apache/lucene/morphology/english/EnglishLuceneMorphTest.java @@ -16,10 +16,17 @@ package org.apache.lucene.morphology.english; import org.apache.lucene.morphology.LuceneMorph; +import static org.hamcrest.Matchers.equalTo; +import static org.junit.Assert.assertThat; import org.junit.Before; import org.junit.Test; +import java.io.BufferedReader; import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.HashSet; +import java.util.Set; public class EnglishLuceneMorphTest { private LuceneMorph luceneMorph; @@ -31,11 +38,18 @@ public class EnglishLuceneMorphTest { @Test public void shoudGetCorrentMorphInfo() throws IOException { - System.out.println(luceneMorph.getMorhInfo("purchases")); - System.out.println(luceneMorph.getMorhInfo("existing")); - System.out.println(luceneMorph.getMorhInfo("was")); - System.out.println(luceneMorph.getMorhInfo("men")); - System.out.println(luceneMorph.getMorhInfo("bore")); - System.out.println(luceneMorph.getMorhInfo("came")); + InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/english-morphology-test.txt"); + BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); + String s = bufferedReader.readLine(); + while (s != null) { + String[] qa = s.trim().split(" "); + Set result = new HashSet(); + for (int i = 1; i < qa.length; i++) { + result.add(qa[i]); + } + Set stringList = new HashSet(luceneMorph.getMorhInfo(qa[0])); + assertThat(stringList, equalTo(result)); + s = bufferedReader.readLine(); + } } } \ No newline at end of file diff --git a/english/src/test/resources/org/apache/lucene/morphology/english/english-morphology-test.txt b/english/src/test/resources/org/apache/lucene/morphology/english/english-morphology-test.txt new file mode 100644 index 0000000..196524b --- /dev/null +++ b/english/src/test/resources/org/apache/lucene/morphology/english/english-morphology-test.txt @@ -0,0 +1,7 @@ +purchases purchas +existing exist +was be +men man +bore bore bear +grown grow grown +came come \ No newline at end of file diff --git a/english/src/test/resources/org/apache/lucene/morphology/english/englsih-analayzer-answer.txt b/english/src/test/resources/org/apache/lucene/morphology/english/englsih-analayzer-answer.txt new file mode 100644 index 0000000..cffa6be --- /dev/null +++ b/english/src/test/resources/org/apache/lucene/morphology/english/englsih-analayzer-answer.txt @@ -0,0 +1 @@ +following follow the instruction exactly will be help ensure the best well good result \ No newline at end of file diff --git a/english/src/test/resources/org/apache/lucene/morphology/english/englsih-analayzer-data.txt b/english/src/test/resources/org/apache/lucene/morphology/english/englsih-analayzer-data.txt new file mode 100644 index 0000000..5c203f8 --- /dev/null +++ b/english/src/test/resources/org/apache/lucene/morphology/english/englsih-analayzer-data.txt @@ -0,0 +1 @@ +Following the instructions exactly will help ensure the best results \ No newline at end of file diff --git a/morph/src/main/java/org/apache/lucene/morphology/LuceneMorph.java b/morph/src/main/java/org/apache/lucene/morphology/LuceneMorph.java index bed0875..86e2db5 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/LuceneMorph.java +++ b/morph/src/main/java/org/apache/lucene/morphology/LuceneMorph.java @@ -75,4 +75,8 @@ public class LuceneMorph extends Morph { } return result.toArray(new Heuristic[result.size()]); } + + public boolean checkString(String s) { + return decoderEncoder.checkString(s); + } } diff --git a/morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyFilter.java b/morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyFilter.java index 4f75ad5..251b6fc 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyFilter.java +++ b/morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyFilter.java @@ -53,9 +53,7 @@ public class MorphlogyFilter extends TokenFilter { Token nextToken = input.next(reusableToken); if (nextToken == null) return null; // EOS; iterator exhausted - Character testC = nextToken.term().charAt(0); - //todo check here for decoder endocoder - if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC) { + if (!luceneMorph.checkString(nextToken.term())) { return nextToken; } stack = luceneMorph.getMorhInfo(nextToken.term()); diff --git a/russian/src/test/java/org/apache/lucene/morphology/russian/RussianAnalayzerTest.java b/russian/src/test/java/org/apache/lucene/morphology/russian/RussianAnalayzerTest.java new file mode 100644 index 0000000..2982de6 --- /dev/null +++ b/russian/src/test/java/org/apache/lucene/morphology/russian/RussianAnalayzerTest.java @@ -0,0 +1,68 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.morphology.russian; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import static org.hamcrest.Matchers.equalTo; +import static org.junit.Assert.assertThat; +import org.junit.Test; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.Arrays; +import java.util.HashSet; + + +public class RussianAnalayzerTest { + + @Test + public void shoudGiveCorretWords() throws IOException { + InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/russian-analayzer-answer.txt"); + BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); + String[] strings = breader.readLine().replaceAll(" +", " ").trim().split(" "); + HashSet answer = new HashSet(Arrays.asList(strings)); + stream.close(); + + RussianAnalayzer morphlogyAnalayzer = new RussianAnalayzer(); + stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/russian-analayzer-data.txt"); + + InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); + final Token reusableToken = new Token(); + + Token nextToken; + TokenStream in = morphlogyAnalayzer.tokenStream(null, reader); + HashSet result = new HashSet(); + for (; ;) { + nextToken = in.next(reusableToken); + + if (nextToken == null) { + break; + } + + result.add(nextToken.term()); + // + + } + + stream.close(); + + assertThat(result, equalTo(answer)); + } +} + diff --git a/russian/src/test/resources/org/apache/lucene/morphology/russian/russian-analayzer-answer.txt b/russian/src/test/resources/org/apache/lucene/morphology/russian/russian-analayzer-answer.txt new file mode 100644 index 0000000..44b1843 --- /dev/null +++ b/russian/src/test/resources/org/apache/lucene/morphology/russian/russian-analayzer-answer.txt @@ -0,0 +1 @@ +в результат крушение погибнуть командир отряд специальный назначение пря при переть гувд ростовский область полковник милиция михаил перов и предприниматель \ No newline at end of file diff --git a/russian/src/test/resources/org/apache/lucene/morphology/russian/russian-analayzer-data.txt b/russian/src/test/resources/org/apache/lucene/morphology/russian/russian-analayzer-data.txt new file mode 100644 index 0000000..c97b5e9 --- /dev/null +++ b/russian/src/test/resources/org/apache/lucene/morphology/russian/russian-analayzer-data.txt @@ -0,0 +1 @@ +В результате крушения погибли командир отряда специального назначения при ГУВД Ростовской области полковник милиции Михаил Перов и предприниматель \ No newline at end of file