Adding test for Lucene analyzer

fixed problem with string checking git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@62 d817d54c-26ab-11de-abc9-2f7d1455ff7a
2009-10-17 17:06:55 +00:00
parent 26ca704ec0
commit 8c833132a8
11 changed files with 196 additions and 35 deletions
@@ -13,14 +13,15 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-package org.apache.lucene.morphology.russian;
+package org.apache.lucene.morphology.english;
-import org.junit.Test;
+import org.apache.lucene.morphology.analayzer.MorphlogyAnalayzer;
-public class AnalayzerTest {
+import java.io.IOException;
    @Test
    public void shoudGetCorrentTokens() {
 /**
  * {@link MorphlogyAnalayzer} subclass wired to the English morphology resource and the
  * English letter decoder/encoder.
  */
 public class EnglishAnalayzer extends MorphlogyAnalayzer {
    /**
     * Hands the classpath resource {@code /org/apache/lucene/morphology/english/morph.info}
     * and a new {@link EnglishLetterDecoderEncoder} to the superclass constructor.
     *
     * @throws IOException declared here; presumably raised by the superclass while reading
     *                     the resource stream (confirm against MorphlogyAnalayzer)
     */
    public EnglishAnalayzer() throws IOException {
        super(EnglishAnalayzer.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder());
    }
 }
@@ -0,0 +1,65 @@
 /**
 * Copyright 2009 Alexander Kuznetsov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.morphology.english;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import static org.hamcrest.Matchers.equalTo;
 import static org.junit.Assert.assertThat;
 import org.junit.Test;
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.util.Arrays;
 import java.util.HashSet;
 /**
  * Verifies that {@link EnglishAnalayzer} produces exactly the set of terms listed in the
  * answer fixture when it analyzes the English data fixture.
  */
 public class EnglishAnalayzerTest {
    private static final String ANSWER_RESOURCE = "/org/apache/lucene/morphology/english/englsih-analayzer-answer.txt";
    private static final String DATA_RESOURCE = "/org/apache/lucene/morphology/english/englsih-analayzer-data.txt";

    /**
     * Collects the distinct terms emitted by the analyzer for the data fixture and compares
     * them with the space-separated words on the first line of the answer fixture.
     *
     * @throws IOException if a fixture is missing, empty or cannot be read
     */
    @Test
    public void shoudGiveCorretWords() throws IOException {
        HashSet<String> answer = readExpectedWords();

        EnglishAnalayzer morphlogyAnalayzer = new EnglishAnalayzer();
        HashSet<String> result = new HashSet<String>();
        InputStream stream = openResource(DATA_RESOURCE);
        try {
            InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
            TokenStream in = morphlogyAnalayzer.tokenStream(null, reader);
            final Token reusableToken = new Token();
            // next(...) returns null once the token stream is exhausted.
            for (Token nextToken = in.next(reusableToken); nextToken != null; nextToken = in.next(reusableToken)) {
                result.add(nextToken.term());
            }
        } finally {
            // Release the fixture even when analysis throws.
            stream.close();
        }

        assertThat(result, equalTo(answer));
    }

    /**
     * Reads the first line of the answer fixture and splits it into a set of words,
     * collapsing runs of spaces first.
     */
    private HashSet<String> readExpectedWords() throws IOException {
        InputStream stream = openResource(ANSWER_RESOURCE);
        try {
            BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
            String line = breader.readLine();
            if (line == null) {
                // readLine() yields null for an empty file; fail with a readable message instead of an NPE.
                throw new IOException("Answer fixture is empty: " + ANSWER_RESOURCE);
            }
            String[] strings = line.replaceAll(" +", " ").trim().split(" ");
            return new HashSet<String>(Arrays.asList(strings));
        } finally {
            stream.close();
        }
    }

    /**
     * Opens a classpath resource, failing with a descriptive error when it does not exist.
     */
    private InputStream openResource(String path) throws IOException {
        InputStream stream = this.getClass().getResourceAsStream(path);
        if (stream == null) {
            // getResourceAsStream signals a missing resource with null rather than an exception.
            throw new IOException("Test resource not found on classpath: " + path);
        }
        return stream;
    }
 }
@@ -16,10 +16,17 @@
 package org.apache.lucene.morphology.english;
 import org.apache.lucene.morphology.LuceneMorph;
 import static org.hamcrest.Matchers.equalTo;
 import static org.junit.Assert.assertThat;
 import org.junit.Before;
 import org.junit.Test;
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.util.HashSet;
 import java.util.Set;
 public class EnglishLuceneMorphTest {
    private LuceneMorph luceneMorph;
@@ -31,11 +38,18 @@ public class EnglishLuceneMorphTest {
    @Test
    public void shoudGetCorrentMorphInfo() throws IOException {
-        System.out.println(luceneMorph.getMorhInfo("purchases"));
+        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/english-morphology-test.txt");
-        System.out.println(luceneMorph.getMorhInfo("existing"));
+        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
-        System.out.println(luceneMorph.getMorhInfo("was"));
+        String s = bufferedReader.readLine();
-        System.out.println(luceneMorph.getMorhInfo("men"));
+        while (s != null) {
-        System.out.println(luceneMorph.getMorhInfo("bore"));
+            String[] qa = s.trim().split(" ");
-        System.out.println(luceneMorph.getMorhInfo("came"));
+            Set<String> result = new HashSet<String>();
            for (int i = 1; i < qa.length; i++) {
                result.add(qa[i]);
            }
            Set<String> stringList = new HashSet<String>(luceneMorph.getMorhInfo(qa[0]));
            assertThat(stringList, equalTo(result));
            s = bufferedReader.readLine();
        }
    }
 }
@@ -0,0 +1,7 @@
 purchases purchas
 existing exist
 was be
 men man
 bore bore bear
 grown grow grown
 came come
@@ -0,0 +1 @@
 following follow the instruction exactly will be help ensure the best well good result
@@ -0,0 +1 @@
 Following the instructions exactly will help ensure the best results
@@ -75,4 +75,8 @@ public class LuceneMorph extends Morph {
        }
        return result.toArray(new Heuristic[result.size()]);
    }
    /**
     * Forwards the given string to the decoder/encoder's own {@code checkString} and
     * reports its verdict unchanged.
     *
     * @param s the string to check
     * @return the value returned by {@code decoderEncoder.checkString(s)}
     */
    public boolean checkString(String s) {
        final boolean accepted = decoderEncoder.checkString(s);
        return accepted;
    }
 }
@@ -53,9 +53,7 @@ public class MorphlogyFilter extends TokenFilter {
        Token nextToken = input.next(reusableToken);
        if (nextToken == null) return null; // EOS; iterator exhausted
-        Character testC = nextToken.term().charAt(0);
+        if (!luceneMorph.checkString(nextToken.term())) {
        //todo check here for decoder endocoder
        if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC) {
            return nextToken;
        }
        stack = luceneMorph.getMorhInfo(nextToken.term());
@@ -0,0 +1,68 @@
 /**
 * Copyright 2009 Alexander Kuznetsov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.morphology.russian;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import static org.hamcrest.Matchers.equalTo;
 import static org.junit.Assert.assertThat;
 import org.junit.Test;
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.util.Arrays;
 import java.util.HashSet;
 /**
  * Verifies that {@link RussianAnalayzer} produces exactly the set of terms listed in the
  * answer fixture when it analyzes the Russian data fixture.
  */
 public class RussianAnalayzerTest {
    private static final String ANSWER_RESOURCE = "/org/apache/lucene/morphology/russian/russian-analayzer-answer.txt";
    private static final String DATA_RESOURCE = "/org/apache/lucene/morphology/russian/russian-analayzer-data.txt";

    /**
     * Collects the distinct terms emitted by the analyzer for the data fixture and compares
     * them with the space-separated words on the first line of the answer fixture.
     *
     * @throws IOException if a fixture is missing, empty or cannot be read
     */
    @Test
    public void shoudGiveCorretWords() throws IOException {
        HashSet<String> answer = readExpectedWords();

        RussianAnalayzer morphlogyAnalayzer = new RussianAnalayzer();
        HashSet<String> result = new HashSet<String>();
        InputStream stream = openResource(DATA_RESOURCE);
        try {
            InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
            TokenStream in = morphlogyAnalayzer.tokenStream(null, reader);
            final Token reusableToken = new Token();
            // next(...) returns null once the token stream is exhausted.
            for (Token nextToken = in.next(reusableToken); nextToken != null; nextToken = in.next(reusableToken)) {
                result.add(nextToken.term());
            }
        } finally {
            // Release the fixture even when analysis throws.
            stream.close();
        }

        assertThat(result, equalTo(answer));
    }

    /**
     * Reads the first line of the answer fixture and splits it into a set of words,
     * collapsing runs of spaces first.
     */
    private HashSet<String> readExpectedWords() throws IOException {
        InputStream stream = openResource(ANSWER_RESOURCE);
        try {
            BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
            String line = breader.readLine();
            if (line == null) {
                // readLine() yields null for an empty file; fail with a readable message instead of an NPE.
                throw new IOException("Answer fixture is empty: " + ANSWER_RESOURCE);
            }
            String[] strings = line.replaceAll(" +", " ").trim().split(" ");
            return new HashSet<String>(Arrays.asList(strings));
        } finally {
            stream.close();
        }
    }

    /**
     * Opens a classpath resource, failing with a descriptive error when it does not exist.
     */
    private InputStream openResource(String path) throws IOException {
        InputStream stream = this.getClass().getResourceAsStream(path);
        if (stream == null) {
            // getResourceAsStream signals a missing resource with null rather than an exception.
            throw new IOException("Test resource not found on classpath: " + path);
        }
        return stream;
    }
 }
@@ -0,0 +1 @@
 в результат крушение погибнуть командир отряд специальный назначение пря при переть гувд ростовский область полковник милиция михаил перов и предприниматель
@@ -0,0 +1 @@
 В результате крушения погибли командир отряда специального назначения при ГУВД Ростовской области полковник милиции Михаил Перов и предприниматель
		`@@ -0,0 +1 @@`
							`following follow the instruction exactly will be help ensure the best well good result`
		`@@ -0,0 +1 @@`
							`Following the instructions exactly will help ensure the best results`
		`@@ -0,0 +1 @@`
							`в результат крушение погибнуть командир отряд специальный назначение пря при переть гувд ростовский область полковник милиция михаил перов и предприниматель`
		`@@ -0,0 +1 @@`
							`В результате крушения погибли командир отряда специального назначения при ГУВД Ростовской области полковник милиции Михаил Перов и предприниматель`