From 8c833132a87cdad75557bfefca6677f3f78268cf Mon Sep 17 00:00:00 2001
From: "alexander.a.kuznetsov"
 <alexander.a.kuznetsov@d817d54c-26ab-11de-abc9-2f7d1455ff7a>
Date: Sat, 17 Oct 2009 17:06:55 +0000
Subject: [PATCH] Add tests for the Lucene analyzers; fix problem with string
 checking

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@62 d817d54c-26ab-11de-abc9-2f7d1455ff7a
---
 .../morphology/english/EnglishAnalayzer.java  | 53 ++++++++-------
 .../english/EnglishAnalayzerTest.java         | 65 ++++++++++++++++++
 .../english/EnglishLuceneMorphTest.java       | 26 +++++--
 .../english/english-morphology-test.txt       |  7 ++
 .../english/englsih-analayzer-answer.txt      |  1 +
 .../english/englsih-analayzer-data.txt        |  1 +
 .../apache/lucene/morphology/LuceneMorph.java |  4 ++
 .../morphology/analayzer/MorphlogyFilter.java |  4 +-
 .../russian/RussianAnalayzerTest.java         | 68 +++++++++++++++++++
 .../russian/russian-analayzer-answer.txt      |  1 +
 .../russian/russian-analayzer-data.txt        |  1 +
 11 files changed, 196 insertions(+), 35 deletions(-)
 rename russian/src/test/java/org/apache/lucene/morphology/russian/AnalayzerTest.java => english/src/main/java/org/apache/lucene/morphology/english/EnglishAnalayzer.java (59%)
 create mode 100644 english/src/test/java/org/apache/lucene/morphology/english/EnglishAnalayzerTest.java
 create mode 100644 english/src/test/resources/org/apache/lucene/morphology/english/english-morphology-test.txt
 create mode 100644 english/src/test/resources/org/apache/lucene/morphology/english/englsih-analayzer-answer.txt
 create mode 100644 english/src/test/resources/org/apache/lucene/morphology/english/englsih-analayzer-data.txt
 create mode 100644 russian/src/test/java/org/apache/lucene/morphology/russian/RussianAnalayzerTest.java
 create mode 100644 russian/src/test/resources/org/apache/lucene/morphology/russian/russian-analayzer-answer.txt
 create mode 100644 russian/src/test/resources/org/apache/lucene/morphology/russian/russian-analayzer-data.txt

diff --git a/russian/src/test/java/org/apache/lucene/morphology/russian/AnalayzerTest.java b/english/src/main/java/org/apache/lucene/morphology/english/EnglishAnalayzer.java
similarity index 59%
rename from russian/src/test/java/org/apache/lucene/morphology/russian/AnalayzerTest.java
rename to english/src/main/java/org/apache/lucene/morphology/english/EnglishAnalayzer.java
index a68ead1..bbf5330 100644
--- a/russian/src/test/java/org/apache/lucene/morphology/russian/AnalayzerTest.java
+++ b/english/src/main/java/org/apache/lucene/morphology/english/EnglishAnalayzer.java
@@ -1,26 +1,27 @@
-/**
- * Copyright 2009 Alexander Kuznetsov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.morphology.russian;
-
-import org.junit.Test;
-
-public class AnalayzerTest {
-
-    @Test
-    public void shoudGetCorrentTokens() {
-
-    }
-}
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.morphology.english;
+
+import org.apache.lucene.morphology.analayzer.MorphlogyAnalayzer;
+
+import java.io.IOException;
+
+
+public class EnglishAnalayzer extends MorphlogyAnalayzer {
+    public EnglishAnalayzer() throws IOException {
+        super(EnglishAnalayzer.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder());
+    }
+}
\ No newline at end of file
diff --git a/english/src/test/java/org/apache/lucene/morphology/english/EnglishAnalayzerTest.java b/english/src/test/java/org/apache/lucene/morphology/english/EnglishAnalayzerTest.java
new file mode 100644
index 0000000..a95b9f2
--- /dev/null
+++ b/english/src/test/java/org/apache/lucene/morphology/english/EnglishAnalayzerTest.java
@@ -0,0 +1,65 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.morphology.english;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import static org.hamcrest.Matchers.equalTo;
+import static org.junit.Assert.assertThat;
+import org.junit.Test;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.Arrays;
+import java.util.HashSet;
+
+
+public class EnglishAnalayzerTest {
+
+    @Test
+    public void shoudGiveCorretWords() throws IOException {
+        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/englsih-analayzer-answer.txt");
+        BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
+        String[] strings = breader.readLine().replaceAll(" +", " ").trim().split(" ");
+        HashSet<String> answer = new HashSet<String>(Arrays.asList(strings));
+        stream.close();
+
+        EnglishAnalayzer morphlogyAnalayzer = new EnglishAnalayzer();
+        stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/englsih-analayzer-data.txt");
+
+        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
+        final Token reusableToken = new Token();
+
+        Token nextToken;
+        TokenStream in = morphlogyAnalayzer.tokenStream(null, reader);
+        HashSet<String> result = new HashSet<String>();
+        for (; ;) {
+            nextToken = in.next(reusableToken);
+
+            if (nextToken == null) {
+                break;
+            }
+
+            result.add(nextToken.term());
+        }
+
+        stream.close();
+
+        assertThat(result, equalTo(answer));
+    }
+}
\ No newline at end of file
diff --git a/english/src/test/java/org/apache/lucene/morphology/english/EnglishLuceneMorphTest.java b/english/src/test/java/org/apache/lucene/morphology/english/EnglishLuceneMorphTest.java
index 2bc9ec7..d5c9601 100644
--- a/english/src/test/java/org/apache/lucene/morphology/english/EnglishLuceneMorphTest.java
+++ b/english/src/test/java/org/apache/lucene/morphology/english/EnglishLuceneMorphTest.java
@@ -16,10 +16,17 @@
 package org.apache.lucene.morphology.english;
 
 import org.apache.lucene.morphology.LuceneMorph;
+import static org.hamcrest.Matchers.equalTo;
+import static org.junit.Assert.assertThat;
 import org.junit.Before;
 import org.junit.Test;
 
+import java.io.BufferedReader;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.HashSet;
+import java.util.Set;
 
 public class EnglishLuceneMorphTest {
     private LuceneMorph luceneMorph;
@@ -31,11 +38,18 @@ public class EnglishLuceneMorphTest {
 
     @Test
     public void shoudGetCorrentMorphInfo() throws IOException {
-        System.out.println(luceneMorph.getMorhInfo("purchases"));
-        System.out.println(luceneMorph.getMorhInfo("existing"));
-        System.out.println(luceneMorph.getMorhInfo("was"));
-        System.out.println(luceneMorph.getMorhInfo("men"));
-        System.out.println(luceneMorph.getMorhInfo("bore"));
-        System.out.println(luceneMorph.getMorhInfo("came"));
+        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/english-morphology-test.txt");
+        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
+        String s = bufferedReader.readLine();
+        while (s != null) {
+            String[] qa = s.trim().split(" ");
+            Set<String> result = new HashSet<String>();
+            for (int i = 1; i < qa.length; i++) {
+                result.add(qa[i]);
+            }
+            Set<String> stringList = new HashSet<String>(luceneMorph.getMorhInfo(qa[0]));
+            assertThat(stringList, equalTo(result));
+            s = bufferedReader.readLine();
+        }
     }
 }
\ No newline at end of file
diff --git a/english/src/test/resources/org/apache/lucene/morphology/english/english-morphology-test.txt b/english/src/test/resources/org/apache/lucene/morphology/english/english-morphology-test.txt
new file mode 100644
index 0000000..196524b
--- /dev/null
+++ b/english/src/test/resources/org/apache/lucene/morphology/english/english-morphology-test.txt
@@ -0,0 +1,7 @@
+purchases purchas
+existing exist
+was be
+men man
+bore bore bear
+grown grow grown
+came come
\ No newline at end of file
diff --git a/english/src/test/resources/org/apache/lucene/morphology/english/englsih-analayzer-answer.txt b/english/src/test/resources/org/apache/lucene/morphology/english/englsih-analayzer-answer.txt
new file mode 100644
index 0000000..cffa6be
--- /dev/null
+++ b/english/src/test/resources/org/apache/lucene/morphology/english/englsih-analayzer-answer.txt
@@ -0,0 +1 @@
+following follow the instruction exactly will be help ensure the best well good result
\ No newline at end of file
diff --git a/english/src/test/resources/org/apache/lucene/morphology/english/englsih-analayzer-data.txt b/english/src/test/resources/org/apache/lucene/morphology/english/englsih-analayzer-data.txt
new file mode 100644
index 0000000..5c203f8
--- /dev/null
+++ b/english/src/test/resources/org/apache/lucene/morphology/english/englsih-analayzer-data.txt
@@ -0,0 +1 @@
+Following the instructions exactly will help ensure the best results
\ No newline at end of file
diff --git a/morph/src/main/java/org/apache/lucene/morphology/LuceneMorph.java b/morph/src/main/java/org/apache/lucene/morphology/LuceneMorph.java
index bed0875..86e2db5 100644
--- a/morph/src/main/java/org/apache/lucene/morphology/LuceneMorph.java
+++ b/morph/src/main/java/org/apache/lucene/morphology/LuceneMorph.java
@@ -75,4 +75,8 @@ public class LuceneMorph extends Morph {
         }
         return result.toArray(new Heuristic[result.size()]);
     }
+
+    public boolean checkString(String s) {
+        return decoderEncoder.checkString(s);
+    }
 }
diff --git a/morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyFilter.java b/morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyFilter.java
index 4f75ad5..251b6fc 100644
--- a/morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyFilter.java
+++ b/morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyFilter.java
@@ -53,9 +53,7 @@ public class MorphlogyFilter extends TokenFilter {
 
         Token nextToken = input.next(reusableToken);
         if (nextToken == null) return null; // EOS; iterator exhausted
-        Character testC = nextToken.term().charAt(0);
-        //todo check here for decoder endocoder
-        if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC) {
+        if (!luceneMorph.checkString(nextToken.term())) {
             return nextToken;
         }
         stack = luceneMorph.getMorhInfo(nextToken.term());
diff --git a/russian/src/test/java/org/apache/lucene/morphology/russian/RussianAnalayzerTest.java b/russian/src/test/java/org/apache/lucene/morphology/russian/RussianAnalayzerTest.java
new file mode 100644
index 0000000..2982de6
--- /dev/null
+++ b/russian/src/test/java/org/apache/lucene/morphology/russian/RussianAnalayzerTest.java
@@ -0,0 +1,68 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.morphology.russian;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import static org.hamcrest.Matchers.equalTo;
+import static org.junit.Assert.assertThat;
+import org.junit.Test;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.Arrays;
+import java.util.HashSet;
+
+
+public class RussianAnalayzerTest {
+
+    @Test
+    public void shoudGiveCorretWords() throws IOException {
+        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/russian-analayzer-answer.txt");
+        BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
+        String[] strings = breader.readLine().replaceAll(" +", " ").trim().split(" ");
+        HashSet<String> answer = new HashSet<String>(Arrays.asList(strings));
+        stream.close();
+
+        RussianAnalayzer morphlogyAnalayzer = new RussianAnalayzer();
+        stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/russian-analayzer-data.txt");
+
+        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
+        final Token reusableToken = new Token();
+
+        Token nextToken;
+        TokenStream in = morphlogyAnalayzer.tokenStream(null, reader);
+        HashSet<String> result = new HashSet<String>();
+        for (; ;) {
+            nextToken = in.next(reusableToken);
+
+            if (nextToken == null) {
+                break;
+            }
+
+            result.add(nextToken.term());
+            //
+
+        }
+
+        stream.close();
+
+        assertThat(result, equalTo(answer));
+    }
+}
+
diff --git a/russian/src/test/resources/org/apache/lucene/morphology/russian/russian-analayzer-answer.txt b/russian/src/test/resources/org/apache/lucene/morphology/russian/russian-analayzer-answer.txt
new file mode 100644
index 0000000..44b1843
--- /dev/null
+++ b/russian/src/test/resources/org/apache/lucene/morphology/russian/russian-analayzer-answer.txt
@@ -0,0 +1 @@
+в результат крушение погибнуть командир отряд специальный назначение пря при переть гувд ростовский область полковник милиция михаил перов и предприниматель
\ No newline at end of file
diff --git a/russian/src/test/resources/org/apache/lucene/morphology/russian/russian-analayzer-data.txt b/russian/src/test/resources/org/apache/lucene/morphology/russian/russian-analayzer-data.txt
new file mode 100644
index 0000000..c97b5e9
--- /dev/null
+++ b/russian/src/test/resources/org/apache/lucene/morphology/russian/russian-analayzer-data.txt
@@ -0,0 +1 @@
+В результате крушения погибли командир отряда специального назначения при ГУВД Ростовской области полковник милиции Михаил Перов и предприниматель
\ No newline at end of file