Add test for all words; refactor tests and dictionary reading

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@99 d817d54c-26ab-11de-abc9-2f7d1455ff7a
2010-10-08 11:35:13 +00:00
parent e8399999c3
commit 76e68a11e0
26 changed files with 730 additions and 48 deletions
--- a/dictionary-reader/pom.xml
+++ b/dictionary-reader/pom.xml
@@ -1,5 +1,6 @@
 <?xml version="1.0"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <parent>
        <artifactId>morphology</artifactId>
        <groupId>org.apache.lucene.morphology</groupId>
@@ -26,4 +27,6 @@
            <version>0.9-SNAPSHOT</version>
        </dependency>
    </dependencies>
+
+
 </project>
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictionaryReader.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictionaryReader.java
@@ -34,16 +34,19 @@ public class DictionaryReader {
    private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
    private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
    private Set<String> ignoredForm = new HashSet<String>();
+    private List<WordFilter> filters = new ArrayList<WordFilter>();

-    public DictionaryReader(String fileName, Set<String> ignoredForm) {
+    public DictionaryReader(String fileName, Set<String> ignoredForm, List<WordFilter> filters) {
        this.fileName = fileName;
        this.ignoredForm = ignoredForm;
+        this.filters = filters;
    }

-    public DictionaryReader(String fileName, String fileEncoding, Set<String> ignoredForm) {
+    public DictionaryReader(String fileName, String fileEncoding, Set<String> ignoredForm, List<WordFilter> filters) {
        this.fileName = fileName;
        this.fileEncoding = fileEncoding;
        this.ignoredForm = ignoredForm;
+        this.filters = filters;
    }


@@ -60,30 +63,46 @@ public class DictionaryReader {
    private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
        String s = reader.readLine();
        int count = Integer.valueOf(s);
+        int actual = 0;
        for (int i = 0; i < count; i++) {
            s = reader.readLine();
            if (i % 10000 == 0) System.out.println("Proccess " + i + " wordBase of " + count);

+            WordCard card = buildForm(s);
+
+            for (WordFilter wf : filters) {
+                if (card == null) break;
+                card = wf.transform(card);
+            }
+
+            if (card == null) {
+                continue;
+            }
+
+            wordProccessor.process(card);
+            actual++;
+
+        }
+        System.out.println("Finished word processing actual words " + actual);
+    }
+
+    private WordCard buildForm(String s) {
        String[] wd = s.split(" ");
        String wordBase = wd[0].toLowerCase();
-            if (wordBase.startsWith("-")) continue;
+        if (wordBase.startsWith("-")) return null;
        wordBase = "#".equals(wordBase) ? "" : wordBase;
        List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1]));
        FlexiaModel flexiaModel = models.get(0);
-            if (models.size() > 0 && !ignoredForm.contains(flexiaModel.getCode())) {
+        if (models.size() == 0 || ignoredForm.contains(flexiaModel.getCode())) {
+            return null;
+        }

        WordCard card = new WordCard(flexiaModel.create(wordBase), wordBase, flexiaModel.getSuffix());
+
        for (FlexiaModel fm : models) {
            card.addFlexia(fm);
        }
-//                if(card.getBase().equals("face") || card.getBase().equals("fac")){
-//                    System.out.println(models);
-//                    System.out.println(card);
-                    wordProccessor.process(card);
-                //}
-
-            }
-        }
+        return card;
    }


@@ -122,7 +141,7 @@ public class DictionaryReader {
        String[] fl = line.split("\\*");
        // we inored all forms thats
        if (fl.length == 3) {
-            System.out.println(line);
+            //System.out.println(line);
            // flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), cleanString(fl[2].toLowerCase())));
        }
        if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/FlexiaModel.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/FlexiaModel.java
@@ -66,4 +66,26 @@ public class FlexiaModel {
                ", prefix='" + prefix + '\'' +
                '}';
    }
+
+    @Override
+    public boolean equals(Object o) { // value equality over code, prefix and suffix
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false; // exact class match required, subclasses never compare equal
+
+        FlexiaModel that = (FlexiaModel) o;
+
+        if (code != null ? !code.equals(that.code) : that.code != null) return false; // null-safe comparison: two nulls count as equal
+        if (prefix != null ? !prefix.equals(that.prefix) : that.prefix != null) return false;
+        if (suffix != null ? !suffix.equals(that.suffix) : that.suffix != null) return false;
+
+        return true;
+    }
+
+    @Override
+    public int hashCode() { // built from the same three fields that equals compares
+        int result = code != null ? code.hashCode() : 0; // a null field contributes 0
+        result = 31 * result + (suffix != null ? suffix.hashCode() : 0);
+        result = 31 * result + (prefix != null ? prefix.hashCode() : 0);
+        return result;
+    }
 }
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/StatisticsCollector.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/StatisticsCollector.java
@@ -42,12 +42,8 @@ public class StatisticsCollector implements WordProccessor {
    public void process(WordCard wordCard) throws IOException {
        cleanWordCard(wordCard);
        String normalStringMorph = wordCard.getWordsForms().get(0).getCode();
-        String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
-        if (word.contains("-")) return;
-        if (!decoderEncoder.checkString(word)) return;

        for (FlexiaModel fm : wordCard.getWordsForms()) {
-            if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) continue;
            Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph);
            String form = revertWord(fm.create(wordCard.getBase()));
            Set<Heuristic> suffixHeuristics = inverseIndex.get(form);
@@ -138,8 +134,8 @@ public class StatisticsCollector implements WordProccessor {
        Integer length = getCommonLength(form, normalForm);
        Integer actualSuffixLengh = form.length() - length;
        String actualNormalSuffix = normalForm.substring(length);
-        Integer integer = grammaReader.getGrammInversIndex().get(fm.getCode().substring(0, 2));
-        Integer nf = grammaReader.getGrammInversIndex().get(normalSuffixForm.substring(0, 2));
+        Integer integer = grammaReader.getGrammInversIndex().get(fm.getCode());
+        Integer nf = grammaReader.getGrammInversIndex().get(normalSuffixForm);
        return new Heuristic((byte) actualSuffixLengh.intValue(), actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue());
    }

--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCard.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCard.java
@@ -38,6 +38,10 @@ public class WordCard {
        wordsForms.add(flexiaModel);
    }

+    public void removeFlexia(FlexiaModel flexiaModel) { // counterpart of addFlexia: drops the given form from wordsForms
+        wordsForms.remove(flexiaModel); // NOTE(review): presumably a List.remove(Object), which matches via FlexiaModel.equals - confirm the field type
+    }
+
    public String getCanonicalForm() {
        return canonicalForm;
    }
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCleaner.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCleaner.java
@@ -0,0 +1,50 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.morphology.dictionary;
+
+import org.apache.lucene.morphology.LetterDecoderEncoder;
+
+import java.util.LinkedList;
+import java.util.List;
+
+
+public class WordCleaner implements WordFilter { // filter that rejects cards and strips forms which contain a hyphen or fail decoderEncoder.checkString
+
+    private LetterDecoderEncoder decoderEncoder;
+
+    public WordCleaner(LetterDecoderEncoder decoderEncoder) {
+        this.decoderEncoder = decoderEncoder;
+    }
+
+    public WordCard transform(WordCard wordCard) { // returns the same card, possibly with fewer forms, or null to reject it
+        String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
+
+        if (word.contains("-")) return null; // reject the whole card when base + canonical suffix contains a hyphen
+        if (!decoderEncoder.checkString(word)) return null; // reject the whole card when checkString refuses the word
+
+        List<FlexiaModel> flexiaModelsToRemove = new LinkedList<FlexiaModel>(); // collected first so the forms list is not modified while it is iterated
+        for (FlexiaModel fm : wordCard.getWordsForms()) {
+            if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) { // same two checks, applied to each generated form
+                flexiaModelsToRemove.add(fm);
+            }
+        }
+        for (FlexiaModel fm : flexiaModelsToRemove) {
+            wordCard.removeFlexia(fm);
+        }
+
+        return wordCard; // NOTE(review): may be returned with no forms left if every form was removed - confirm callers cope
+    }
+}
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordFilter.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordFilter.java
@@ -0,0 +1,24 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.morphology.dictionary;
+
+
+public interface WordFilter { // hook that DictionaryReader applies, in list order, to each WordCard before it is processed
+
+    public WordCard transform(WordCard wordCard); // returns the card to keep (possibly modified), or null to drop it
+
+}
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordStringCleaner.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordStringCleaner.java
@@ -0,0 +1,49 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.morphology.dictionary;
+
+import org.apache.lucene.morphology.LetterDecoderEncoder;
+
+import java.util.List;
+
+
+public class WordStringCleaner implements WordFilter { // filter that normalises every string of a card through decoderEncoder.cleanString
+
+    private LetterDecoderEncoder decoderEncoder;
+
+    public WordStringCleaner(LetterDecoderEncoder decoderEncoder) {
+        this.decoderEncoder = decoderEncoder;
+    }
+
+    public WordCard transform(WordCard wordCard) { // mutates the card and its forms in place; never returns null
+        wordCard.setBase(cleanString(wordCard.getBase()));
+        wordCard.setCanonicalForm(cleanString(wordCard.getCanonicalForm()));
+        wordCard.setCanonicalSuffix(cleanString(wordCard.getCanonicalSuffix()));
+        List<FlexiaModel> models = wordCard.getWordsForms();
+        for (FlexiaModel m : models) { // NOTE(review): these FlexiaModel objects look shared between cards (DictionaryReader.buildForm adds them straight from wordsFlexias) - confirm repeated cleaning is harmless
+            m.setSuffix(cleanString(m.getSuffix()));
+            m.setPrefix(cleanString(m.getPrefix()));
+            // keep only the first two characters of the code; substring throws if the code is shorter than that
+            m.setCode(m.getCode().substring(0, 2));
+        }
+        return wordCard;
+    }
+
+
+    private String cleanString(String s) { // thin delegate to the configured LetterDecoderEncoder
+        return decoderEncoder.cleanString(s);
+    }
+}
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/EnglishHeuristicBuilder.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/EnglishHeuristicBuilder.java
@@ -16,22 +16,24 @@

 package org.apache.lucene.morphology.generator;

-import org.apache.lucene.morphology.dictionary.DictionaryReader;
-import org.apache.lucene.morphology.dictionary.GrammaReader;
-import org.apache.lucene.morphology.dictionary.StatisticsCollector;
-import org.apache.lucene.morphology.english.EnglishLetterDecoderEncoder;
+import org.apache.lucene.morphology.EnglishLetterDecoderEncoder;
+import org.apache.lucene.morphology.dictionary.*;

 import java.io.IOException;
+import java.util.Arrays;
 import java.util.HashSet;
+import java.util.List;


 public class EnglishHeuristicBuilder {
    public static void main(String[] args) throws IOException {

        GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/egramtab.tab");
-        DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>());
-
        EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
+        List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
+
+        DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>(), filters);
+
        StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
        dictionaryReader.proccess(statisticsCollector);
        statisticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info");
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianHeuristicBuilder.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianHeuristicBuilder.java
@@ -16,21 +16,23 @@

 package org.apache.lucene.morphology.generator;

-import org.apache.lucene.morphology.dictionary.DictionaryReader;
-import org.apache.lucene.morphology.dictionary.GrammaReader;
-import org.apache.lucene.morphology.dictionary.StatisticsCollector;
+import org.apache.lucene.morphology.dictionary.*;
 import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder;

 import java.io.IOException;
+import java.util.Arrays;
 import java.util.HashSet;
+import java.util.List;


 public class RussianHeuristicBuilder {
    public static void main(String[] args) throws IOException {
        GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
-        DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());
-
        RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
+        List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
+
+        DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>(), filters);
+
        StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
        dictionaryReader.proccess(statisticsCollector);
        statisticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info");
--- a/dictionary-reader/src/test/java/org/apache/lucene/TestAllWords.java
+++ b/dictionary-reader/src/test/java/org/apache/lucene/TestAllWords.java
@@ -0,0 +1,144 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene;
+
+import org.apache.lucene.morphology.*;
+import org.apache.lucene.morphology.dictionary.*;
+import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder;
+import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
+import org.apache.lucene.morphology.russian.RussianMorphology;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicLong;
+
+import static org.hamcrest.Matchers.hasItem;
+import static org.junit.Assert.assertThat;
+
+
+public class TestAllWords { // checks that every dictionary word form surviving the filters is resolved by the morphology classes
+
+    String prefix = ""; // prepended to the relative dictionary paths below
+
+    @Before
+    public void setUp() {
+        System.out.println(System.getProperty("user.dir"));
+        prefix = System.getProperty("user.dir").endsWith("dictionary-reader") ? "../" : ""; // step up one directory when run from inside the dictionary-reader module
+
+    }
+
+    @Test
+    public void shouldEnglishMorphologyIncludeAllWordsFormsWithMorphInfo() throws IOException {
+        final Morphology morphology = new EnglishMorphology();
+        LetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
+        String pathToGramma = prefix + "dictonary/Dicts/Morph/egramtab.tab";
+        String pathToDict = prefix + "dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd";
+
+        testFullGramma(morphology, decoderEncoder, pathToGramma, pathToDict);
+
+    }
+
+    @Test
+    public void shouldRussianMorphologyIncludeAllWordsFormsWithMorphInfo() throws IOException {
+        final Morphology morphology = new RussianMorphology();
+        LetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
+        String pathToGramma = prefix + "dictonary/Dicts/Morph/rgramtab.tab";
+        String pathToDict = prefix + "dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd";
+
+        testFullGramma(morphology, decoderEncoder, pathToGramma, pathToDict);
+    }
+
+    private void testFullGramma(final Morphology morphology, LetterDecoderEncoder decoderEncoder, String pathToGramma, String pathToDict) throws IOException { // shared driver: asserts morph info and normal form for every form of every card
+        GrammaReader grammaInfo = new GrammaReader(pathToGramma);
+        final List<String> morphInfo = grammaInfo.getGrammaInfo();
+        final Map<String, Integer> inversIndex = grammaInfo.getGrammInversIndex();
+
+        List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder)); // same filter chain the heuristic builders pass to DictionaryReader
+
+
+        DictionaryReader dictionaryReader = new DictionaryReader(pathToDict, new HashSet<String>(), filters);
+
+        final AtomicLong wordCount = new AtomicLong(0);
+        Long startTime = System.currentTimeMillis();
+
+        dictionaryReader.proccess(new WordProccessor() {
+            public void process(WordCard wordCard) throws IOException {
+                String word = wordCard.getBase() + wordCard.getCanonicalSuffix(); // expected normal form
+                for (FlexiaModel fm : wordCard.getWordsForms()) {
+                    String wordForm = wordCard.getBase() + fm.getSuffix(); // NOTE(review): the form prefix is not included here - confirm that is intended
+                    String morph = morphInfo.get(inversIndex.get(fm.getCode())); // form code -> index -> morph info string
+                    assertThat(morphology.getMorphInfo(wordForm), hasItem(word + "|" + morph));
+                    assertThat(morphology.getNormalForms(wordForm), hasItem(word));
+                    wordCount.set(2L + wordCount.get()); // two lookups are counted per form
+                }
+            }
+        });
+
+        long time = System.currentTimeMillis() - startTime;
+        System.out.println("Done " + wordCount.get() + " in " + time + " ms. " + wordCount.get() / (time / 1000L) + " word per second"); // NOTE(review): time / 1000L is 0 for runs under one second, which makes this division throw ArithmeticException
+    }
+
+    @Test
+    public void shouldEnglishLuceneMorphologyIncludeAllWords() throws IOException {
+        final LuceneMorphology morphology = new EnglishLuceneMorphology();
+
+        LetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
+        List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
+        String pathToDic = prefix + "dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd";
+
+        testAllWordForLucene(morphology, filters, pathToDic);
+    }
+
+    @Test
+    public void shouldIncludeAllWordsRussianInLuceneMorophology() throws IOException {
+        final LuceneMorphology morphology = new RussianLuceneMorphology();
+
+        LetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
+        List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
+
+        String pathToDic = prefix + "dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd";
+
+        testAllWordForLucene(morphology, filters, pathToDic);
+
+    }
+
+    private void testAllWordForLucene(final LuceneMorphology morphology, List<WordFilter> filters, String pathToDic) throws IOException { // shared driver: asserts only the normal form for every form of every card
+        final AtomicLong wordCount = new AtomicLong(0);
+        Long startTime = System.currentTimeMillis();
+
+        DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet<String>(), filters);
+        dictionaryReader.proccess(new WordProccessor() {
+            public void process(WordCard wordCard) throws IOException {
+                String word = wordCard.getBase() + wordCard.getCanonicalSuffix(); // expected normal form
+                for (FlexiaModel fm : wordCard.getWordsForms()) {
+                    String wordForm = wordCard.getBase() + fm.getSuffix();
+                    assertThat(morphology.getNormalForms(wordForm), hasItem(word));
+                    wordCount.set(1L + wordCount.get()); // one lookup is counted per form
+                }
+            }
+        });
+
+        long time = System.currentTimeMillis() - startTime;
+        System.out.println("Done " + wordCount.get() + " in " + time + " ms. " + wordCount.get() / (time / 1000L) + " word per second"); // NOTE(review): time / 1000L is 0 for runs under one second, which makes this division throw ArithmeticException
+    }
+
+
+}
--- a/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalayzersTest.java
+++ b/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalayzersTest.java
@@ -0,0 +1,77 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.morphology;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.morphology.russian.RussianAnalyzer;
+import org.junit.Test;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.Arrays;
+import java.util.HashSet;
+
+import static org.hamcrest.Matchers.equalTo;
+import static org.junit.Assert.assertThat;
+
+
+public class AnalayzersTest { // compares the set of terms each analyzer emits for a data file with the set listed in an answer file
+
+    @Test
+    public void englishAnalyzerShouldGiveCorrectWords() throws IOException {
+        Analyzer morphlogyAnalyzer = new EnglishAnalyzer();
+        String answerPath = "/english/englsih-analayzer-answer.txt";
+        String testPath = "/english/englsih-analayzer-data.txt";
+
+        testAnalayzer(morphlogyAnalyzer, answerPath, testPath);
+    }
+
+    @Test
+    public void shoudGiveCorretWords() throws IOException {
+        Analyzer morphlogyAnalyzer = new RussianAnalyzer();
+        String answerPath = "/russian/russian-analayzer-answer.txt";
+        String testPath = "/russian/russian-analayzer-data.txt";
+
+        testAnalayzer(morphlogyAnalyzer, answerPath, testPath);
+    }
+
+    private void testAnalayzer(Analyzer morphlogyAnalyzer, String answerPath, String testPath) throws IOException { // shared driver; both paths are classpath resources
+        InputStream stream = this.getClass().getResourceAsStream(answerPath); // getResourceAsStream returns null when the resource is missing
+        BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
+        String[] strings = breader.readLine().replaceAll(" +", " ").trim().split(" "); // only the first line is read; runs of spaces are collapsed before splitting
+        HashSet<String> answer = new HashSet<String>(Arrays.asList(strings));
+        stream.close(); // NOTE(review): skipped if anything above throws, there is no try/finally
+
+        stream = this.getClass().getResourceAsStream(testPath);
+
+        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
+
+        TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader); // null is passed as the field name
+        HashSet<String> result = new HashSet<String>();
+        while (tokenStream.incrementToken()) {
+            TermAttribute attribute1 = tokenStream.getAttribute(TermAttribute.class);
+            result.add(attribute1.term());
+        }
+
+        stream.close();
+
+        assertThat(result, equalTo(answer)); // set comparison: term order and duplicates are ignored
+    }
+}
--- a/dictionary-reader/src/test/java/org/apache/lucene/morphology/LuceneMorphTest.java
+++ b/dictionary-reader/src/test/java/org/apache/lucene/morphology/LuceneMorphTest.java
@@ -0,0 +1,62 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.morphology;
+
+import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
+import org.junit.Test;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.hamcrest.CoreMatchers.equalTo;
+import static org.junit.Assert.assertThat;
+
+
+public class LuceneMorphTest { // checks getNormalForms against hand-written expectation files, one case per line
+
+    @Test
+    public void englishMorphologyShouldGetCorrectNormalForm() throws IOException {
+        LuceneMorphology luceneMorph = new EnglishLuceneMorphology();
+        String pathToTestData = "/english/english-morphology-test.txt";
+        testMorphology(luceneMorph, pathToTestData);
+    }
+
+    @Test
+    public void russianMorphologyShouldGetCorrectNormalForm() throws IOException {
+        LuceneMorphology luceneMorph = new RussianLuceneMorphology();
+        String pathToTestData = "/russian/russian-morphology-test.txt";
+        testMorphology(luceneMorph, pathToTestData);
+    }
+
+    private void testMorphology(LuceneMorphology luceneMorph, String pathToTestData) throws IOException { // shared driver; the path is a classpath resource
+        InputStream stream = this.getClass().getResourceAsStream(pathToTestData); // NOTE(review): neither the stream nor the reader is ever closed
+        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
+        String s = bufferedReader.readLine();
+        while (s != null) {
+            String[] qa = s.trim().split(" "); // first token is the queried word, the remaining tokens are the expected normal forms
+            Set<String> result = new HashSet<String>();
+            result.addAll(Arrays.asList(qa).subList(1, qa.length));
+            Set<String> stringList = new HashSet<String>(luceneMorph.getNormalForms(qa[0]));
+            assertThat(stringList, equalTo(result)); // exact set match: extra or missing normal forms both fail
+            s = bufferedReader.readLine();
+        }
+    }
+}
--- a/dictionary-reader/src/test/resources/english/english-morphology-test.txt
+++ b/dictionary-reader/src/test/resources/english/english-morphology-test.txt
@@ -0,0 +1,8 @@
+purchases purchase
+existing exist
+was be
+men man
+bore bore bear
+grown grow grown
+came come
+md md
--- a/dictionary-reader/src/test/resources/english/englsih-analayzer-answer.txt
+++ b/dictionary-reader/src/test/resources/english/englsih-analayzer-answer.txt
@@ -0,0 +1 @@
+following follow the instruction exactly will be help ensure the best well good result
--- a/dictionary-reader/src/test/resources/english/englsih-analayzer-data.txt
+++ b/dictionary-reader/src/test/resources/english/englsih-analayzer-data.txt
@@ -0,0 +1 @@
+Following the instructions exactly will help ensure the best results
--- a/dictionary-reader/src/test/resources/russian/russian-analayzer-answer.txt
+++ b/dictionary-reader/src/test/resources/russian/russian-analayzer-answer.txt
@@ -0,0 +1 @@
+в результат крушение погибнуть командир отряд специальный назначение пря при переть гувд ростовский область полковник милиция михаил перов и предприниматель
--- a/dictionary-reader/src/test/resources/russian/russian-analayzer-data.txt
+++ b/dictionary-reader/src/test/resources/russian/russian-analayzer-data.txt
@@ -0,0 +1 @@
+В результате крушения погибли командир отряда специального назначения при ГУВД Ростовской области полковник милиции Михаил Перов и предприниматель
--- a/dictionary-reader/src/test/resources/russian/russian-morphology-test.txt
+++ b/dictionary-reader/src/test/resources/russian/russian-morphology-test.txt
@@ -0,0 +1,19 @@
+еду еда ехать
+тестов тест
+вина вино вина
+вино вино
+ехать ехать
+ананасов ананас ананасовый
+сухой сухой
+дураков дурак
+пушка пушка пушок
+пушок пушок
+пушек пушка
+козлов козлов козловый козел
+жуков жуков жук
+красив красить красивый
+красивая красивый
+тосклив тоскливый
+лучший хороший
+на на
+тест тест тесто
--- a/english/pom.xml
+++ b/english/pom.xml
@@ -1,5 +1,6 @@
 <?xml version="1.0"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <parent>
        <artifactId>morphology</artifactId>
        <groupId>org.apache.lucene.morphology</groupId>
@@ -12,6 +13,7 @@
    <version>0.9-SNAPSHOT</version>
    <url>http://maven.apache.org</url>
    <dependencies>
+
        <dependency>
            <groupId>org.apache.lucene.morphology</groupId>
            <artifactId>morph</artifactId>
--- a/english/src/main/java/org/apache/lucene/morphology/EnglishAnalyzer.java
+++ b/english/src/main/java/org/apache/lucene/morphology/EnglishAnalyzer.java
@@ -0,0 +1,29 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.morphology;
+
+import org.apache.lucene.morphology.analyzer.MorphologyAnalyzer;
+
+import java.io.IOException;
+
+/** {@link MorphologyAnalyzer} that is always backed by an {@link EnglishLuceneMorphology}. */
+public class EnglishAnalyzer extends MorphologyAnalyzer {
+    /** Creates a new {@link EnglishLuceneMorphology} and hands it to the superclass; an IOException raised while doing so is propagated. */
+    public EnglishAnalyzer() throws IOException {
+        super(new EnglishLuceneMorphology());
+    }
+
+}
--- a/english/src/main/java/org/apache/lucene/morphology/EnglishLetterDecoderEncoder.java
+++ b/english/src/main/java/org/apache/lucene/morphology/EnglishLetterDecoderEncoder.java
@@ -0,0 +1,158 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.morphology;
+
+import java.util.ArrayList;
+
+
+/**
+ * {@link LetterDecoderEncoder} for words made of the small letters 'a'..'z' and '-'.
+ * <p>
+ * A part of at most six characters is packed into one int as a six-digit base-28
+ * number: digit 0 is padding, digits 1..26 are 'a'..'z' and digit 27 is '-'.
+ * Parts shorter than six characters are padded with zero digits on the right.
+ * The largest code is 28^6 - 1, which fits into a positive int.
+ */
+public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder {
+    /** Subtracted from a letter's char value to get its digit ('a' - 96 == 1). */
+    public static final int ENGLISH_SMALL_LETTER_OFFSET = 96;
+    /** Not read by this class; declaration kept unchanged for compatibility. */
+    static public int SUFFIX_LENGTH = 6;
+    /** Char value of '-'. */
+    public static final int DASH_CHAR = 45;
+    /** Digit that represents '-'. */
+    public static final int DASH_CODE = 27;
+
+    /** Maximum number of characters packed into one int. */
+    private static final int WORD_PART_LENGTH = 6;
+    /** Number of distinct digits: padding, 26 letters and the dash. */
+    private static final int RADIX = 28;
+
+    /**
+     * Packs a string of at most six characters into one int.
+     *
+     * @param string text made only of 'a'..'z' and '-'
+     * @return the base-28 code, right-padded with zero digits to six digits
+     * @throws SuffixToLongException  if the string is longer than six characters
+     * @throws WrongCharaterException if a character is neither 'a'..'z' nor '-'
+     */
+    public Integer encode(String string) {
+        if (string.length() > WORD_PART_LENGTH)
+            throw new SuffixToLongException("Suffix length should not be greater than " + WORD_PART_LENGTH);
+        int result = 0;
+        for (int i = 0; i < string.length(); i++) {
+            char letter = string.charAt(i);
+            // '`' (digit 0) and '{' (digit 27) sit next to the letters and must not
+            // slip through: 0 is the padding digit and 27 is reserved for the dash.
+            if (!isEncodable(letter))
+                throw new WrongCharaterException("Symbol " + letter + " is not a small English letter or dash");
+            int digit = letter == DASH_CHAR ? DASH_CODE : letter - ENGLISH_SMALL_LETTER_OFFSET;
+            result = result * RADIX + digit;
+        }
+        for (int i = string.length(); i < WORD_PART_LENGTH; i++) {
+            result *= RADIX;
+        }
+        return result;
+    }
+
+    /**
+     * Splits a string into six-character parts and encodes each of them.
+     *
+     * @param s text of any length made only of 'a'..'z' and '-'
+     * @return one code per part; an empty string yields the single code 0
+     */
+    public int[] encodeToArray(String s) {
+        ArrayList<Integer> parts = new ArrayList<Integer>();
+        while (s.length() > WORD_PART_LENGTH) {
+            parts.add(encode(s.substring(0, WORD_PART_LENGTH)));
+            s = s.substring(WORD_PART_LENGTH);
+        }
+        parts.add(encode(s));
+        int[] result = new int[parts.size()];
+        for (int i = 0; i < result.length; i++) {
+            result[i] = parts.get(i);
+        }
+        return result;
+    }
+
+    /**
+     * Decodes every code of the array and concatenates the results in order.
+     *
+     * @param array codes as produced by {@link #encodeToArray(String)}
+     * @return the concatenated text
+     */
+    public String decodeArray(int[] array) {
+        StringBuilder result = new StringBuilder(array.length * WORD_PART_LENGTH);
+        for (int code : array) {
+            result.append(decode(code));
+        }
+        return result.toString();
+    }
+
+    /**
+     * Unpacks a code produced by {@link #encode(String)}.
+     *
+     * @param suffixN the code to decode; other values are not validated
+     * @return the decoded text without padding; 0 decodes to the empty string
+     */
+    public String decode(Integer suffixN) {
+        int rest = suffixN;
+        // Digits come out lowest first, i.e. last character first.
+        StringBuilder reversed = new StringBuilder(WORD_PART_LENGTH);
+        while (rest > 0) {
+            int digit = rest % RADIX;
+            rest /= RADIX;
+            if (digit == 0) continue; // padding
+            reversed.append(digit == DASH_CODE ? (char) DASH_CHAR : (char) (digit + ENGLISH_SMALL_LETTER_OFFSET));
+        }
+        return reversed.reverse().toString();
+    }
+
+    /**
+     * @param c character to test
+     * @return true if the character is 'a'..'z' or '-'
+     */
+    public boolean checkCharacter(char c) {
+        return isEncodable(c);
+    }
+
+    /**
+     * @param word text to test
+     * @return true if every character passes {@link #checkCharacter(char)}; true for an empty string
+     */
+    public boolean checkString(String word) {
+        for (int i = 0; i < word.length(); i++) {
+            if (!checkCharacter(word.charAt(i))) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * @param s text to clean
+     * @return the argument unchanged
+     */
+    public String cleanString(String s) {
+        return s;
+    }
+
+    /** Tells whether the character has a digit of its own: 'a'..'z' or '-'. */
+    private static boolean isEncodable(char c) {
+        return c == DASH_CHAR || (c >= 'a' && c <= 'z');
+    }
+
+}
--- a/english/src/main/java/org/apache/lucene/morphology/EnglishLuceneMorphology.java
+++ b/english/src/main/java/org/apache/lucene/morphology/EnglishLuceneMorphology.java
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.morphology;
+
+import java.io.IOException;
+
+/** {@link LuceneMorphology} built from the classpath resource {@code /org/apache/lucene/morphology/english/morph.info}. */
+public class EnglishLuceneMorphology extends LuceneMorphology {
+    /** Passes the resource stream and a new {@link EnglishLetterDecoderEncoder} to the superclass. NOTE(review): getResourceAsStream returns null when the resource is absent; confirm the superclass copes with that. */
+    public EnglishLuceneMorphology() throws IOException {
+        super(EnglishLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder());
+    }
+}
--- a/english/src/main/java/org/apache/lucene/morphology/EnglishMorphology.java
+++ b/english/src/main/java/org/apache/lucene/morphology/EnglishMorphology.java
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.morphology;
+
+import java.io.IOException;
+
+/** {@link MorphologyImpl} built from the classpath resource {@code /org/apache/lucene/morphology/english/morph.info}. */
+public class EnglishMorphology extends MorphologyImpl {
+    /** Passes the resource stream and a new {@link EnglishLetterDecoderEncoder} to the superclass. NOTE(review): the resource is resolved through EnglishLuceneMorphology.class, not this class; presumably equivalent because both live in the same module, confirm. */
+    public EnglishMorphology() throws IOException {
+        super(EnglishLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder());
+    }
+}
--- a/russian/pom.xml
+++ b/russian/pom.xml
@@ -1,5 +1,6 @@
 <?xml version="1.0"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <parent>
        <artifactId>morphology</artifactId>
        <groupId>org.apache.lucene.morphology</groupId>
@@ -13,6 +14,7 @@
    <url>http://maven.apache.org</url>
    <dependencies>

+
        <dependency>
            <groupId>org.apache.lucene.morphology</groupId>
            <artifactId>morph</artifactId>
--- a/russian/src/test/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoderTest.java
+++ b/russian/src/test/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoderTest.java
@@ -17,8 +17,6 @@ package org.apache.lucene.morphology.russian;

 import org.apache.lucene.morphology.SuffixToLongException;
 import org.apache.lucene.morphology.WrongCharaterException;
-import static org.hamcrest.core.IsEqual.equalTo;
-import static org.junit.Assert.assertThat;
 import org.junit.Before;
 import org.junit.Test;

@@ -27,6 +25,9 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;

+import static org.hamcrest.core.IsEqual.equalTo;
+import static org.junit.Assert.assertThat;
+
 public class RussianLetterDecoderEncoderTest {
    private RussianLetterDecoderEncoder decoderEncoder;

@@ -37,7 +38,7 @@ public class RussianLetterDecoderEncoderTest {


    @Test
-    public void testShouldPreserStringComporision() throws IOException {
+    public void testShouldPreserverStringComporision() throws IOException {
        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-monotonic.txt");
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
        String s = bufferedReader.readLine();
@@ -52,22 +53,22 @@ public class RussianLetterDecoderEncoderTest {


    @Test
-    public void testShouldCorretDecodeEncode() throws IOException {
+    public void testShouldCorrectDecodeEncode() throws IOException {
        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data.txt");
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
        String s = bufferedReader.readLine();
        while (s != null) {
            String[] qa = s.trim().split(" ");
            if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) {
-                Integer ecodedSuffix = decoderEncoder.encode(qa[0]);
-                assertThat(decoderEncoder.decode(ecodedSuffix), equalTo(qa[1]));
+                Integer encodedSuffix = decoderEncoder.encode(qa[0]);
+                assertThat(decoderEncoder.decode(encodedSuffix), equalTo(qa[1]));
            }
            s = bufferedReader.readLine();
        }
    }

    @Test
-    public void testShouldCorretDecodeEncodeStringToArray() throws IOException {
+    public void testShouldCorrectDecodeEncodeStringToArray() throws IOException {
        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data-for-array.txt");
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
        String s = bufferedReader.readLine();
@@ -85,7 +86,7 @@ public class RussianLetterDecoderEncoderTest {
    }

    @Test(expected = WrongCharaterException.class)
-    public void shouldThrownExeptionIfSuffixContainWrongCharater() {
+    public void shouldThrownExceptionIfSuffixContainWrongCharater() {
        decoderEncoder.encode("1");
    }
 }
				`@@ -0,0 +1 @@`
				`following follow the instruction exactly will be help ensure the best well good result`
				`@@ -0,0 +1 @@`
				`Following the instructions exactly will help ensure the best results`
				`@@ -0,0 +1 @@`
				`в результат крушение погибнуть командир отряд специальный назначение пря при переть гувд ростовский область полковник милиция михаил перов и предприниматель`
				`@@ -0,0 +1 @@`
				`В результате крушения погибли командир отряда специального назначения при ГУВД Ростовской области полковник милиции Михаил Перов и предприниматель`