adding test for all words, refactors test and dictionary reading

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@99 d817d54c-26ab-11de-abc9-2f7d1455ff7a
2010-10-08 11:35:13 +00:00
parent e8399999c3
commit 76e68a11e0
26 changed files with 730 additions and 48 deletions
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictionaryReader.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictionaryReader.java
@ -34,16 +34,19 @@ public class DictionaryReader {
    private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
    private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
    private Set<String> ignoredForm = new HashSet<String>();
+    private List<WordFilter> filters = new ArrayList<WordFilter>();

-    public DictionaryReader(String fileName, Set<String> ignoredForm) {
+    public DictionaryReader(String fileName, Set<String> ignoredForm, List<WordFilter> filters) {
        this.fileName = fileName;
        this.ignoredForm = ignoredForm;
+        this.filters = filters;
    }

-    public DictionaryReader(String fileName, String fileEncoding, Set<String> ignoredForm) {
+    public DictionaryReader(String fileName, String fileEncoding, Set<String> ignoredForm, List<WordFilter> filters) {
        this.fileName = fileName;
        this.fileEncoding = fileEncoding;
        this.ignoredForm = ignoredForm;
+        this.filters = filters;
    }


@ -60,30 +63,46 @@ public class DictionaryReader {
    private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
        String s = reader.readLine();
        int count = Integer.valueOf(s);
+        int actual = 0;
        for (int i = 0; i < count; i++) {
            s = reader.readLine();
            if (i % 10000 == 0) System.out.println("Proccess " + i + " wordBase of " + count);

-            String[] wd = s.split(" ");
-            String wordBase = wd[0].toLowerCase();
-            if (wordBase.startsWith("-")) continue;
-            wordBase = "#".equals(wordBase) ? "" : wordBase;
-            List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1]));
-            FlexiaModel flexiaModel = models.get(0);
-            if (models.size() > 0 && !ignoredForm.contains(flexiaModel.getCode())) {
-
-                WordCard card = new WordCard(flexiaModel.create(wordBase), wordBase, flexiaModel.getSuffix());
-                for (FlexiaModel fm : models) {
-                    card.addFlexia(fm);
-                }
-//                if(card.getBase().equals("face") || card.getBase().equals("fac")){
-//                    System.out.println(models);
-//                    System.out.println(card);
-                    wordProccessor.process(card);
-                //}
+            WordCard card = buildForm(s);

+            for (WordFilter wf : filters) {
+                if (card == null) break;
+                card = wf.transform(card);
            }
+
+            if (card == null) {
+                continue;
+            }
+
+            wordProccessor.process(card);
+            actual++;
+
        }
+        System.out.println("Finished word processing actual words " + actual);
+    }
+
+    /**
+     * Builds a word card from one word line of the dictionary.
+     * <p>
+     * The line is split on single spaces: the first token is the word base
+     * (a lone "#" stands for an empty base) and the second token is the index
+     * of the flexia model list inside {@code wordsFlexias}.
+     *
+     * @param s dictionary line describing one word
+     * @return a card holding every flexia model of the referenced list, or
+     *         {@code null} when the base starts with "-", the referenced list
+     *         is empty, or the code of its first model is in {@code ignoredForm}
+     */
+    private WordCard buildForm(String s) {
+        String[] wd = s.split(" ");
+        String wordBase = wd[0].toLowerCase();
+        if (wordBase.startsWith("-")) return null;
+        wordBase = "#".equals(wordBase) ? "" : wordBase;
+        List<FlexiaModel> models = wordsFlexias.get(Integer.parseInt(wd[1]));
+        // Test for emptiness before reading the first element: get(0) on an
+        // empty list throws instead of letting the word be skipped.
+        if (models.isEmpty()) {
+            return null;
+        }
+        FlexiaModel flexiaModel = models.get(0);
+        if (ignoredForm.contains(flexiaModel.getCode())) {
+            return null;
+        }
+
+        // The first model of the list supplies the canonical form and suffix.
+        WordCard card = new WordCard(flexiaModel.create(wordBase), wordBase, flexiaModel.getSuffix());
+
+        for (FlexiaModel fm : models) {
+            card.addFlexia(fm);
+        }
+        return card;
    }


@ -122,7 +141,7 @@ public class DictionaryReader {
        String[] fl = line.split("\\*");
        // we inored all forms thats
        if (fl.length == 3) {
-            System.out.println(line);
+            //System.out.println(line);
            // flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), cleanString(fl[2].toLowerCase())));
        }
        if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/FlexiaModel.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/FlexiaModel.java
@ -66,4 +66,26 @@ public class FlexiaModel {
                ", prefix='" + prefix + '\'' +
                '}';
    }
+
+    /**
+     * Compares this model with another object. Two models are equal when they
+     * have exactly the same class and their code, prefix and suffix are
+     * pairwise equal; a {@code null} field only matches a {@code null} field.
+     *
+     * @param o object to compare with
+     * @return {@code true} when {@code o} is an equal flexia model
+     */
+    @Override
+    public boolean equals(Object o) {
+        if (o == this) {
+            return true;
+        }
+        if (o == null || o.getClass() != getClass()) {
+            return false;
+        }
+
+        FlexiaModel other = (FlexiaModel) o;
+
+        return (code == null ? other.code == null : code.equals(other.code))
+                && (prefix == null ? other.prefix == null : prefix.equals(other.prefix))
+                && (suffix == null ? other.suffix == null : suffix.equals(other.suffix));
+    }
+
+    /**
+     * Computes a hash from code, suffix and prefix (in that order) using the
+     * multiplier 31; a {@code null} field contributes 0.
+     *
+     * @return hash code of this model
+     */
+    @Override
+    public int hashCode() {
+        int codeHash = code == null ? 0 : code.hashCode();
+        int suffixHash = suffix == null ? 0 : suffix.hashCode();
+        int prefixHash = prefix == null ? 0 : prefix.hashCode();
+        return 31 * (31 * codeHash + suffixHash) + prefixHash;
+    }
 }
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/StatisticsCollector.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/StatisticsCollector.java
@ -42,12 +42,8 @@ public class StatisticsCollector implements WordProccessor {
    public void process(WordCard wordCard) throws IOException {
        cleanWordCard(wordCard);
        String normalStringMorph = wordCard.getWordsForms().get(0).getCode();
-        String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
-        if (word.contains("-")) return;
-        if (!decoderEncoder.checkString(word)) return;

        for (FlexiaModel fm : wordCard.getWordsForms()) {
-            if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) continue;
            Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph);
            String form = revertWord(fm.create(wordCard.getBase()));
            Set<Heuristic> suffixHeuristics = inverseIndex.get(form);
@ -138,8 +134,8 @@ public class StatisticsCollector implements WordProccessor {
        Integer length = getCommonLength(form, normalForm);
        Integer actualSuffixLengh = form.length() - length;
        String actualNormalSuffix = normalForm.substring(length);
-        Integer integer = grammaReader.getGrammInversIndex().get(fm.getCode().substring(0, 2));
-        Integer nf = grammaReader.getGrammInversIndex().get(normalSuffixForm.substring(0, 2));
+        Integer integer = grammaReader.getGrammInversIndex().get(fm.getCode());
+        Integer nf = grammaReader.getGrammInversIndex().get(normalSuffixForm);
        return new Heuristic((byte) actualSuffixLengh.intValue(), actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue());
    }

--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCard.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCard.java
@ -38,6 +38,10 @@ public class WordCard {
        wordsForms.add(flexiaModel);
    }

+    /**
+     * Removes the given flexia model from the word forms of this card by
+     * delegating to {@code wordsForms.remove}.
+     *
+     * @param flexiaModel the model to remove
+     */
+    public void removeFlexia(FlexiaModel flexiaModel) {
+        this.wordsForms.remove(flexiaModel);
+    }
+
    public String getCanonicalForm() {
        return canonicalForm;
    }
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCleaner.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCleaner.java
@ -0,0 +1,50 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.morphology.dictionary;
+
+import org.apache.lucene.morphology.LetterDecoderEncoder;
+
+import java.util.LinkedList;
+import java.util.List;
+
+
+public class WordCleaner implements WordFilter {
+
+    private LetterDecoderEncoder decoderEncoder;
+
+    public WordCleaner(LetterDecoderEncoder decoderEncoder) {
+        this.decoderEncoder = decoderEncoder;
+    }
+
+    public WordCard transform(WordCard wordCard) {
+        String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
+
+        if (word.contains("-")) return null;
+        if (!decoderEncoder.checkString(word)) return null;
+
+        List<FlexiaModel> flexiaModelsToRemove = new LinkedList<FlexiaModel>();
+        for (FlexiaModel fm : wordCard.getWordsForms()) {
+            if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) {
+                flexiaModelsToRemove.add(fm);
+            }
+        }
+        for (FlexiaModel fm : flexiaModelsToRemove) {
+            wordCard.removeFlexia(fm);
+        }
+
+        return wordCard;
+    }
+}
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordFilter.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordFilter.java
@ -0,0 +1,24 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.morphology.dictionary;
+
+
+/**
+ * A transformation step applied to a {@link WordCard} while the dictionary is
+ * read. {@code DictionaryReader} runs its filters in list order on every card
+ * it builds.
+ */
+public interface WordFilter {
+
+    /**
+     * Transforms or rejects a word card.
+     *
+     * @param wordCard card to process
+     * @return the card to hand on, or {@code null} to drop the word; the
+     *         reader stops filtering and skips the card on {@code null}
+     */
+    WordCard transform(WordCard wordCard);
+
+}
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordStringCleaner.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordStringCleaner.java
@ -0,0 +1,49 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.morphology.dictionary;
+
+import org.apache.lucene.morphology.LetterDecoderEncoder;
+
+import java.util.List;
+
+
+public class WordStringCleaner implements WordFilter {
+
+    private LetterDecoderEncoder decoderEncoder;
+
+    public WordStringCleaner(LetterDecoderEncoder decoderEncoder) {
+        this.decoderEncoder = decoderEncoder;
+    }
+
+    public WordCard transform(WordCard wordCard) {
+        wordCard.setBase(cleanString(wordCard.getBase()));
+        wordCard.setCanonicalForm(cleanString(wordCard.getCanonicalForm()));
+        wordCard.setCanonicalSuffix(cleanString(wordCard.getCanonicalSuffix()));
+        List<FlexiaModel> models = wordCard.getWordsForms();
+        for (FlexiaModel m : models) {
+            m.setSuffix(cleanString(m.getSuffix()));
+            m.setPrefix(cleanString(m.getPrefix()));
+            //made correct code
+            m.setCode(m.getCode().substring(0, 2));
+        }
+        return wordCard;
+    }
+
+
+    private String cleanString(String s) {
+        return decoderEncoder.cleanString(s);
+    }
+}
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/EnglishHeuristicBuilder.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/EnglishHeuristicBuilder.java
@ -16,22 +16,24 @@

 package org.apache.lucene.morphology.generator;

-import org.apache.lucene.morphology.dictionary.DictionaryReader;
-import org.apache.lucene.morphology.dictionary.GrammaReader;
-import org.apache.lucene.morphology.dictionary.StatisticsCollector;
-import org.apache.lucene.morphology.english.EnglishLetterDecoderEncoder;
+import org.apache.lucene.morphology.EnglishLetterDecoderEncoder;
+import org.apache.lucene.morphology.dictionary.*;

 import java.io.IOException;
+import java.util.Arrays;
 import java.util.HashSet;
+import java.util.List;


 public class EnglishHeuristicBuilder {
    public static void main(String[] args) throws IOException {

        GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/egramtab.tab");
-        DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>());
-
        EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
+        List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
+
+        DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>(), filters);
+
        StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
        dictionaryReader.proccess(statisticsCollector);
        statisticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info");
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianHeuristicBuilder.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianHeuristicBuilder.java
@ -16,21 +16,23 @@

 package org.apache.lucene.morphology.generator;

-import org.apache.lucene.morphology.dictionary.DictionaryReader;
-import org.apache.lucene.morphology.dictionary.GrammaReader;
-import org.apache.lucene.morphology.dictionary.StatisticsCollector;
+import org.apache.lucene.morphology.dictionary.*;
 import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder;

 import java.io.IOException;
+import java.util.Arrays;
 import java.util.HashSet;
+import java.util.List;


 public class RussianHeuristicBuilder {
    public static void main(String[] args) throws IOException {
        GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
-        DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());
-
        RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
+        List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
+
+        DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>(), filters);
+
        StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
        dictionaryReader.proccess(statisticsCollector);
        statisticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info");