Fix problem with '-' in words: the English morphology did not work correctly because some word forms contain it

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@97 d817d54c-26ab-11de-abc9-2f7d1455ff7a
2010-05-29 15:54:44 +00:00
parent 3ca9eb2cb9
commit 7bf8ef7d6f
12 changed files with 6393 additions and 6381 deletions
@@ -71,11 +71,17 @@ public class DictonaryReader {
            List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1]));
            FlexiaModel flexiaModel = models.get(0);
            if (models.size() > 0 && !ingnoredForm.contains(flexiaModel.getCode())) {
+
                WordCard card = new WordCard(flexiaModel.create(wordBase), wordBase, flexiaModel.getSuffix());
                for (FlexiaModel fm : models) {
                    card.addFlexia(fm);
                }
-                wordProccessor.proccess(card);
+//                if(card.getBase().equals("face") || card.getBase().equals("fac")){
+//                    System.out.println(models);
+//                    System.out.println(card);
+                    wordProccessor.process(card);
+                //}
+
            }
        }
    }
@@ -17,7 +17,7 @@
 package org.apache.lucene.morphology.dictionary;

 /**
- * Represent inofrmation of how word form created form it imutible part.
+ * Represents information about how a word form is created from its immutable part.
 */
 public class FlexiaModel {
    private String code;
@@ -60,6 +60,10 @@ public class FlexiaModel {

    @Override
    public String toString() {
-        return prefix + " " + suffix;
+        return "FlexiaModel{" +
+                "code='" + code + '\'' +
+                ", suffix='" + suffix + '\'' +
+                ", prefix='" + prefix + '\'' +
+                '}';
    }
 }
@@ -26,20 +26,20 @@ import java.util.*;


 //todo made refactoring this class
-public class StatiticsCollector implements WordProccessor {
-    private TreeMap<String, Set<Heuristic>> inversIndex = new TreeMap<String, Set<Heuristic>>();
-    private Map<Set<Heuristic>, Integer> ruleInverIndex = new HashMap<Set<Heuristic>, Integer>();
+public class StatisticsCollector implements WordProccessor {
+    private TreeMap<String, Set<Heuristic>> inverseIndex = new TreeMap<String, Set<Heuristic>>();
+    private Map<Set<Heuristic>, Integer> ruleInverseIndex = new HashMap<Set<Heuristic>, Integer>();
    private List<Set<Heuristic>> rules = new ArrayList<Set<Heuristic>>();
    private GrammaReader grammaReader;
    private LetterDecoderEncoder decoderEncoder;


-    public StatiticsCollector(GrammaReader grammaReader, LetterDecoderEncoder decoderEncoder) {
+    public StatisticsCollector(GrammaReader grammaReader, LetterDecoderEncoder decoderEncoder) {
        this.grammaReader = grammaReader;
        this.decoderEncoder = decoderEncoder;
    }

-    public void proccess(WordCard wordCard) throws IOException {
+    public void process(WordCard wordCard) throws IOException {
        cleanWordCard(wordCard);
        String normalStringMorph = wordCard.getWordsFroms().get(0).getCode();
        String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
@@ -47,13 +47,13 @@ public class StatiticsCollector implements WordProccessor {
        if (!decoderEncoder.checkString(word)) return;

        for (FlexiaModel fm : wordCard.getWordsFroms()) {
-            if (!decoderEncoder.checkString(fm.create(wordCard.getBase()))) continue;
+            if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) continue;
            Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph);
            String form = revertWord(fm.create(wordCard.getBase()));
-            Set<Heuristic> suffixHeuristics = inversIndex.get(form);
+            Set<Heuristic> suffixHeuristics = inverseIndex.get(form);
            if (suffixHeuristics == null) {
                suffixHeuristics = new HashSet<Heuristic>();
-                inversIndex.put(form, suffixHeuristics);
+                inverseIndex.put(form, suffixHeuristics);
            }
            suffixHeuristics.add(heuristic);
        }
@@ -76,24 +76,24 @@ public class StatiticsCollector implements WordProccessor {
        Map<Integer, Integer> dist = new TreeMap<Integer, Integer>();
        Set<Heuristic> prevSet = null;
        int count = 0;
-        for (String key : inversIndex.keySet()) {
-            Set<Heuristic> currentSet = inversIndex.get(key);
+        for (String key : inverseIndex.keySet()) {
+            Set<Heuristic> currentSet = inverseIndex.get(key);
            if (!currentSet.equals(prevSet)) {
                Integer d = dist.get(key.length());
                dist.put(key.length(), 1 + (d == null ? 0 : d));
                prevSet = currentSet;
                count++;
-                if (!ruleInverIndex.containsKey(currentSet)) {
-                    ruleInverIndex.put(currentSet, rules.size());
+                if (!ruleInverseIndex.containsKey(currentSet)) {
+                    ruleInverseIndex.put(currentSet, rules.size());
                    rules.add(currentSet);
                }
            }
        }
        System.out.println("Word with diffirent rules " + count);
-        System.out.println("All ivers words " + inversIndex.size());
+        System.out.println("All ivers words " + inverseIndex.size());
        System.out.println(dist);
-        System.out.println("diffirent rule count " + ruleInverIndex.size());
-        Heuristic[][] heuristics = new Heuristic[ruleInverIndex.size()][];
+        System.out.println("diffirent rule count " + ruleInverseIndex.size());
+        Heuristic[][] heuristics = new Heuristic[ruleInverseIndex.size()][];
        int index = 0;
        for (Set<Heuristic> hs : rules) {
            heuristics[index] = new Heuristic[hs.size()];
@@ -109,12 +109,12 @@ public class StatiticsCollector implements WordProccessor {
        short[] rulesId = new short[count];
        count = 0;
        prevSet = null;
-        for (String key : inversIndex.keySet()) {
-            Set<Heuristic> currentSet = inversIndex.get(key);
+        for (String key : inverseIndex.keySet()) {
+            Set<Heuristic> currentSet = inverseIndex.get(key);
            if (!currentSet.equals(prevSet)) {
                int[] word = decoderEncoder.encodeToArray(key);
                ints[count] = word;
-                rulesId[count] = (short) ruleInverIndex.get(currentSet).intValue();
+                rulesId[count] = (short) ruleInverseIndex.get(currentSet).intValue();
                count++;
                prevSet = currentSet;
            }
@@ -69,4 +69,14 @@ public class WordCard {
    public void setWordsFroms(List<FlexiaModel> wordsFroms) {
        this.wordsFroms = wordsFroms;
    }
+
+    @Override
+    public String toString() {
+        return "WordCard{" +
+                "canonicalFrom='" + canonicalFrom + '\'' +
+                ", base='" + base + '\'' +
+                ", canonicalSuffix='" + canonicalSuffix + '\'' +
+                ", wordsFroms=" + wordsFroms +
+                '}';
+    }
 }
@@ -24,5 +24,5 @@ import java.io.IOException;
 */
 public interface WordProccessor {

-    public void proccess(WordCard wordCard) throws IOException;
+    public void process(WordCard wordCard) throws IOException;
 }
@@ -18,7 +18,7 @@ package org.apache.lucene.morphology.generator;

 import org.apache.lucene.morphology.dictionary.DictonaryReader;
 import org.apache.lucene.morphology.dictionary.GrammaReader;
-import org.apache.lucene.morphology.dictionary.StatiticsCollector;
+import org.apache.lucene.morphology.dictionary.StatisticsCollector;
 import org.apache.lucene.morphology.english.EnglishLetterDecoderEncoder;

 import java.io.IOException;
@@ -32,9 +32,9 @@ public class EnglishHeuristicBuilder {
        DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>());

        EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
-        StatiticsCollector statiticsCollector = new StatiticsCollector(grammaInfo, decoderEncoder);
-        dictonaryReader.proccess(statiticsCollector);
-        statiticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info");
+        StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
+        dictonaryReader.proccess(statisticsCollector);
+        statisticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info");

    }
 }
@@ -18,7 +18,7 @@ package org.apache.lucene.morphology.generator;

 import org.apache.lucene.morphology.dictionary.DictonaryReader;
 import org.apache.lucene.morphology.dictionary.GrammaReader;
-import org.apache.lucene.morphology.dictionary.StatiticsCollector;
+import org.apache.lucene.morphology.dictionary.StatisticsCollector;
 import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder;

 import java.io.IOException;
@@ -31,9 +31,9 @@ public class RussianHeuristicBuilder {
        DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());

        RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
-        StatiticsCollector statiticsCollector = new StatiticsCollector(grammaInfo, decoderEncoder);
-        dictonaryReader.proccess(statiticsCollector);
-        statiticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info");
+        StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
+        dictonaryReader.proccess(statisticsCollector);
+        statisticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info");

    }
 }