Fix a problem with "-" in words: the English morphology did not work correctly because some word forms contain a hyphen

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@97 d817d54c-26ab-11de-abc9-2f7d1455ff7a
2010-05-29 15:54:44 +00:00
parent 3ca9eb2cb9
commit 7bf8ef7d6f
12 changed files with 6393 additions and 6381 deletions
@@ -71,11 +71,17 @@ public class DictonaryReader {
            List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1]));
            FlexiaModel flexiaModel = models.get(0);
            if (models.size() > 0 && !ingnoredForm.contains(flexiaModel.getCode())) {
+
                WordCard card = new WordCard(flexiaModel.create(wordBase), wordBase, flexiaModel.getSuffix());
                for (FlexiaModel fm : models) {
                    card.addFlexia(fm);
                }
-                wordProccessor.proccess(card);
+//                if(card.getBase().equals("face") || card.getBase().equals("fac")){
+//                    System.out.println(models);
+//                    System.out.println(card);
+                    wordProccessor.process(card);
+                //}
+
            }
        }
    }
@@ -17,7 +17,7 @@
 package org.apache.lucene.morphology.dictionary;

 /**
- * Represent inofrmation of how word form created form it imutible part.
+ * Represents information on how a word form is created from its immutable part.
 */
 public class FlexiaModel {
    private String code;
@@ -60,6 +60,10 @@ public class FlexiaModel {

    @Override
    public String toString() {
-        return prefix + " " + suffix;
+        return "FlexiaModel{" +
+                "code='" + code + '\'' +
+                ", suffix='" + suffix + '\'' +
+                ", prefix='" + prefix + '\'' +
+                '}';
    }
 }
@@ -26,20 +26,20 @@ import java.util.*;


 //todo made refactoring this class
-public class StatiticsCollector implements WordProccessor {
-    private TreeMap<String, Set<Heuristic>> inversIndex = new TreeMap<String, Set<Heuristic>>();
-    private Map<Set<Heuristic>, Integer> ruleInverIndex = new HashMap<Set<Heuristic>, Integer>();
+public class StatisticsCollector implements WordProccessor {
+    private TreeMap<String, Set<Heuristic>> inverseIndex = new TreeMap<String, Set<Heuristic>>();
+    private Map<Set<Heuristic>, Integer> ruleInverseIndex = new HashMap<Set<Heuristic>, Integer>();
    private List<Set<Heuristic>> rules = new ArrayList<Set<Heuristic>>();
    private GrammaReader grammaReader;
    private LetterDecoderEncoder decoderEncoder;


-    public StatiticsCollector(GrammaReader grammaReader, LetterDecoderEncoder decoderEncoder) {
+    public StatisticsCollector(GrammaReader grammaReader, LetterDecoderEncoder decoderEncoder) {
        this.grammaReader = grammaReader;
        this.decoderEncoder = decoderEncoder;
    }

-    public void proccess(WordCard wordCard) throws IOException {
+    public void process(WordCard wordCard) throws IOException {
        cleanWordCard(wordCard);
        String normalStringMorph = wordCard.getWordsFroms().get(0).getCode();
        String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
@@ -47,13 +47,13 @@ public class StatiticsCollector implements WordProccessor {
        if (!decoderEncoder.checkString(word)) return;

        for (FlexiaModel fm : wordCard.getWordsFroms()) {
-            if (!decoderEncoder.checkString(fm.create(wordCard.getBase()))) continue;
+            if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) continue;
            Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph);
            String form = revertWord(fm.create(wordCard.getBase()));
-            Set<Heuristic> suffixHeuristics = inversIndex.get(form);
+            Set<Heuristic> suffixHeuristics = inverseIndex.get(form);
            if (suffixHeuristics == null) {
                suffixHeuristics = new HashSet<Heuristic>();
-                inversIndex.put(form, suffixHeuristics);
+                inverseIndex.put(form, suffixHeuristics);
            }
            suffixHeuristics.add(heuristic);
        }
@@ -76,24 +76,24 @@ public class StatiticsCollector implements WordProccessor {
        Map<Integer, Integer> dist = new TreeMap<Integer, Integer>();
        Set<Heuristic> prevSet = null;
        int count = 0;
-        for (String key : inversIndex.keySet()) {
-            Set<Heuristic> currentSet = inversIndex.get(key);
+        for (String key : inverseIndex.keySet()) {
+            Set<Heuristic> currentSet = inverseIndex.get(key);
            if (!currentSet.equals(prevSet)) {
                Integer d = dist.get(key.length());
                dist.put(key.length(), 1 + (d == null ? 0 : d));
                prevSet = currentSet;
                count++;
-                if (!ruleInverIndex.containsKey(currentSet)) {
-                    ruleInverIndex.put(currentSet, rules.size());
+                if (!ruleInverseIndex.containsKey(currentSet)) {
+                    ruleInverseIndex.put(currentSet, rules.size());
                    rules.add(currentSet);
                }
            }
        }
        System.out.println("Word with diffirent rules " + count);
-        System.out.println("All ivers words " + inversIndex.size());
+        System.out.println("All ivers words " + inverseIndex.size());
        System.out.println(dist);
-        System.out.println("diffirent rule count " + ruleInverIndex.size());
-        Heuristic[][] heuristics = new Heuristic[ruleInverIndex.size()][];
+        System.out.println("diffirent rule count " + ruleInverseIndex.size());
+        Heuristic[][] heuristics = new Heuristic[ruleInverseIndex.size()][];
        int index = 0;
        for (Set<Heuristic> hs : rules) {
            heuristics[index] = new Heuristic[hs.size()];
@@ -109,12 +109,12 @@ public class StatiticsCollector implements WordProccessor {
        short[] rulesId = new short[count];
        count = 0;
        prevSet = null;
-        for (String key : inversIndex.keySet()) {
-            Set<Heuristic> currentSet = inversIndex.get(key);
+        for (String key : inverseIndex.keySet()) {
+            Set<Heuristic> currentSet = inverseIndex.get(key);
            if (!currentSet.equals(prevSet)) {
                int[] word = decoderEncoder.encodeToArray(key);
                ints[count] = word;
-                rulesId[count] = (short) ruleInverIndex.get(currentSet).intValue();
+                rulesId[count] = (short) ruleInverseIndex.get(currentSet).intValue();
                count++;
                prevSet = currentSet;
            }
@@ -69,4 +69,14 @@ public class WordCard {
    public void setWordsFroms(List<FlexiaModel> wordsFroms) {
        this.wordsFroms = wordsFroms;
    }
+
+    @Override
+    public String toString() {
+        return "WordCard{" +
+                "canonicalFrom='" + canonicalFrom + '\'' +
+                ", base='" + base + '\'' +
+                ", canonicalSuffix='" + canonicalSuffix + '\'' +
+                ", wordsFroms=" + wordsFroms +
+                '}';
+    }
 }
@@ -24,5 +24,5 @@ import java.io.IOException;
 */
 public interface WordProccessor {

-    public void proccess(WordCard wordCard) throws IOException;
+    public void process(WordCard wordCard) throws IOException;
 }
@@ -18,7 +18,7 @@ package org.apache.lucene.morphology.generator;

 import org.apache.lucene.morphology.dictionary.DictonaryReader;
 import org.apache.lucene.morphology.dictionary.GrammaReader;
-import org.apache.lucene.morphology.dictionary.StatiticsCollector;
+import org.apache.lucene.morphology.dictionary.StatisticsCollector;
 import org.apache.lucene.morphology.english.EnglishLetterDecoderEncoder;

 import java.io.IOException;
@@ -32,9 +32,9 @@ public class EnglishHeuristicBuilder {
        DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>());

        EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
-        StatiticsCollector statiticsCollector = new StatiticsCollector(grammaInfo, decoderEncoder);
-        dictonaryReader.proccess(statiticsCollector);
-        statiticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info");
+        StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
+        dictonaryReader.proccess(statisticsCollector);
+        statisticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info");

    }
 }
@@ -18,7 +18,7 @@ package org.apache.lucene.morphology.generator;

 import org.apache.lucene.morphology.dictionary.DictonaryReader;
 import org.apache.lucene.morphology.dictionary.GrammaReader;
-import org.apache.lucene.morphology.dictionary.StatiticsCollector;
+import org.apache.lucene.morphology.dictionary.StatisticsCollector;
 import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder;

 import java.io.IOException;
@@ -31,9 +31,9 @@ public class RussianHeuristicBuilder {
        DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());

        RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
-        StatiticsCollector statiticsCollector = new StatiticsCollector(grammaInfo, decoderEncoder);
-        dictonaryReader.proccess(statiticsCollector);
-        statiticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info");
+        StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
+        dictonaryReader.proccess(statisticsCollector);
+        statisticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info");

    }
 }
@@ -32,7 +32,7 @@ import java.util.HashSet;
 public class EnglishAnalayzerTest {

    @Test
-    public void shoudGiveCorretWords() throws IOException {
+    public void shouldGiveCorrectWords() throws IOException {
        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/englsih-analayzer-answer.txt");
        BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
        String[] strings = breader.readLine().replaceAll(" +", " ").trim().split(" ");
@@ -19,33 +19,33 @@ import java.io.Serializable;


 public class Heuristic implements Serializable {
-    byte actualSuffixLengh;
+    byte actualSuffixLength;
    String actualNormalSuffix;
    short formMorphInfo;
    short normalFormMorphInfo;

    public Heuristic(String s) {
        String[] strings = s.split("\\|");
-        actualSuffixLengh = Byte.valueOf(strings[0]);
+        actualSuffixLength = Byte.valueOf(strings[0]);
        actualNormalSuffix = strings[1];
        formMorphInfo = Short.valueOf(strings[2]);
        normalFormMorphInfo = Short.valueOf(strings[3]);
    }

-    public Heuristic(byte actualSuffixLengh, String actualNormalSuffix, short formMorphInfo, short normalFormMorphInfo) {
-        this.actualSuffixLengh = actualSuffixLengh;
+    public Heuristic(byte actualSuffixLength, String actualNormalSuffix, short formMorphInfo, short normalFormMorphInfo) {
+        this.actualSuffixLength = actualSuffixLength;
        this.actualNormalSuffix = actualNormalSuffix;
        this.formMorphInfo = formMorphInfo;
        this.normalFormMorphInfo = normalFormMorphInfo;
    }

-    public String transofrmWord(String w) {
-        if (w.length() - actualSuffixLengh < 0) return w;
-        return w.substring(0, w.length() - actualSuffixLengh) + actualNormalSuffix;
+    public String transformWord(String w) {
+        if (w.length() - actualSuffixLength < 0) return w;
+        return w.substring(0, w.length() - actualSuffixLength) + actualNormalSuffix;
    }

-    public byte getActualSuffixLengh() {
-        return actualSuffixLengh;
+    public byte getActualSuffixLength() {
+        return actualSuffixLength;
    }

    public String getActualNormalSuffix() {
@@ -67,7 +67,7 @@ public class Heuristic implements Serializable {

        Heuristic heuristic = (Heuristic) o;

-        if (actualSuffixLengh != heuristic.actualSuffixLengh) return false;
+        if (actualSuffixLength != heuristic.actualSuffixLength) return false;
        if (formMorphInfo != heuristic.formMorphInfo) return false;
        if (normalFormMorphInfo != heuristic.normalFormMorphInfo) return false;
        if (actualNormalSuffix != null ? !actualNormalSuffix.equals(heuristic.actualNormalSuffix) : heuristic.actualNormalSuffix != null)
@@ -78,7 +78,7 @@ public class Heuristic implements Serializable {

    @Override
    public int hashCode() {
-        int result = (int) actualSuffixLengh;
+        int result = (int) actualSuffixLength;
        result = 31 * result + (actualNormalSuffix != null ? actualNormalSuffix.hashCode() : 0);
        result = 31 * result + (int) formMorphInfo;
        result = 31 * result + (int) normalFormMorphInfo;
@@ -87,6 +87,6 @@ public class Heuristic implements Serializable {

    @Override
    public String toString() {
-        return "" + actualSuffixLengh + "|" + actualNormalSuffix + "|" + formMorphInfo + "|" + normalFormMorphInfo;
+        return "" + actualSuffixLength + "|" + actualNormalSuffix + "|" + formMorphInfo + "|" + normalFormMorphInfo;
    }
 }
@@ -55,7 +55,7 @@ public class LuceneMorphology extends MorphologyImpl {
        for (Heuristic heuristic : heuristics) {
            boolean isAdded = true;
            for (Heuristic ch : result) {
-                isAdded = isAdded && !(ch.getActualNormalSuffix().equals(heuristic.getActualNormalSuffix()) && (ch.getActualSuffixLengh() == heuristic.getActualSuffixLengh()));
+                isAdded = isAdded && !(ch.getActualNormalSuffix().equals(heuristic.getActualNormalSuffix()) && (ch.getActualSuffixLength() == heuristic.getActualSuffixLength()));
            }
            if (isAdded) {
                result.add(heuristic);
@@ -68,7 +68,7 @@ public class MorphologyImpl implements Morphology {
        int[] ints = decoderEncoder.encodeToArray(revertWord(s));
        int ruleId = findRuleId(ints);
        for (Heuristic h : rules[rulesId[ruleId]]) {
-            result.add(h.transofrmWord(s));
+            result.add(h.transformWord(s));
        }
        return result;
    }
@@ -78,7 +78,7 @@ public class MorphologyImpl implements Morphology {
        int[] ints = decoderEncoder.encodeToArray(revertWord(s));
        int ruleId = findRuleId(ints);
        for (Heuristic h : rules[rulesId[ruleId]]) {
-            result.add(h.transofrmWord(s) + "|" + grammaInfo[h.getFormMorphInfo()]);
+            result.add(h.transformWord(s) + "|" + grammaInfo[h.getFormMorphInfo()]);
        }
        return result;
    }
@@ -180,9 +180,9 @@ public class MorphologyImpl implements Morphology {
        rules = new Heuristic[amount][];
        for (int i = 0; i < amount; i++) {
            String s1 = bufferedReader.readLine();
-            Integer ruleLenght = Integer.valueOf(s1);
-            rules[i] = new Heuristic[ruleLenght];
-            for (int j = 0; j < ruleLenght; j++) {
+            Integer ruleLength = Integer.valueOf(s1);
+            rules[i] = new Heuristic[ruleLength];
+            for (int j = 0; j < ruleLength; j++) {
                rules[i][j] = new Heuristic(bufferedReader.readLine());
            }
        }
@@ -197,7 +197,6 @@ public class MorphologyImpl implements Morphology {
    }

    private void readSeparators(BufferedReader bufferedReader, Integer amount) throws IOException {
-        HashSet intetger = new HashSet<Integer>();
        separators = new int[amount][];
        for (int i = 0; i < amount; i++) {
            String s1 = bufferedReader.readLine();
@@ -206,7 +205,6 @@ public class MorphologyImpl implements Morphology {
            for (int j = 0; j < wordLenght; j++) {
                separators[i][j] = Integer.valueOf(bufferedReader.readLine());
            }
-            intetger.add(separators[i][0]);
        }
    }