start working on a new version with morphology info

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@37 d817d54c-26ab-11de-abc9-2f7d1455ff7a
alexander.a.kuznetsov 2009-08-11 06:05:03 +00:00
parent 214a8e2ebe
commit e4dd3a7a76
12 changed files with 249 additions and 58 deletions

View File

@@ -18,15 +18,18 @@ package org.apache.lucene.russian.morphology;
import org.apache.lucene.russian.morphology.dictonary.DictonaryReader;
import org.apache.lucene.russian.morphology.dictonary.FrequentyReader;
import org.apache.lucene.russian.morphology.dictonary.GrammaReader;
import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
import org.apache.lucene.russian.morphology.heuristic.Heuristic;
import org.apache.lucene.russian.morphology.heuristic.HeuristicBySuffixLegth;
import org.apache.lucene.russian.morphology.heuristic.StatiticsCollectors;
import org.apache.lucene.russian.morphology.heuristic.SuffixCounter;
import org.apache.lucene.russian.morphology.heuristic.SuffixHeuristic;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Set;
import java.util.TreeMap;
public class HeuristicBuilder {
@@ -35,10 +38,11 @@ public class HeuristicBuilder {
Set<String> form = formReader.getIngnoredFroms();
FrequentyReader frequentyReader = new FrequentyReader("data/lemma.num");
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form);
StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read());
StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read(), grammaInfo);
dictonaryReader.proccess(statiticsCollectors);
Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values();
Object[] objects = counterCollection.toArray();
@@ -48,11 +52,52 @@ public class HeuristicBuilder {
System.out.println(objects[i]);
}
final Heuristic heuristic = new Heuristic();
final HeuristicBySuffixLegth heuristic = new HeuristicBySuffixLegth();
for (int i = 0; i < objects.length; i++) {
heuristic.addHeuristic(((SuffixCounter) objects[i]).getSuffixHeuristic());
}
heuristic.writeToFile("russianSuffixesHeuristic.txt");
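// Analyze how ambiguous the collected heuristics are: 'map' is a histogram of
// candidate-set sizes per form suffix, and 'ct' counts the sets that look unambiguous
// (a single candidate, a short form suffix, or all candidates agreeing on suffix length
// and normal-form suffix).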
TreeMap<Integer, Integer> map = new TreeMap<Integer, Integer>();
int ct = 0;
for (Set<SuffixHeuristic> s : heuristic.getHeuristics().values()) {
Integer d = map.get(s.size());
map.put(s.size(), 1 + (d == null ? 0 : d));
if (s.size() == 1) {
ct++;
continue;
}
SuffixHeuristic heuristic1 = s.iterator().next();
Integer sufixSize = heuristic1.getActualSuffixLength();
String normalSuffix = heuristic1.getNormalFromSuffix();
if (heuristic1.getFormSuffix().length() < 6) {
ct++;
continue;
}
Boolean flag = true;
if (sufixSize > 3) continue;
for (SuffixHeuristic sh : s) {
flag = flag && (sufixSize.equals(sh.getActualSuffixLength()))
&& (normalSuffix.equals(sh.getNormalFromSuffix()));
}
if (flag) {
System.out.println(s);
ct++;
}
//HashSet<String> integers = new HashSet<String>();
// for(SuffixHeuristic sh:s){
// integers.add(sh.getMorphInfoCode());
// }
// if(s.size() == integers.size()){
// ct++;
// }else{
// if(s.size() == 2) System.out.println(s);
// }
}
System.out.println(objects.length);
System.out.println(heuristic.getHeuristics().size());
System.out.println(ct);
System.out.println(map);
//heuristic.writeToFile("russianSuffixesHeuristic.txt");
}
}

View File

@@ -24,7 +24,7 @@ package org.apache.lucene.russian.morphology;
*/
public class RussianSuffixDecoderEncoder {
public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
public static final int SUFFIX_LENGTH = 7;
public static final int SUFFIX_LENGTH = 6;
public static final int EE_CHAR = 34;
public static final int E_CHAR = 6;
public static final int DASH_CHAR = 45;

View File

@@ -0,0 +1,13 @@
package org.apache.lucene.russian.morphology;
import org.apache.lucene.russian.morphology.dictonary.GrammaReader;
import java.io.IOException;
public class Test {
public static void main(String[] args) throws IOException {
GrammaReader grammaReader = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
System.out.println(grammaReader.getInversIndex().size());
}
}

View File

@@ -63,17 +63,18 @@ public class DictonaryReader {
int count = Integer.valueOf(s);
for (int i = 0; i < count; i++) {
s = reader.readLine();
if (i % 10000 == 0) System.out.println("Proccess " + i + " word of " + count);
if (i % 10000 == 0) System.out.println("Proccess " + i + " wordBase of " + count);
String[] wd = s.split(" ");
String word = wd[0].toLowerCase();
if (word.startsWith("-")) continue;
word = "#".equals(word) ? "" : word;
String wordBase = wd[0].toLowerCase();
if (wordBase.startsWith("-")) continue;
wordBase = "#".equals(wordBase) ? "" : wordBase;
List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1]));
if (models.size() > 0 && !ingnoredForm.contains(models.get(0).getCode())) {
WordCard card = new WordCard(cleanString(models.get(0).create(word)));
FlexiaModel flexiaModel = models.isEmpty() ? null : models.get(0); // avoid get(0) on an empty model list
if (models.size() > 0 && !ingnoredForm.contains(flexiaModel.getCode())) {
WordCard card = new WordCard(cleanString(flexiaModel.create(wordBase)), cleanString(wordBase), flexiaModel.getSuffix());
for (FlexiaModel fm : models) {
card.addFrom(cleanString(fm.create(word)));
card.addFlexia(fm);
}
wordProccessor.proccess(card);
}
@@ -118,9 +119,10 @@ public class DictonaryReader {
private void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
String[] fl = line.split("\\*");
// we ignored all forms that have a prefix part
// if (fl.length == 3)
// flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
if (fl.length == 3) {
flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), cleanString(fl[2].toLowerCase())));
}
if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), ""));
}
}

View File

@@ -0,0 +1,58 @@
package org.apache.lucene.russian.morphology.dictonary;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
// TODO: split this class into two.
public class GrammaReader {
private String fileName;
private String fileEncoding = "windows-1251";
private Map<Integer, String> grammaInfo = new HashMap<Integer, String>();
private Map<String, Integer> inversIndex = new HashMap<String, Integer>();
public GrammaReader(String fileName) throws IOException {
this.fileName = fileName;
setUp();
}
public GrammaReader(String fileName, String fileEncoding) throws IOException {
this.fileName = fileName;
this.fileEncoding = fileEncoding;
setUp();
}
private void setUp() throws IOException {
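// Each non-comment line of rgramtab.tab holds an ancode followed by its grammatical
// description (split on the first space). Assign sequential ids in read order and keep
// an inverse index from ancode to id.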
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), fileEncoding));
String line = bufferedReader.readLine();
while (line != null) {
line = line.trim();
if (!line.startsWith("//") && line.length() > 0) {
String[] strings = line.split(" ", 2);
Integer i = grammaInfo.size();
inversIndex.put(strings[0], i);
grammaInfo.put(i, strings[1]);
}
line = bufferedReader.readLine();
}
}
public Map<Integer, String> getGrammaInfo() {
return grammaInfo;
}
public void setGrammaInfo(Map<Integer, String> grammaInfo) {
this.grammaInfo = grammaInfo;
}
public Map<String, Integer> getInversIndex() {
return inversIndex;
}
public void setInversIndex(Map<String, Integer> inversIndex) {
this.inversIndex = inversIndex;
}
}

View File

@@ -24,21 +24,33 @@ import java.util.List;
*/
public class WordCard {
private String canonicalFrom;
private List<String> wordsFroms = new ArrayList<String>();
private String base;
private String canonicalSuffix;
private List<FlexiaModel> wordsFroms = new ArrayList<FlexiaModel>();
protected WordCard(String canonicalFrom) {
public WordCard(String canonicalFrom, String base, String canonicalSuffix) {
this.canonicalFrom = canonicalFrom;
this.canonicalSuffix = canonicalSuffix;
this.base = base;
}
protected void addFrom(String word) {
wordsFroms.add(word);
public void addFlexia(FlexiaModel flexiaModel) {
wordsFroms.add(flexiaModel);
}
public String getCanonicalFrom() {
return canonicalFrom;
}
public List<String> getWordsFroms() {
public String getCanonicalSuffix() {
return canonicalSuffix;
}
public String getBase() {
return base;
}
public List<FlexiaModel> getWordsFroms() {
return wordsFroms;
}
}

View File

@@ -29,11 +29,11 @@ public class Heuristic {
private TreeMap<Long, Long> encodedSuffixesPairs = new TreeMap<Long, Long>();
public void addHeuristic(SuffixHeuristic suffixHeuristic) {
Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix());
Long longs = encodedSuffixesPairs.get(suffix);
if (longs == null) {
encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encode(suffixHeuristic.getNormalSuffix()));
}
// Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix());
// Long longs = encodedSuffixesPairs.get(suffix);
// if (longs == null) {
// encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encode(suffixHeuristic.getNormalSuffix()));
// }
}
public String getNormalForm(String form) {
@@ -49,6 +49,10 @@ public class Heuristic {
return form;
}
public Integer getAmount() {
return encodedSuffixesPairs.size();
}
public void readFromFile(String file) throws IOException {
BufferedReader reader = new BufferedReader(new FileReader(file));
String s = reader.readLine();

View File

@@ -0,0 +1,27 @@
package org.apache.lucene.russian.morphology.heuristic;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
public class HeuristicBySuffixLegth {
private Map<Long, Set<SuffixHeuristic>> heuristics = new HashMap<Long, Set<SuffixHeuristic>>();
public void addHeuristic(SuffixHeuristic suffixHeuristic) {
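// Group heuristics by their encoded form suffix: each suffix maps to the set of
// candidate heuristics observed for it.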
Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix());
Set<SuffixHeuristic> suffixHeuristics = heuristics.get(suffix);
if (suffixHeuristics == null) {
suffixHeuristics = new HashSet<SuffixHeuristic>();
heuristics.put(suffix, suffixHeuristics);
}
suffixHeuristics.add(suffixHeuristic);
}
public Map<Long, Set<SuffixHeuristic>> getHeuristics() {
return heuristics;
}
}

View File

@@ -17,6 +17,8 @@
package org.apache.lucene.russian.morphology.heuristic;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import org.apache.lucene.russian.morphology.dictonary.FlexiaModel;
import org.apache.lucene.russian.morphology.dictonary.GrammaReader;
import org.apache.lucene.russian.morphology.dictonary.WordCard;
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
@@ -27,16 +29,18 @@ import java.util.Map;
public class StatiticsCollectors implements WordProccessor {
Map<SuffixHeuristic, SuffixCounter> statititics = new HashMap<SuffixHeuristic, SuffixCounter>();
private Map<String, Double> wordsFreq;
private GrammaReader grammaInfo;
public StatiticsCollectors(Map<String, Double> wordsFreq) {
public StatiticsCollectors(Map<String, Double> wordsFreq, GrammaReader grammaInfo) {
this.wordsFreq = wordsFreq;
this.grammaInfo = grammaInfo;
}
private Integer ignoredCount = 0;
public void proccess(WordCard wordCard) {
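// Derive one suffix heuristic from every flexia of the word card and accumulate it
// in the statistics map.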
for (String form : wordCard.getWordsFroms()) {
SuffixHeuristic suffixHeuristic = createEvristic(wordCard.getCanonicalFrom(), form);
for (FlexiaModel fm : wordCard.getWordsFroms()) {
SuffixHeuristic suffixHeuristic = createEvristic(wordCard.getCanonicalFrom(), wordCard.getCanonicalSuffix(), fm);
if (suffixHeuristic == null) continue;
SuffixCounter suffixCounter = statititics.get(suffixHeuristic);
if (suffixCounter == null) {
@@ -57,19 +61,23 @@ public class StatiticsCollectors implements WordProccessor {
return statititics;
}
private SuffixHeuristic createEvristic(String word, String form) {
private SuffixHeuristic createEvristic(String wordBase, String canonicalSuffix, FlexiaModel fm) {
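// Rebuild the full word form from its base via the flexia model, then keep at most
// SUFFIX_LENGTH trailing characters as the form suffix.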
String form = fm.create(wordBase);
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
String formSuffix = form.substring(startSymbol);
if (word.length() < startSymbol) {
ignoredCount++;
return null;
}
String wordSuffix = word.length() > startSymbol ? word.substring(startSymbol) : "";
if (wordSuffix.length() > 12) {
System.out.println(word + " " + form);
return null;
}
return new SuffixHeuristic(formSuffix, wordSuffix);
String actualSuffix = fm.getSuffix();
Integer actualSuffixLengh = actualSuffix.length();
// if (word.length() < startSymbol) {
// ignoredCount++;
// return null;
// }
// String wordSuffix = word.length() > startSymbol ? word.substring(startSymbol) : "";
// if (wordSuffix.length() > 12) {
// System.out.println(word + " " + form);
// return null;
// }
// return new SuffixHeuristic(formSuffix, wordSuffix);
return new SuffixHeuristic(formSuffix, actualSuffixLengh, canonicalSuffix, fm.getCode());
}

View File

@@ -24,27 +24,31 @@ package org.apache.lucene.russian.morphology.heuristic;
*/
public class SuffixHeuristic {
private String formSuffix;
private String normalSuffix;
private Integer actualSuffixLength;
private String normalFromSuffix;
private String morphInfoCode;
public SuffixHeuristic(String formSuffix, String normalSuffix) {
public SuffixHeuristic(String formSuffix, Integer actualSuffixLength, String normalFromSuffix, String morphInfoCode) {
this.formSuffix = formSuffix;
this.normalSuffix = normalSuffix;
this.actualSuffixLength = actualSuffixLength;
this.normalFromSuffix = normalFromSuffix;
this.morphInfoCode = morphInfoCode;
}
public String getFormSuffix() {
return formSuffix;
}
public void setFormSuffix(String formSuffix) {
this.formSuffix = formSuffix;
public Integer getActualSuffixLength() {
return actualSuffixLength;
}
public String getNormalSuffix() {
return normalSuffix;
public String getNormalFromSuffix() {
return normalFromSuffix;
}
public void setNormalSuffix(String normalSuffix) {
this.normalSuffix = normalSuffix;
public String getMorphInfoCode() {
return morphInfoCode;
}
@Override
@@ -54,24 +58,28 @@ public class SuffixHeuristic {
SuffixHeuristic that = (SuffixHeuristic) o;
if (!formSuffix.equals(that.formSuffix)) return false;
if (!normalSuffix.equals(that.normalSuffix)) return false;
if (actualSuffixLength != null ? !actualSuffixLength.equals(that.actualSuffixLength) : that.actualSuffixLength != null)
return false;
if (formSuffix != null ? !formSuffix.equals(that.formSuffix) : that.formSuffix != null) return false;
if (morphInfoCode != null ? !morphInfoCode.equals(that.morphInfoCode) : that.morphInfoCode != null)
return false;
if (normalFromSuffix != null ? !normalFromSuffix.equals(that.normalFromSuffix) : that.normalFromSuffix != null)
return false;
return true;
}
@Override
public int hashCode() {
int result = formSuffix.hashCode();
result = 31 * result + normalSuffix.hashCode();
int result = formSuffix != null ? formSuffix.hashCode() : 0;
result = 31 * result + (actualSuffixLength != null ? actualSuffixLength.hashCode() : 0);
result = 31 * result + (normalFromSuffix != null ? normalFromSuffix.hashCode() : 0);
result = 31 * result + (morphInfoCode != null ? morphInfoCode.hashCode() : 0);
return result;
}
@Override
public String toString() {
return "SuffixHeuristic{" +
"formSuffix='" + formSuffix + '\'' +
", normalSuffix='" + normalSuffix + '\'' +
'}';
return formSuffix + " " + actualSuffixLength + " " + normalFromSuffix + " " + morphInfoCode;
}
}

View File

@@ -0,0 +1,13 @@
package org.apache.lucene.russian.morphology.heuristic;
public class SuffixHeuristicMerger {
public SuffixHeuristic merge(SuffixHeuristic one, SuffixHeuristic two) {
if (!one.getMorphInfoCode().equals(two.getMorphInfoCode()))
return null;
SuffixHeuristic min = one.getActualSuffixLength() > two.getActualSuffixLength() ? two : one;
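// TODO: merging is not implemented yet; the heuristic with the shorter actual suffix
// is selected but null is still returned.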
return null;
}
}

View File

@@ -6,3 +6,4 @@
произошло произойти
test test
ананасов ананас
встовашего встовать