start working on new version with morphology info

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@37 d817d54c-26ab-11de-abc9-2f7d1455ff7a
2009-08-11 06:05:03 +00:00
parent 214a8e2ebe
commit e4dd3a7a76
12 changed files with 249 additions and 58 deletions
--- a/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java
@ -18,15 +18,18 @@ package org.apache.lucene.russian.morphology;
 import org.apache.lucene.russian.morphology.dictonary.DictonaryReader;
 import org.apache.lucene.russian.morphology.dictonary.FrequentyReader;
 import org.apache.lucene.russian.morphology.dictonary.GrammaReader;
 import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
-import org.apache.lucene.russian.morphology.heuristic.Heuristic;
+import org.apache.lucene.russian.morphology.heuristic.HeuristicBySuffixLegth;
 import org.apache.lucene.russian.morphology.heuristic.StatiticsCollectors;
 import org.apache.lucene.russian.morphology.heuristic.SuffixCounter;
 import org.apache.lucene.russian.morphology.heuristic.SuffixHeuristic;
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Set;
 import java.util.TreeMap;
 public class HeuristicBuilder {
@ -35,10 +38,11 @@ public class HeuristicBuilder {
        Set<String> form = formReader.getIngnoredFroms();
        FrequentyReader frequentyReader = new FrequentyReader("data/lemma.num");
-
+        GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
        DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form);
-        StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read());
+
        StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read(), grammaInfo);
        dictonaryReader.proccess(statiticsCollectors);
        Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values();
        Object[] objects = counterCollection.toArray();
@ -48,11 +52,52 @@ public class HeuristicBuilder {
            System.out.println(objects[i]);
        }
-        final Heuristic heuristic = new Heuristic();
+        final HeuristicBySuffixLegth heuristic = new HeuristicBySuffixLegth();
        for (int i = 0; i < objects.length; i++) {
            heuristic.addHeuristic(((SuffixCounter) objects[i]).getSuffixHeuristic());
        }
-        heuristic.writeToFile("russianSuffixesHeuristic.txt");
+        TreeMap<Integer, Integer> map = new TreeMap<Integer, Integer>();
        int ct = 0;
        for (Set<SuffixHeuristic> s : heuristic.getHeuristics().values()) {
            Integer d = map.get(s.size());
            map.put(s.size(), 1 + (d == null ? 0 : d));
            if (s.size() == 1) {
                ct++;
                continue;
            }
            SuffixHeuristic heuristic1 = s.iterator().next();
            Integer sufixSize = heuristic1.getActualSuffixLength();
            String normalSuffix = heuristic1.getNormalFromSuffix();
            if (heuristic1.getFormSuffix().length() < 6) {
                ct++;
                continue;
            }
            Boolean flag = true;
            if (sufixSize > 3) continue;
            for (SuffixHeuristic sh : s) {
                flag = flag && (sufixSize.equals(sh.getActualSuffixLength()))
                        && (normalSuffix.equals(sh.getNormalFromSuffix()));
            }
            if (flag) {
                System.out.println(s);
                ct++;
            }
            //HashSet<String> integers = new HashSet<String>();
 //            for(SuffixHeuristic sh:s){
 //                integers.add(sh.getMorphInfoCode());
 //            }
 //            if(s.size() == integers.size()){
 //                ct++;
 //            }else{
 //               if(s.size() == 2) System.out.println(s);
 //            }
        }
        System.out.println(objects.length);
        System.out.println(heuristic.getHeuristics().size());
        System.out.println(ct);
        System.out.println(map);
        //heuristic.writeToFile("russianSuffixesHeuristic.txt");
    }
 }
--- a/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java
@ -24,7 +24,7 @@ package org.apache.lucene.russian.morphology;
 */
 public class RussianSuffixDecoderEncoder {
    public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
-    public static final int SUFFIX_LENGTH = 7;
+    public static final int SUFFIX_LENGTH = 6;
    public static final int EE_CHAR = 34;
    public static final int E_CHAR = 6;
    public static final int DASH_CHAR = 45;
--- a/src/main/java/org/apache/lucene/russian/morphology/Test.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/Test.java
@ -0,0 +1,13 @@
 package org.apache.lucene.russian.morphology;
 import org.apache.lucene.russian.morphology.dictonary.GrammaReader;
 import java.io.IOException;
 public class Test {
    public static void main(String[] args) throws IOException {
        GrammaReader grammaReader = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
        System.out.println(grammaReader.getInversIndex().size());
    }
 }
--- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java
@ -63,17 +63,18 @@ public class DictonaryReader {
        int count = Integer.valueOf(s);
        for (int i = 0; i < count; i++) {
            s = reader.readLine();
-            if (i % 10000 == 0) System.out.println("Proccess " + i + " word of " + count);
+            if (i % 10000 == 0) System.out.println("Proccess " + i + " wordBase of " + count);
            String[] wd = s.split(" ");
-            String word = wd[0].toLowerCase();
+            String wordBase = wd[0].toLowerCase();
-            if (word.startsWith("-")) continue;
+            if (wordBase.startsWith("-")) continue;
-            word = "#".equals(word) ? "" : word;
+            wordBase = "#".equals(wordBase) ? "" : wordBase;
            List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1]));
-            if (models.size() > 0 && !ingnoredForm.contains(models.get(0).getCode())) {
+            FlexiaModel flexiaModel = models.get(0);
-                WordCard card = new WordCard(cleanString(models.get(0).create(word)));
+            if (models.size() > 0 && !ingnoredForm.contains(flexiaModel.getCode())) {
                WordCard card = new WordCard(cleanString(flexiaModel.create(wordBase)), cleanString(wordBase), flexiaModel.getSuffix());
                for (FlexiaModel fm : models) {
-                    card.addFrom(cleanString(fm.create(word)));
+                    card.addFlexia(fm);
                }
                wordProccessor.proccess(card);
            }
@ -118,9 +119,10 @@ public class DictonaryReader {
    private void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
        String[] fl = line.split("\\*");
        // we inored all forms thats
-        //  if (fl.length == 3)
+        if (fl.length == 3) {
-        //      flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
+            flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), cleanString(fl[2].toLowerCase())));
-        if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
+        }
        if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), ""));
    }
 }
--- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/GrammaReader.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/GrammaReader.java
@ -0,0 +1,58 @@
 package org.apache.lucene.russian.morphology.dictonary;
 import java.io.BufferedReader;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.util.HashMap;
 import java.util.Map;
 //todo split this class in two.
 /**
  * Reads a grammar table file and builds two lookups from it.
  * <p>
  * Every line that is neither empty nor a {@code //} comment is split at the
  * first space into a code and the remaining text. Entries are numbered in the
  * order they are read, starting at 0: {@link #getInversIndex()} maps the code
  * to that number and {@link #getGrammaInfo()} maps the number to the remaining
  * text.
  */
 public class GrammaReader {
    private String fileName;
    private String fileEncoding = "windows-1251";
    private Map<Integer, String> grammaInfo = new HashMap<Integer, String>();
    private Map<String, Integer> inversIndex = new HashMap<String, Integer>();

    /**
     * Loads the table from {@code fileName} using the default
     * {@code windows-1251} encoding.
     *
     * @param fileName path of the grammar table file
     * @throws IOException if the file cannot be opened or read
     */
    public GrammaReader(String fileName) throws IOException {
        this.fileName = fileName;
        setUp();
    }

    /**
     * Loads the table from {@code fileName} using the given encoding.
     *
     * @param fileName     path of the grammar table file
     * @param fileEncoding charset name used to decode the file
     * @throws IOException if the file cannot be opened or read, or the
     *                     encoding is not supported
     */
    public GrammaReader(String fileName, String fileEncoding) throws IOException {
        this.fileName = fileName;
        this.fileEncoding = fileEncoding;
        setUp();
    }

    /**
     * Parses the file line by line and fills both maps.
     * The underlying stream is always closed, even when decoding or reading fails.
     */
    private void setUp() throws IOException {
        FileInputStream input = new FileInputStream(fileName);
        try {
            BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(input, fileEncoding));
            String line = bufferedReader.readLine();
            while (line != null) {
                line = line.trim();
                if (!line.startsWith("//") && line.length() > 0) {
                    String[] strings = line.split(" ", 2);
                    Integer i = grammaInfo.size();
                    inversIndex.put(strings[0], i);
                    // A line holding only a code has no second part; store an empty
                    // description instead of failing on strings[1].
                    grammaInfo.put(i, strings.length > 1 ? strings[1] : "");
                }
                line = bufferedReader.readLine();
            }
        } finally {
            // Closing the file stream releases the descriptor; the readers hold no other resources.
            input.close();
        }
    }

    /**
     * @return map from entry number (in reading order) to the text following the code
     */
    public Map<Integer, String> getGrammaInfo() {
        return grammaInfo;
    }

    public void setGrammaInfo(Map<Integer, String> grammaInfo) {
        this.grammaInfo = grammaInfo;
    }

    /**
     * @return map from code (first token of a line) to its entry number
     */
    public Map<String, Integer> getInversIndex() {
        return inversIndex;
    }

    public void setInversIndex(Map<String, Integer> inversIndex) {
        this.inversIndex = inversIndex;
    }
 }
--- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordCard.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordCard.java
@ -24,21 +24,33 @@ import java.util.List;
 */
 public class WordCard {
    private String canonicalFrom;
-    private List<String> wordsFroms = new ArrayList<String>();
+    private String base;
    private String canonicalSuffix;
    private List<FlexiaModel> wordsFroms = new ArrayList<FlexiaModel>();
-    protected WordCard(String canonicalFrom) {
+    public WordCard(String canonicalFrom, String base, String canonicalSuffix) {
        this.canonicalFrom = canonicalFrom;
        this.canonicalSuffix = canonicalSuffix;
        this.base = base;
    }
-    protected void addFrom(String word) {
+    public void addFlexia(FlexiaModel flexiaModel) {
-        wordsFroms.add(word);
+        wordsFroms.add(flexiaModel);
    }
    public String getCanonicalFrom() {
        return canonicalFrom;
    }
-    public List<String> getWordsFroms() {
+    public String getCanonicalSuffix() {
        return canonicalSuffix;
    }
    public String getBase() {
        return base;
    }
    public List<FlexiaModel> getWordsFroms() {
        return wordsFroms;
    }
 }
--- a/src/main/java/org/apache/lucene/russian/morphology/heuristic/Heuristic.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/heuristic/Heuristic.java
@ -29,11 +29,11 @@ public class Heuristic {
    private TreeMap<Long, Long> encodedSuffixesPairs = new TreeMap<Long, Long>();
    public void addHeuristic(SuffixHeuristic suffixHeuristic) {
-        Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix());
+//        Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix());
-        Long longs = encodedSuffixesPairs.get(suffix);
+//        Long longs = encodedSuffixesPairs.get(suffix);
-        if (longs == null) {
+//        if (longs == null) {
-            encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encode(suffixHeuristic.getNormalSuffix()));
+//            encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encode(suffixHeuristic.getNormalSuffix()));
-        }
+//        }
    }
    public String getNormalForm(String form) {
@ -49,6 +49,10 @@ public class Heuristic {
        return form;
    }
    public Integer getAmount() {
        return encodedSuffixesPairs.size();
    }
    public void readFromFile(String file) throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(file));
        String s = reader.readLine();
--- a/src/main/java/org/apache/lucene/russian/morphology/heuristic/HeuristicBySuffixLegth.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/heuristic/HeuristicBySuffixLegth.java
@ -0,0 +1,27 @@
 package org.apache.lucene.russian.morphology.heuristic;
 import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
 /**
  * Groups {@link SuffixHeuristic} instances by the encoded value of their
  * form suffix, so that all heuristics sharing one encoded suffix end up in
  * the same set.
  */
 public class HeuristicBySuffixLegth {
    private Map<Long, Set<SuffixHeuristic>> heuristics = new HashMap<Long, Set<SuffixHeuristic>>();

    /**
     * Files the heuristic under the encoded form of its form suffix,
     * creating the group on first use.
     *
     * @param suffixHeuristic heuristic to register
     */
    public void addHeuristic(SuffixHeuristic suffixHeuristic) {
        Long encodedSuffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix());
        Set<SuffixHeuristic> group = heuristics.get(encodedSuffix);
        if (group != null) {
            group.add(suffixHeuristic);
            return;
        }
        // First heuristic seen for this encoded suffix: start a new group.
        group = new HashSet<SuffixHeuristic>();
        group.add(suffixHeuristic);
        heuristics.put(encodedSuffix, group);
    }

    /**
     * @return the live map from encoded form suffix to the heuristics registered for it
     */
    public Map<Long, Set<SuffixHeuristic>> getHeuristics() {
        return heuristics;
    }
 }
--- a/src/main/java/org/apache/lucene/russian/morphology/heuristic/StatiticsCollectors.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/heuristic/StatiticsCollectors.java
@ -17,6 +17,8 @@
 package org.apache.lucene.russian.morphology.heuristic;
 import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
 import org.apache.lucene.russian.morphology.dictonary.FlexiaModel;
 import org.apache.lucene.russian.morphology.dictonary.GrammaReader;
 import org.apache.lucene.russian.morphology.dictonary.WordCard;
 import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
@ -27,16 +29,18 @@ import java.util.Map;
 public class StatiticsCollectors implements WordProccessor {
    Map<SuffixHeuristic, SuffixCounter> statititics = new HashMap<SuffixHeuristic, SuffixCounter>();
    private Map<String, Double> wordsFreq;
    private GrammaReader grammaInfo;
-    public StatiticsCollectors(Map<String, Double> wordsFreq) {
+    public StatiticsCollectors(Map<String, Double> wordsFreq, GrammaReader grammaInfo) {
        this.wordsFreq = wordsFreq;
        this.grammaInfo = grammaInfo;
    }
    private Integer ignoredCount = 0;
    public void proccess(WordCard wordCard) {
-        for (String form : wordCard.getWordsFroms()) {
+        for (FlexiaModel fm : wordCard.getWordsFroms()) {
-            SuffixHeuristic suffixHeuristic = createEvristic(wordCard.getCanonicalFrom(), form);
+            SuffixHeuristic suffixHeuristic = createEvristic(wordCard.getCanonicalFrom(), wordCard.getCanonicalSuffix(), fm);
            if (suffixHeuristic == null) continue;
            SuffixCounter suffixCounter = statititics.get(suffixHeuristic);
            if (suffixCounter == null) {
@ -57,19 +61,23 @@ public class StatiticsCollectors implements WordProccessor {
        return statititics;
    }
-    private SuffixHeuristic createEvristic(String word, String form) {
+    private SuffixHeuristic createEvristic(String wordBase, String canonicalSuffix, FlexiaModel fm) {
        String form = fm.create(wordBase);
        int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
        String formSuffix = form.substring(startSymbol);
-        if (word.length() < startSymbol) {
+        String actualSuffix = fm.getSuffix();
-            ignoredCount++;
+        Integer actualSuffixLengh = actualSuffix.length();
-            return null;
+//        if (word.length() < startSymbol) {
-        }
+//            ignoredCount++;
-        String wordSuffix = word.length() > startSymbol ? word.substring(startSymbol) : "";
+//            return null;
-        if (wordSuffix.length() > 12) {
+//        }
-            System.out.println(word + " " + form);
+//        String wordSuffix = word.length() > startSymbol ? word.substring(startSymbol) : "";
-            return null;
+//        if (wordSuffix.length() > 12) {
-        }
+//            System.out.println(word + " " + form);
-        return new SuffixHeuristic(formSuffix, wordSuffix);
+//            return null;
 //        }
 //        return new SuffixHeuristic(formSuffix, wordSuffix);
        return new SuffixHeuristic(formSuffix, actualSuffixLengh, canonicalSuffix, fm.getCode());
    }
--- a/src/main/java/org/apache/lucene/russian/morphology/heuristic/SuffixHeuristic.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/heuristic/SuffixHeuristic.java
@ -24,27 +24,31 @@ package org.apache.lucene.russian.morphology.heuristic;
 */
 public class SuffixHeuristic {
    private String formSuffix;
-    private String normalSuffix;
+    private Integer actualSuffixLength;
    private String normalFromSuffix;
    private String morphInfoCode;
-    public SuffixHeuristic(String formSuffix, String normalSuffix) {
+    public SuffixHeuristic(String formSuffix, Integer actualSuffixLength, String normalFromSuffix, String morphInfoCode) {
        this.formSuffix = formSuffix;
-        this.normalSuffix = normalSuffix;
+        this.actualSuffixLength = actualSuffixLength;
        this.normalFromSuffix = normalFromSuffix;
        this.morphInfoCode = morphInfoCode;
    }
    public String getFormSuffix() {
        return formSuffix;
    }
-    public void setFormSuffix(String formSuffix) {
+    public Integer getActualSuffixLength() {
-        this.formSuffix = formSuffix;
+        return actualSuffixLength;
    }
-    public String getNormalSuffix() {
+    public String getNormalFromSuffix() {
-        return normalSuffix;
+        return normalFromSuffix;
    }
-    public void setNormalSuffix(String normalSuffix) {
+    public String getMorphInfoCode() {
-        this.normalSuffix = normalSuffix;
+        return morphInfoCode;
    }
    @Override
@ -54,24 +58,28 @@ public class SuffixHeuristic {
        SuffixHeuristic that = (SuffixHeuristic) o;
-        if (!formSuffix.equals(that.formSuffix)) return false;
+        if (actualSuffixLength != null ? !actualSuffixLength.equals(that.actualSuffixLength) : that.actualSuffixLength != null)
-        if (!normalSuffix.equals(that.normalSuffix)) return false;
+            return false;
        if (formSuffix != null ? !formSuffix.equals(that.formSuffix) : that.formSuffix != null) return false;
        if (morphInfoCode != null ? !morphInfoCode.equals(that.morphInfoCode) : that.morphInfoCode != null)
            return false;
        if (normalFromSuffix != null ? !normalFromSuffix.equals(that.normalFromSuffix) : that.normalFromSuffix != null)
            return false;
        return true;
    }
    @Override
    public int hashCode() {
-        int result = formSuffix.hashCode();
+        int result = formSuffix != null ? formSuffix.hashCode() : 0;
-        result = 31 * result + normalSuffix.hashCode();
+        result = 31 * result + (actualSuffixLength != null ? actualSuffixLength.hashCode() : 0);
        result = 31 * result + (normalFromSuffix != null ? normalFromSuffix.hashCode() : 0);
        result = 31 * result + (morphInfoCode != null ? morphInfoCode.hashCode() : 0);
        return result;
    }
    @Override
    public String toString() {
-        return "SuffixHeuristic{" +
+        return formSuffix + " " + actualSuffixLength + " " + normalFromSuffix + " " + morphInfoCode;
                "formSuffix='" + formSuffix + '\'' +
                ", normalSuffix='" + normalSuffix + '\'' +
                '}';
    }
 }
--- a/src/main/java/org/apache/lucene/russian/morphology/heuristic/SuffixHeuristicMerger.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/heuristic/SuffixHeuristicMerger.java
@ -0,0 +1,13 @@
 package org.apache.lucene.russian.morphology.heuristic;
 /**
  * Placeholder for merging two {@link SuffixHeuristic} instances.
  * NOTE(review): the implementation is unfinished; every path currently returns {@code null}.
  */
 public class SuffixHeuristicMerger {
    /**
     * Compares the two heuristics but does not yet produce a merged result.
     *
     * @param one first heuristic
     * @param two second heuristic
     * @return always {@code null} in the current implementation
     */
    public SuffixHeuristic merge(SuffixHeuristic one, SuffixHeuristic two) {
        boolean sameMorphInfo = one.getMorphInfoCode().equals(two.getMorphInfoCode());
        if (sameMorphInfo) {
            // Picks the heuristic with the smaller actual suffix length (ties go to
            // 'one'); the selection is not used yet.
            SuffixHeuristic shorter = two;
            if (one.getActualSuffixLength() <= two.getActualSuffixLength()) {
                shorter = one;
            }
        }
        return null;
    }
 }
--- a/src/test/resources/org/apache/lucene/russian/morphology/analayzer/suffix-heuristic-test-data.txt
+++ b/src/test/resources/org/apache/lucene/russian/morphology/analayzer/suffix-heuristic-test-data.txt
@ -6,3 +6,4 @@
 произошло произойти
 test test
 ананасов ананас
 встовашего встовать