taking into account word text statistics

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@25 d817d54c-26ab-11de-abc9-2f7d1455ff7a
2009-05-11 14:45:00 +00:00
parent c68fbb0827
commit ad97c3f275
11 changed files with 39705 additions and 7170 deletions
--- a/data/lemma.num
+++ b/data/lemma.num
--- a/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java
@@ -17,6 +17,7 @@
 package org.apache.lucene.russian.morphology;

 import org.apache.lucene.russian.morphology.dictonary.DictonaryReader;
+import org.apache.lucene.russian.morphology.dictonary.FrequentyReader;
 import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
 import org.apache.lucene.russian.morphology.heuristic.Heuristic;
 import org.apache.lucene.russian.morphology.heuristic.StatiticsCollectors;
@@ -33,8 +34,11 @@ public class HeuristicBuilder {
        IgnoredFormReader formReader = new IgnoredFormReader("data/igoredFrom.txt");
        Set<String> form = formReader.getIngnoredFroms();

+        FrequentyReader frequentyReader = new FrequentyReader("data/lemma.num");
+
        DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form);
-        StatiticsCollectors statiticsCollectors = new StatiticsCollectors();
+
+        StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read());
        dictonaryReader.proccess(statiticsCollectors);
        Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values();
        Object[] objects = counterCollection.toArray();
@@ -46,9 +50,9 @@ public class HeuristicBuilder {

        final Heuristic heuristic = new Heuristic();
        for (int i = 0; i < objects.length; i++) {
-            heuristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic());
+            heuristic.addHeuristic(((SuffixCounter) objects[i]).getSuffixHeuristic());
        }

-        heuristic.writeToFile("src/main/resources/org/apache/lucene/russian/morpholgy/russianSuffixesEvristics.txt");
+        heuristic.writeToFile("russianSuffixesHeuristic.txt");
    }
 }
--- a/src/main/java/org/apache/lucene/russian/morphology/analayzer/SuffixHeuristic.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/analayzer/SuffixHeuristic.java
@@ -41,7 +41,7 @@ public class SuffixHeuristic {
    }

    public void readFromResource() throws IOException {
-        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/russianSuffixesEvristics.txt");
+        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/russianSuffixesHeuristic.txt");
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream));
        readFromBufferedRreader(bufferedReader);
    }
--- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/FrequentyReader.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/FrequentyReader.java
@@ -0,0 +1,56 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.russian.morphology.dictonary;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.HashMap;
+import java.util.Map;
+
+
+/** Reads a space-separated frequency file (e.g. "data/lemma.num") into a map; see {@link #read()}. */
+public class FrequentyReader {
+    private String fileName;
+    private String fileEncoding = "windows-1251";
+
+    public FrequentyReader(String fileName) {
+        this.fileName = fileName;
+    }
+
+    public FrequentyReader(String fileName, String fileEncoding) {
+        this.fileName = fileName;
+        this.fileEncoding = fileEncoding;
+    }
+
+    /** Maps the third space-separated field of every line to its second field parsed as a double. */
+    public Map<String, Double> read() throws IOException {
+        Map<String, Double> result = new HashMap<String, Double>();
+        FileInputStream stream = new FileInputStream(fileName);
+        try { // try/finally: the file handle must be released even if decoding or parsing fails
+            BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, fileEncoding));
+            for (String s = bufferedReader.readLine(); s != null; s = bufferedReader.readLine()) {
+                String[] strings = s.split(" ");
+                result.put(strings[2], Double.valueOf(strings[1]));
+            }
+        } finally {
+            stream.close();
+        }
+        return result;
+    }
+}
--- a/src/main/java/org/apache/lucene/russian/morphology/heuristic/Heuristic.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/heuristic/Heuristic.java
@@ -28,7 +28,7 @@ import java.util.TreeMap;
 public class Heuristic {
    private TreeMap<Long, Long> encodedSuffixesPairs = new TreeMap<Long, Long>();

-    public void addEvristic(SuffixHeuristic suffixHeuristic) {
+    public void addHeuristic(SuffixHeuristic suffixHeuristic) {
        Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix());
        Long longs = encodedSuffixesPairs.get(suffix);
        if (longs == null) {
--- a/src/main/java/org/apache/lucene/russian/morphology/heuristic/StatiticsCollectors.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/heuristic/StatiticsCollectors.java
@@ -26,6 +26,11 @@ import java.util.Map;

 public class StatiticsCollectors implements WordProccessor {
    Map<SuffixHeuristic, SuffixCounter> statititics = new HashMap<SuffixHeuristic, SuffixCounter>();
+    private Map<String, Double> wordsFreq;
+
+    public StatiticsCollectors(Map<String, Double> wordsFreq) {
+        this.wordsFreq = wordsFreq;
+    }

    private Integer ignoredCount = 0;

@@ -38,8 +43,14 @@ public class StatiticsCollectors implements WordProccessor {
                suffixCounter = new SuffixCounter(suffixHeuristic);
                statititics.put(suffixHeuristic, suffixCounter);
            }
+            Double freq = wordsFreq.get(wordCard.getCanonicalFrom());
+            if (freq != null) {
+                suffixCounter.incrementAmount(1 + Math.log(freq));
+            } else {
                suffixCounter.incrementAmount();
            }
+
+        }
    }

    public Map<SuffixHeuristic, SuffixCounter> getStatititics() {
--- a/src/main/java/org/apache/lucene/russian/morphology/heuristic/SuffixCounter.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/heuristic/SuffixCounter.java
@@ -32,7 +32,11 @@ public class SuffixCounter implements Comparable {
        amnout++;
    }

-    public SuffixHeuristic getSuffixEvristic() {
+    public void incrementAmount(Double wordFreq) {
+        amnout += wordFreq;
+    }
+
+    public SuffixHeuristic getSuffixHeuristic() {
        return suffixHeuristic;
    }

--- a/src/main/resources/org/apache/lucene/russian/morphology/russianSuffixesHeuristic.txt
+++ b/src/main/resources/org/apache/lucene/russian/morphology/russianSuffixesHeuristic.txt
--- a/src/test/java/org/apache/lucene/russian/morphology/analayzer/SuffixHeuristicTest.java
+++ b/src/test/java/org/apache/lucene/russian/morphology/analayzer/SuffixHeuristicTest.java
@@ -26,12 +26,12 @@ import java.io.InputStream;
 import java.io.InputStreamReader;


-public class SuffixEvristicsTest {
+public class SuffixHeuristicTest {

    @Test
    public void testShouldDefineCorretCononicalWordForm() throws IOException {
        SuffixHeuristic suffixHeuristic = new SuffixHeuristic();
-        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt");
+        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-heuristic-test-data.txt");
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
        String s = bufferedReader.readLine();
        while (s != null) {
--- a/src/test/resources/org/apache/lucene/russian/morphology/analayzer/suffix-heuristic-test-data.txt
+++ b/src/test/resources/org/apache/lucene/russian/morphology/analayzer/suffix-heuristic-test-data.txt
@@ -5,3 +5,4 @@
 поэтическая поэтический
 произошло произойти
 test test
+ананасов ананас
--- a/src/test/resources/org/apache/lucene/russian/morphology/analayzer/token-of-russian-text.txt
+++ b/src/test/resources/org/apache/lucene/russian/morphology/analayzer/token-of-russian-text.txt
@@ -9,7 +9,7 @@
 заход
 на
 посадка
-все
+весь
 нормальный
 быть
 рекомендовать
@@ -25,7 +25,7 @@
 новолазаревский
 антарктида
 совершаться
-примерный
+примерно
 один
 раз
 в