taking into account word text statistics

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@25 d817d54c-26ab-11de-abc9-2f7d1455ff7a
2009-05-11 14:45:00 +00:00
parent c68fbb0827
commit ad97c3f275
11 changed files with 39705 additions and 7170 deletions
--- a/data/lemma.num
+++ b/data/lemma.num
--- a/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java
@@ -17,6 +17,7 @@
 package org.apache.lucene.russian.morphology;
 import org.apache.lucene.russian.morphology.dictonary.DictonaryReader;
 import org.apache.lucene.russian.morphology.dictonary.FrequentyReader;
 import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
 import org.apache.lucene.russian.morphology.heuristic.Heuristic;
 import org.apache.lucene.russian.morphology.heuristic.StatiticsCollectors;
@@ -33,8 +34,11 @@ public class HeuristicBuilder {
        IgnoredFormReader formReader = new IgnoredFormReader("data/igoredFrom.txt");
        Set<String> form = formReader.getIngnoredFroms();
        FrequentyReader frequentyReader = new FrequentyReader("data/lemma.num");
        DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form);
-        StatiticsCollectors statiticsCollectors = new StatiticsCollectors();
+
        StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read());
        dictonaryReader.proccess(statiticsCollectors);
        Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values();
        Object[] objects = counterCollection.toArray();
@@ -46,9 +50,9 @@ public class HeuristicBuilder {
        final Heuristic heuristic = new Heuristic();
        for (int i = 0; i < objects.length; i++) {
-            heuristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic());
+            heuristic.addHeuristic(((SuffixCounter) objects[i]).getSuffixHeuristic());
        }
-        heuristic.writeToFile("src/main/resources/org/apache/lucene/russian/morpholgy/russianSuffixesEvristics.txt");
+        heuristic.writeToFile("russianSuffixesHeuristic.txt");
    }
 }
--- a/src/main/java/org/apache/lucene/russian/morphology/analayzer/SuffixHeuristic.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/analayzer/SuffixHeuristic.java
@@ -41,7 +41,7 @@ public class SuffixHeuristic {
    }
    public void readFromResource() throws IOException {
-        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/russianSuffixesEvristics.txt");
+        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/russianSuffixesHeuristic.txt");
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream));
        readFromBufferedRreader(bufferedReader);
    }
--- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/FrequentyReader.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/FrequentyReader.java
@@ -0,0 +1,56 @@
 /**
 * Copyright 2009 Alexander Kuznetsov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.russian.morphology.dictonary;
 import java.io.BufferedReader;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.util.HashMap;
 import java.util.Map;
 /**
 * Reads a word frequency list from a text file into a map.
 *
 * Each non-blank line is split on single spaces; the second field is parsed
 * as the frequency and the third field is used as the map key. Any further
 * fields on the line are ignored.
 */
 public class FrequentyReader {
    /** Encoding used when the caller does not supply one. */
    private static final String DEFAULT_ENCODING = "windows-1251";

    private final String fileName;
    private final String fileEncoding;

    /**
     * Creates a reader for the given file using the default
     * {@code windows-1251} encoding.
     *
     * @param fileName path of the frequency file
     */
    public FrequentyReader(String fileName) {
        this(fileName, DEFAULT_ENCODING);
    }

    /**
     * Creates a reader for the given file and character encoding.
     *
     * @param fileName     path of the frequency file
     * @param fileEncoding name of the charset the file is stored in
     */
    public FrequentyReader(String fileName, String fileEncoding) {
        this.fileName = fileName;
        this.fileEncoding = fileEncoding;
    }

    /**
     * Reads the whole file and returns the frequencies keyed by the third
     * field of each line. When a key occurs more than once the last line wins.
     *
     * @return map from key to frequency; never {@code null}
     * @throws IOException if the file cannot be opened or read, the encoding
     *                     is not supported, or a non-blank line has fewer
     *                     than three fields or a non-numeric frequency
     */
    public Map<String, Double> read() throws IOException {
        Map<String, Double> result = new HashMap<String, Double>();
        // Open the raw stream first so it is closed even when the encoding
        // name is rejected while the reader is being constructed.
        FileInputStream stream = new FileInputStream(fileName);
        try {
            BufferedReader bufferedReader = new BufferedReader(
                    new InputStreamReader(stream, fileEncoding));
            int lineNumber = 0;
            String s;
            while ((s = bufferedReader.readLine()) != null) {
                lineNumber++;
                if (s.trim().length() == 0) {
                    // Tolerate blank lines, e.g. a trailing newline at end of file.
                    continue;
                }
                String[] strings = s.split(" ");
                if (strings.length < 3) {
                    throw malformedLine(lineNumber, s, null);
                }
                Double value;
                try {
                    value = Double.valueOf(strings[1]);
                } catch (NumberFormatException e) {
                    throw malformedLine(lineNumber, s, e);
                }
                result.put(strings[2], value);
            }
        } finally {
            stream.close();
        }
        return result;
    }

    /** Builds the exception that reports an unparsable line, keeping the cause if any. */
    private IOException malformedLine(int lineNumber, String line, Throwable cause) {
        IOException exception = new IOException(
                "Malformed frequency line " + lineNumber + " in " + fileName + ": '" + line + "'");
        if (cause != null) {
            exception.initCause(cause);
        }
        return exception;
    }
 }
--- a/src/main/java/org/apache/lucene/russian/morphology/heuristic/Heuristic.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/heuristic/Heuristic.java
@@ -28,7 +28,7 @@ import java.util.TreeMap;
 public class Heuristic {
    private TreeMap<Long, Long> encodedSuffixesPairs = new TreeMap<Long, Long>();
-    public void addEvristic(SuffixHeuristic suffixHeuristic) {
+    public void addHeuristic(SuffixHeuristic suffixHeuristic) {
        Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix());
        Long longs = encodedSuffixesPairs.get(suffix);
        if (longs == null) {
--- a/src/main/java/org/apache/lucene/russian/morphology/heuristic/StatiticsCollectors.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/heuristic/StatiticsCollectors.java
@@ -26,6 +26,11 @@ import java.util.Map;
 public class StatiticsCollectors implements WordProccessor {
    Map<SuffixHeuristic, SuffixCounter> statititics = new HashMap<SuffixHeuristic, SuffixCounter>();
    private Map<String, Double> wordsFreq;
    /**
     * Creates a collector that weights suffix statistics by word frequency.
     *
     * The map is stored as-is (no copy is made) and is later queried with the
     * canonical form of each processed word.
     *
     * @param wordsFreq frequencies keyed by word; words absent from the map
     *                  are counted with the default increment
     */
    public StatiticsCollectors(Map<String, Double> wordsFreq) {
        this.wordsFreq = wordsFreq;
    }
    private Integer ignoredCount = 0;
@@ -38,7 +43,13 @@ public class StatiticsCollectors implements WordProccessor {
                suffixCounter = new SuffixCounter(suffixHeuristic);
                statititics.put(suffixHeuristic, suffixCounter);
            }
-            suffixCounter.incrementAmount();
+            Double freq = wordsFreq.get(wordCard.getCanonicalFrom());
            if (freq != null) {
                suffixCounter.incrementAmount(1 + Math.log(freq));
            } else {
                suffixCounter.incrementAmount();
            }
        }
    }
--- a/src/main/java/org/apache/lucene/russian/morphology/heuristic/SuffixCounter.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/heuristic/SuffixCounter.java
@@ -32,7 +32,11 @@ public class SuffixCounter implements Comparable {
        amnout++;
    }
-    public SuffixHeuristic getSuffixEvristic() {
+    public void incrementAmount(Double wordFreq) {
        amnout += wordFreq;
    }
    /**
     * Returns the suffix heuristic held in this counter's
     * {@code suffixHeuristic} field.
     *
     * @return the suffix heuristic associated with this counter
     */
    public SuffixHeuristic getSuffixHeuristic() {
        return suffixHeuristic;
    }
--- a/src/main/resources/org/apache/lucene/russian/morphology/russianSuffixesHeuristic.txt
+++ b/src/main/resources/org/apache/lucene/russian/morphology/russianSuffixesHeuristic.txt
--- a/src/test/java/org/apache/lucene/russian/morphology/analayzer/SuffixHeuristicTest.java
+++ b/src/test/java/org/apache/lucene/russian/morphology/analayzer/SuffixHeuristicTest.java
@@ -26,12 +26,12 @@ import java.io.InputStream;
 import java.io.InputStreamReader;
-public class SuffixEvristicsTest {
+public class SuffixHeuristicTest {
    @Test
    public void testShouldDefineCorretCononicalWordForm() throws IOException {
        SuffixHeuristic suffixHeuristic = new SuffixHeuristic();
-        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt");
+        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-heuristic-test-data.txt");
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
        String s = bufferedReader.readLine();
        while (s != null) {
--- a/src/test/resources/org/apache/lucene/russian/morphology/analayzer/suffix-heuristic-test-data.txt
+++ b/src/test/resources/org/apache/lucene/russian/morphology/analayzer/suffix-heuristic-test-data.txt
@@ -4,4 +4,5 @@
 победы победа
 поэтическая поэтический
 произошло произойти
-test test
+test test
 ананасов ананас
--- a/src/test/resources/org/apache/lucene/russian/morphology/analayzer/token-of-russian-text.txt
+++ b/src/test/resources/org/apache/lucene/russian/morphology/analayzer/token-of-russian-text.txt
@@ -9,7 +9,7 @@
 заход
 на
 посадка
-все
+весь
 нормальный
 быть
 рекомендовать
@@ -25,7 +25,7 @@
 новолазаревский
 антарктида
 совершаться
-примерный
+примерно
 один
 раз
 в