taking into account word text statistics

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@25 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
Alexander.A.Kuznetsov 2009-05-11 14:45:00 +00:00
parent c68fbb0827
commit ad97c3f275
11 changed files with 39705 additions and 7170 deletions

32459
data/lemma.num Normal file

File diff suppressed because it is too large Load Diff

View File

@ -17,6 +17,7 @@
package org.apache.lucene.russian.morphology;
import org.apache.lucene.russian.morphology.dictonary.DictonaryReader;
import org.apache.lucene.russian.morphology.dictonary.FrequentyReader;
import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
import org.apache.lucene.russian.morphology.heuristic.Heuristic;
import org.apache.lucene.russian.morphology.heuristic.StatiticsCollectors;
@ -33,8 +34,11 @@ public class HeuristicBuilder {
IgnoredFormReader formReader = new IgnoredFormReader("data/igoredFrom.txt");
Set<String> form = formReader.getIngnoredFroms();
FrequentyReader frequentyReader = new FrequentyReader("data/lemma.num");
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form);
StatiticsCollectors statiticsCollectors = new StatiticsCollectors();
StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read());
dictonaryReader.proccess(statiticsCollectors);
Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values();
Object[] objects = counterCollection.toArray();
@ -46,9 +50,9 @@ public class HeuristicBuilder {
final Heuristic heuristic = new Heuristic();
for (int i = 0; i < objects.length; i++) {
heuristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic());
heuristic.addHeuristic(((SuffixCounter) objects[i]).getSuffixHeuristic());
}
heuristic.writeToFile("src/main/resources/org/apache/lucene/russian/morpholgy/russianSuffixesEvristics.txt");
heuristic.writeToFile("russianSuffixesHeuristic.txt");
}
}

View File

@ -41,7 +41,7 @@ public class SuffixHeuristic {
}
public void readFromResource() throws IOException {
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/russianSuffixesEvristics.txt");
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/russianSuffixesHeuristic.txt");
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream));
readFromBufferedRreader(bufferedReader);
}

View File

@ -0,0 +1,56 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.dictonary;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
/**
 * Reads a word-frequency list from a text file into a map.
 *
 * <p>Each non-blank line is split on single spaces. The second field is
 * parsed as a {@code double} and stored under the third field as key; the
 * first field and any fields after the third are ignored.
 */
public class FrequentyReader {
    /** Encoding used when the caller does not supply one. */
    private static final String DEFAULT_ENCODING = "windows-1251";

    private final String fileName;
    private final String fileEncoding;

    /**
     * Creates a reader for the given file using the default
     * {@code windows-1251} encoding.
     *
     * @param fileName path of the frequency file
     */
    public FrequentyReader(String fileName) {
        this(fileName, DEFAULT_ENCODING);
    }

    /**
     * Creates a reader for the given file and character encoding.
     *
     * @param fileName     path of the frequency file
     * @param fileEncoding charset name used to decode the file
     */
    public FrequentyReader(String fileName, String fileEncoding) {
        this.fileName = fileName;
        this.fileEncoding = fileEncoding;
    }

    /**
     * Parses the whole file and returns the collected values.
     *
     * <p>Blank lines are skipped. When the same key occurs more than once,
     * the value from the last occurrence wins.
     *
     * @return a new mutable map from the third field of each line to the
     *         numeric value of its second field
     * @throws IOException if the file cannot be opened, decoded or read
     * @throws NumberFormatException if the second field is not a number
     * @throws ArrayIndexOutOfBoundsException if a non-blank line has fewer
     *         than three space-separated fields
     */
    public Map<String, Double> read() throws IOException {
        Map<String, Double> result = new HashMap<String, Double>();
        FileInputStream input = new FileInputStream(fileName);
        try {
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(input, fileEncoding));
            String line = reader.readLine();
            while (line != null) {
                // A blank line (e.g. a trailing newline at end of file)
                // carries no record and must not abort the whole load.
                if (line.trim().length() != 0) {
                    String[] fields = line.split(" ");
                    Double value = Double.valueOf(fields[1]);
                    result.put(fields[2], value);
                }
                line = reader.readLine();
            }
        } finally {
            // Close the underlying stream on every path, including when the
            // encoding is unsupported or a line fails to parse.
            input.close();
        }
        return result;
    }
}

View File

@ -28,7 +28,7 @@ import java.util.TreeMap;
public class Heuristic {
private TreeMap<Long, Long> encodedSuffixesPairs = new TreeMap<Long, Long>();
public void addEvristic(SuffixHeuristic suffixHeuristic) {
public void addHeuristic(SuffixHeuristic suffixHeuristic) {
Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix());
Long longs = encodedSuffixesPairs.get(suffix);
if (longs == null) {

View File

@ -26,6 +26,11 @@ import java.util.Map;
public class StatiticsCollectors implements WordProccessor {
Map<SuffixHeuristic, SuffixCounter> statititics = new HashMap<SuffixHeuristic, SuffixCounter>();
private Map<String, Double> wordsFreq;
/**
 * Creates a collector backed by the given word-frequency map.
 *
 * @param wordsFreq map from a word to its frequency value; the reference is
 *                  stored as-is (no defensive copy is made)
 */
public StatiticsCollectors(Map<String, Double> wordsFreq) {
this.wordsFreq = wordsFreq;
}
private Integer ignoredCount = 0;
@ -38,7 +43,13 @@ public class StatiticsCollectors implements WordProccessor {
suffixCounter = new SuffixCounter(suffixHeuristic);
statititics.put(suffixHeuristic, suffixCounter);
}
suffixCounter.incrementAmount();
Double freq = wordsFreq.get(wordCard.getCanonicalFrom());
if (freq != null) {
suffixCounter.incrementAmount(1 + Math.log(freq));
} else {
suffixCounter.incrementAmount();
}
}
}

View File

@ -32,7 +32,11 @@ public class SuffixCounter implements Comparable {
amnout++;
}
public SuffixHeuristic getSuffixEvristic() {
/**
 * Adds the given weight to the accumulated amount of this counter.
 *
 * @param wordFreq value to add; it is unboxed for the addition, so passing
 *                 {@code null} results in a {@link NullPointerException}
 */
public void incrementAmount(Double wordFreq) {
amnout += wordFreq;
}
public SuffixHeuristic getSuffixHeuristic() {
return suffixHeuristic;
}

View File

@ -26,12 +26,12 @@ import java.io.InputStream;
import java.io.InputStreamReader;
public class SuffixEvristicsTest {
public class SuffixHeuristicTest {
@Test
public void testShouldDefineCorretCononicalWordForm() throws IOException {
SuffixHeuristic suffixHeuristic = new SuffixHeuristic();
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt");
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-heuristic-test-data.txt");
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
String s = bufferedReader.readLine();
while (s != null) {

View File

@ -4,4 +4,5 @@
победы победа
поэтическая поэтический
произошло произойти
test test
test test
ананасов ананас

View File

@ -9,7 +9,7 @@
заход
на
посадка
все
весь
нормальный
быть
рекомендовать
@ -25,7 +25,7 @@
новолазаревский
антарктида
совершаться
примерный
примерно
один
раз
в