taking into account word text statistics
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@25 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
parent
c68fbb0827
commit
ad97c3f275
32459
data/lemma.num
Normal file
32459
data/lemma.num
Normal file
File diff suppressed because it is too large
Load Diff
@ -17,6 +17,7 @@
|
|||||||
package org.apache.lucene.russian.morphology;
|
package org.apache.lucene.russian.morphology;
|
||||||
|
|
||||||
import org.apache.lucene.russian.morphology.dictonary.DictonaryReader;
|
import org.apache.lucene.russian.morphology.dictonary.DictonaryReader;
|
||||||
|
import org.apache.lucene.russian.morphology.dictonary.FrequentyReader;
|
||||||
import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
|
import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
|
||||||
import org.apache.lucene.russian.morphology.heuristic.Heuristic;
|
import org.apache.lucene.russian.morphology.heuristic.Heuristic;
|
||||||
import org.apache.lucene.russian.morphology.heuristic.StatiticsCollectors;
|
import org.apache.lucene.russian.morphology.heuristic.StatiticsCollectors;
|
||||||
@ -33,8 +34,11 @@ public class HeuristicBuilder {
|
|||||||
IgnoredFormReader formReader = new IgnoredFormReader("data/igoredFrom.txt");
|
IgnoredFormReader formReader = new IgnoredFormReader("data/igoredFrom.txt");
|
||||||
Set<String> form = formReader.getIngnoredFroms();
|
Set<String> form = formReader.getIngnoredFroms();
|
||||||
|
|
||||||
|
FrequentyReader frequentyReader = new FrequentyReader("data/lemma.num");
|
||||||
|
|
||||||
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form);
|
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form);
|
||||||
StatiticsCollectors statiticsCollectors = new StatiticsCollectors();
|
|
||||||
|
StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read());
|
||||||
dictonaryReader.proccess(statiticsCollectors);
|
dictonaryReader.proccess(statiticsCollectors);
|
||||||
Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values();
|
Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values();
|
||||||
Object[] objects = counterCollection.toArray();
|
Object[] objects = counterCollection.toArray();
|
||||||
@ -46,9 +50,9 @@ public class HeuristicBuilder {
|
|||||||
|
|
||||||
final Heuristic heuristic = new Heuristic();
|
final Heuristic heuristic = new Heuristic();
|
||||||
for (int i = 0; i < objects.length; i++) {
|
for (int i = 0; i < objects.length; i++) {
|
||||||
heuristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic());
|
heuristic.addHeuristic(((SuffixCounter) objects[i]).getSuffixHeuristic());
|
||||||
}
|
}
|
||||||
|
|
||||||
heuristic.writeToFile("src/main/resources/org/apache/lucene/russian/morpholgy/russianSuffixesEvristics.txt");
|
heuristic.writeToFile("russianSuffixesHeuristic.txt");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -41,7 +41,7 @@ public class SuffixHeuristic {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public void readFromResource() throws IOException {
|
public void readFromResource() throws IOException {
|
||||||
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/russianSuffixesEvristics.txt");
|
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/russianSuffixesHeuristic.txt");
|
||||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream));
|
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream));
|
||||||
readFromBufferedRreader(bufferedReader);
|
readFromBufferedRreader(bufferedReader);
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,56 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.russian.morphology.dictonary;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Reads a word-frequency list from a text file into a map.
 *
 * Each non-blank line is split on single spaces; the second field is parsed
 * as the frequency value and the third field is used as the word (map key).
 * Any other fields on the line are ignored.
 */
public class FrequentyReader {
    /** Encoding used when the caller does not supply one. */
    private static final String DEFAULT_ENCODING = "windows-1251";

    private final String fileName;
    private final String fileEncoding;

    /**
     * Creates a reader for the given file using the default
     * "windows-1251" encoding.
     *
     * @param fileName path of the frequency file to read
     */
    public FrequentyReader(String fileName) {
        this(fileName, DEFAULT_ENCODING);
    }

    /**
     * Creates a reader for the given file with an explicit encoding.
     *
     * @param fileName     path of the frequency file to read
     * @param fileEncoding charset name used to decode the file
     */
    public FrequentyReader(String fileName, String fileEncoding) {
        this.fileName = fileName;
        this.fileEncoding = fileEncoding;
    }

    /**
     * Reads the whole file and returns the word-to-frequency mapping.
     * Blank lines are skipped. If the same word occurs more than once,
     * the value from the last occurrence wins.
     *
     * @return a new mutable map from word to frequency
     * @throws IOException if the file cannot be opened, decoded or read
     * @throws ArrayIndexOutOfBoundsException if a non-blank line has fewer
     *         than three space-separated fields
     * @throws NumberFormatException if the second field is not a number
     */
    public Map<String, Double> read() throws IOException {
        Map<String, Double> result = new HashMap<String, Double>();

        // Open the raw stream first so it is released even when the reader
        // cannot be constructed (e.g. unsupported encoding name).
        FileInputStream stream = new FileInputStream(fileName);
        try {
            BufferedReader bufferedReader = new BufferedReader(
                    new InputStreamReader(stream, fileEncoding));
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                // Tolerate empty lines (typically a trailing newline at EOF).
                if (line.trim().length() == 0) {
                    continue;
                }
                String[] fields = line.split(" ");
                Double value = Double.valueOf(fields[1]);
                result.put(fields[2], value);
            }
        } finally {
            // Always release the file handle, on success and on failure.
            stream.close();
        }
        return result;
    }
}
|
@ -28,7 +28,7 @@ import java.util.TreeMap;
|
|||||||
public class Heuristic {
|
public class Heuristic {
|
||||||
private TreeMap<Long, Long> encodedSuffixesPairs = new TreeMap<Long, Long>();
|
private TreeMap<Long, Long> encodedSuffixesPairs = new TreeMap<Long, Long>();
|
||||||
|
|
||||||
public void addEvristic(SuffixHeuristic suffixHeuristic) {
|
public void addHeuristic(SuffixHeuristic suffixHeuristic) {
|
||||||
Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix());
|
Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix());
|
||||||
Long longs = encodedSuffixesPairs.get(suffix);
|
Long longs = encodedSuffixesPairs.get(suffix);
|
||||||
if (longs == null) {
|
if (longs == null) {
|
||||||
|
@ -26,6 +26,11 @@ import java.util.Map;
|
|||||||
|
|
||||||
public class StatiticsCollectors implements WordProccessor {
|
public class StatiticsCollectors implements WordProccessor {
|
||||||
Map<SuffixHeuristic, SuffixCounter> statititics = new HashMap<SuffixHeuristic, SuffixCounter>();
|
Map<SuffixHeuristic, SuffixCounter> statititics = new HashMap<SuffixHeuristic, SuffixCounter>();
|
||||||
|
private Map<String, Double> wordsFreq;
|
||||||
|
|
||||||
|
public StatiticsCollectors(Map<String, Double> wordsFreq) {
|
||||||
|
this.wordsFreq = wordsFreq;
|
||||||
|
}
|
||||||
|
|
||||||
private Integer ignoredCount = 0;
|
private Integer ignoredCount = 0;
|
||||||
|
|
||||||
@ -38,7 +43,13 @@ public class StatiticsCollectors implements WordProccessor {
|
|||||||
suffixCounter = new SuffixCounter(suffixHeuristic);
|
suffixCounter = new SuffixCounter(suffixHeuristic);
|
||||||
statititics.put(suffixHeuristic, suffixCounter);
|
statititics.put(suffixHeuristic, suffixCounter);
|
||||||
}
|
}
|
||||||
suffixCounter.incrementAmount();
|
Double freq = wordsFreq.get(wordCard.getCanonicalFrom());
|
||||||
|
if (freq != null) {
|
||||||
|
suffixCounter.incrementAmount(1 + Math.log(freq));
|
||||||
|
} else {
|
||||||
|
suffixCounter.incrementAmount();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -32,7 +32,11 @@ public class SuffixCounter implements Comparable {
|
|||||||
amnout++;
|
amnout++;
|
||||||
}
|
}
|
||||||
|
|
||||||
public SuffixHeuristic getSuffixEvristic() {
|
public void incrementAmount(Double wordFreq) {
|
||||||
|
amnout += wordFreq;
|
||||||
|
}
|
||||||
|
|
||||||
|
public SuffixHeuristic getSuffixHeuristic() {
|
||||||
return suffixHeuristic;
|
return suffixHeuristic;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -26,12 +26,12 @@ import java.io.InputStream;
|
|||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
|
|
||||||
|
|
||||||
public class SuffixEvristicsTest {
|
public class SuffixHeuristicTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testShouldDefineCorretCononicalWordForm() throws IOException {
|
public void testShouldDefineCorretCononicalWordForm() throws IOException {
|
||||||
SuffixHeuristic suffixHeuristic = new SuffixHeuristic();
|
SuffixHeuristic suffixHeuristic = new SuffixHeuristic();
|
||||||
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt");
|
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-heuristic-test-data.txt");
|
||||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||||
String s = bufferedReader.readLine();
|
String s = bufferedReader.readLine();
|
||||||
while (s != null) {
|
while (s != null) {
|
@ -4,4 +4,5 @@
|
|||||||
победы победа
|
победы победа
|
||||||
поэтическая поэтический
|
поэтическая поэтический
|
||||||
произошло произойти
|
произошло произойти
|
||||||
test test
|
test test
|
||||||
|
ананасов ананас
|
@ -9,7 +9,7 @@
|
|||||||
заход
|
заход
|
||||||
на
|
на
|
||||||
посадка
|
посадка
|
||||||
все
|
весь
|
||||||
нормальный
|
нормальный
|
||||||
быть
|
быть
|
||||||
рекомендовать
|
рекомендовать
|
||||||
@ -25,7 +25,7 @@
|
|||||||
новолазаревский
|
новолазаревский
|
||||||
антарктида
|
антарктида
|
||||||
совершаться
|
совершаться
|
||||||
примерный
|
примерно
|
||||||
один
|
один
|
||||||
раз
|
раз
|
||||||
в
|
в
|
||||||
|
Loading…
x
Reference in New Issue
Block a user