git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@41 d817d54c-26ab-11de-abc9-2f7d1455ff7a

This commit is contained in:
alexander.a.kuznetsov 2009-08-14 06:07:29 +00:00
parent d58c45a78c
commit dbec253529

View File

@ -18,7 +18,6 @@ package org.apache.lucene.russian.morphology.heuristic;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder; import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import org.apache.lucene.russian.morphology.dictonary.FlexiaModel; import org.apache.lucene.russian.morphology.dictonary.FlexiaModel;
import org.apache.lucene.russian.morphology.dictonary.GrammaReader;
import org.apache.lucene.russian.morphology.dictonary.WordCard; import org.apache.lucene.russian.morphology.dictonary.WordCard;
import org.apache.lucene.russian.morphology.dictonary.WordProccessor; import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
@ -29,11 +28,10 @@ import java.util.Map;
public class StatiticsCollectors implements WordProccessor { public class StatiticsCollectors implements WordProccessor {
Map<SimpleSuffixHeuristic, SuffixCounter> statititics = new HashMap<SimpleSuffixHeuristic, SuffixCounter>(); Map<SimpleSuffixHeuristic, SuffixCounter> statititics = new HashMap<SimpleSuffixHeuristic, SuffixCounter>();
private Map<String, Double> wordsFreq; private Map<String, Double> wordsFreq;
private GrammaReader grammaInfo;
public StatiticsCollectors(Map<String, Double> wordsFreq, GrammaReader grammaInfo) {
public StatiticsCollectors(Map<String, Double> wordsFreq) {
this.wordsFreq = wordsFreq; this.wordsFreq = wordsFreq;
this.grammaInfo = grammaInfo;
} }
private Integer ignoredCount = 0; private Integer ignoredCount = 0;
@ -66,9 +64,19 @@ public class StatiticsCollectors implements WordProccessor {
String form = fm.create(wordBase); String form = fm.create(wordBase);
int startSymbol = form.length() > RussianSuffixDecoderEncoder.suffixLength ? form.length() - RussianSuffixDecoderEncoder.suffixLength : 0; int startSymbol = form.length() > RussianSuffixDecoderEncoder.suffixLength ? form.length() - RussianSuffixDecoderEncoder.suffixLength : 0;
String formSuffix = form.substring(startSymbol); String formSuffix = form.substring(startSymbol);
String actualSuffix = fm.getSuffix(); String normalForm = wordBase + canonicalSuffix;
Integer actualSuffixLengh = actualSuffix.length(); Integer length = getCommonLength(form, normalForm);
return new SimpleSuffixHeuristic(formSuffix, actualSuffixLengh, canonicalSuffix, fm.getCode(), normalSuffixForm); Integer actualSuffixLengh = form.length() - length;
String actualNormalSuffix = normalForm.substring(length);
return new SimpleSuffixHeuristic(formSuffix, actualSuffixLengh, actualNormalSuffix, fm.getCode(), normalSuffixForm);
}
public static Integer getCommonLength(String s1, String s2) {
Integer length = Math.min(s1.length(), s2.length());
for (int i = 0; i < length; i++) {
if (s1.charAt(i) != s2.charAt(i)) return i;
}
return length;
} }