Fix a problem with "-" in words: the English morphology did not work correctly because some word forms contain it

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@97 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
alexander.a.kuznetsov
2010-05-29 15:54:44 +00:00
parent 3ca9eb2cb9
commit 7bf8ef7d6f
12 changed files with 6393 additions and 6381 deletions

View File

@ -71,11 +71,17 @@ public class DictonaryReader {
List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1]));
FlexiaModel flexiaModel = models.get(0);
if (models.size() > 0 && !ingnoredForm.contains(flexiaModel.getCode())) {
WordCard card = new WordCard(flexiaModel.create(wordBase), wordBase, flexiaModel.getSuffix());
for (FlexiaModel fm : models) {
card.addFlexia(fm);
}
wordProccessor.proccess(card);
// if(card.getBase().equals("face") || card.getBase().equals("fac")){
// System.out.println(models);
// System.out.println(card);
wordProccessor.process(card);
//}
}
}
}

View File

@ -17,7 +17,7 @@
package org.apache.lucene.morphology.dictionary;
/**
* Represent inofrmation of how word form created form it imutible part.
* Represents information about how a word form is created from its immutable part.
*/
public class FlexiaModel {
private String code;
@ -60,6 +60,10 @@ public class FlexiaModel {
@Override
public String toString() {
return prefix + " " + suffix;
return "FlexiaModel{" +
"code='" + code + '\'' +
", suffix='" + suffix + '\'' +
", prefix='" + prefix + '\'' +
'}';
}
}

View File

@ -26,20 +26,20 @@ import java.util.*;
//todo made refactoring this class
public class StatiticsCollector implements WordProccessor {
private TreeMap<String, Set<Heuristic>> inversIndex = new TreeMap<String, Set<Heuristic>>();
private Map<Set<Heuristic>, Integer> ruleInverIndex = new HashMap<Set<Heuristic>, Integer>();
public class StatisticsCollector implements WordProccessor {
private TreeMap<String, Set<Heuristic>> inverseIndex = new TreeMap<String, Set<Heuristic>>();
private Map<Set<Heuristic>, Integer> ruleInverseIndex = new HashMap<Set<Heuristic>, Integer>();
private List<Set<Heuristic>> rules = new ArrayList<Set<Heuristic>>();
private GrammaReader grammaReader;
private LetterDecoderEncoder decoderEncoder;
public StatiticsCollector(GrammaReader grammaReader, LetterDecoderEncoder decoderEncoder) {
public StatisticsCollector(GrammaReader grammaReader, LetterDecoderEncoder decoderEncoder) {
this.grammaReader = grammaReader;
this.decoderEncoder = decoderEncoder;
}
public void proccess(WordCard wordCard) throws IOException {
public void process(WordCard wordCard) throws IOException {
cleanWordCard(wordCard);
String normalStringMorph = wordCard.getWordsFroms().get(0).getCode();
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
@ -47,13 +47,13 @@ public class StatiticsCollector implements WordProccessor {
if (!decoderEncoder.checkString(word)) return;
for (FlexiaModel fm : wordCard.getWordsFroms()) {
if (!decoderEncoder.checkString(fm.create(wordCard.getBase()))) continue;
if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) continue;
Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph);
String form = revertWord(fm.create(wordCard.getBase()));
Set<Heuristic> suffixHeuristics = inversIndex.get(form);
Set<Heuristic> suffixHeuristics = inverseIndex.get(form);
if (suffixHeuristics == null) {
suffixHeuristics = new HashSet<Heuristic>();
inversIndex.put(form, suffixHeuristics);
inverseIndex.put(form, suffixHeuristics);
}
suffixHeuristics.add(heuristic);
}
@ -76,24 +76,24 @@ public class StatiticsCollector implements WordProccessor {
Map<Integer, Integer> dist = new TreeMap<Integer, Integer>();
Set<Heuristic> prevSet = null;
int count = 0;
for (String key : inversIndex.keySet()) {
Set<Heuristic> currentSet = inversIndex.get(key);
for (String key : inverseIndex.keySet()) {
Set<Heuristic> currentSet = inverseIndex.get(key);
if (!currentSet.equals(prevSet)) {
Integer d = dist.get(key.length());
dist.put(key.length(), 1 + (d == null ? 0 : d));
prevSet = currentSet;
count++;
if (!ruleInverIndex.containsKey(currentSet)) {
ruleInverIndex.put(currentSet, rules.size());
if (!ruleInverseIndex.containsKey(currentSet)) {
ruleInverseIndex.put(currentSet, rules.size());
rules.add(currentSet);
}
}
}
System.out.println("Word with diffirent rules " + count);
System.out.println("All ivers words " + inversIndex.size());
System.out.println("All ivers words " + inverseIndex.size());
System.out.println(dist);
System.out.println("diffirent rule count " + ruleInverIndex.size());
Heuristic[][] heuristics = new Heuristic[ruleInverIndex.size()][];
System.out.println("diffirent rule count " + ruleInverseIndex.size());
Heuristic[][] heuristics = new Heuristic[ruleInverseIndex.size()][];
int index = 0;
for (Set<Heuristic> hs : rules) {
heuristics[index] = new Heuristic[hs.size()];
@ -109,12 +109,12 @@ public class StatiticsCollector implements WordProccessor {
short[] rulesId = new short[count];
count = 0;
prevSet = null;
for (String key : inversIndex.keySet()) {
Set<Heuristic> currentSet = inversIndex.get(key);
for (String key : inverseIndex.keySet()) {
Set<Heuristic> currentSet = inverseIndex.get(key);
if (!currentSet.equals(prevSet)) {
int[] word = decoderEncoder.encodeToArray(key);
ints[count] = word;
rulesId[count] = (short) ruleInverIndex.get(currentSet).intValue();
rulesId[count] = (short) ruleInverseIndex.get(currentSet).intValue();
count++;
prevSet = currentSet;
}

View File

@ -69,4 +69,14 @@ public class WordCard {
public void setWordsFroms(List<FlexiaModel> wordsFroms) {
this.wordsFroms = wordsFroms;
}
/**
 * Builds a debug representation of this card that lists its canonical form,
 * base, canonical suffix and word forms.
 */
@Override
public String toString() {
    StringBuilder text = new StringBuilder("WordCard{");
    text.append("canonicalFrom='").append(canonicalFrom).append('\'');
    text.append(", base='").append(base).append('\'');
    text.append(", canonicalSuffix='").append(canonicalSuffix).append('\'');
    text.append(", wordsFroms=").append(wordsFroms);
    text.append('}');
    return text.toString();
}
}

View File

@ -24,5 +24,5 @@ import java.io.IOException;
*/
public interface WordProccessor {
public void proccess(WordCard wordCard) throws IOException;
public void process(WordCard wordCard) throws IOException;
}

View File

@ -18,7 +18,7 @@ package org.apache.lucene.morphology.generator;
import org.apache.lucene.morphology.dictionary.DictonaryReader;
import org.apache.lucene.morphology.dictionary.GrammaReader;
import org.apache.lucene.morphology.dictionary.StatiticsCollector;
import org.apache.lucene.morphology.dictionary.StatisticsCollector;
import org.apache.lucene.morphology.english.EnglishLetterDecoderEncoder;
import java.io.IOException;
@ -32,9 +32,9 @@ public class EnglishHeuristicBuilder {
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>());
EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
StatiticsCollector statiticsCollector = new StatiticsCollector(grammaInfo, decoderEncoder);
dictonaryReader.proccess(statiticsCollector);
statiticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info");
StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
dictonaryReader.proccess(statisticsCollector);
statisticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info");
}
}

View File

@ -18,7 +18,7 @@ package org.apache.lucene.morphology.generator;
import org.apache.lucene.morphology.dictionary.DictonaryReader;
import org.apache.lucene.morphology.dictionary.GrammaReader;
import org.apache.lucene.morphology.dictionary.StatiticsCollector;
import org.apache.lucene.morphology.dictionary.StatisticsCollector;
import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder;
import java.io.IOException;
@ -31,9 +31,9 @@ public class RussianHeuristicBuilder {
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());
RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
StatiticsCollector statiticsCollector = new StatiticsCollector(grammaInfo, decoderEncoder);
dictonaryReader.proccess(statiticsCollector);
statiticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info");
StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
dictonaryReader.proccess(statisticsCollector);
statisticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info");
}
}