working on new model for russian morphology

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@42 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
alexander.a.kuznetsov 2009-08-14 12:59:05 +00:00
parent dbec253529
commit f2856e5696
2 changed files with 169 additions and 49 deletions

View File

@ -39,56 +39,63 @@ public class HeuristicBuilder {
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form);
StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read(), grammaInfo);
dictonaryReader.proccess(statiticsCollectors);
Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values();
Object[] objects = counterCollection.toArray();
Arrays.sort(objects);
System.out.println("Length " + objects.length + " ingored words " + statiticsCollectors.getIgnoredCount());
for (int i = 0; i < 10; i++) {
System.out.println(objects[i]);
}
final HeuristicBySuffixLegth heuristic = new HeuristicBySuffixLegth();
for (int i = 0; i < objects.length; i++) {
heuristic.addHeuristic(((SuffixCounter) objects[i]).getSuffixHeuristic());
}
System.out.println("Single suffix " + heuristic.getSingleSuffixes().size());
System.out.println("diffiren morgh " + heuristic.getWordWithMorphology().size());
System.out.println("Ononims " + heuristic.getOnonyms().size());
final Map<Long, Set<SimpleSuffixHeuristic>> map = heuristic.getUnkowns();
System.out.println("Unknow suffix " + map.size());
int cont = 0;
for (Set<SimpleSuffixHeuristic> st : map.values()) {
if (cont > 50) break;
if (st.size() < 3) {
System.out.println(st);
cont++;
}
}
//final RussianSuffixDecoderEncoder decoderEncoder = new RussianSuffixDecoderEncoder(6);
final AtomicLong c = new AtomicLong(0L);
final AtomicLong all = new AtomicLong(0L);
dictonaryReader.proccess(
new WordProccessor() {
public void proccess(WordCard wordCard) throws IOException {
for (FlexiaModel fm : wordCard.getWordsFroms()) {
String form = fm.create(wordCard.getBase());
int startSymbol = form.length() > RussianSuffixDecoderEncoder.suffixLength ? form.length() - RussianSuffixDecoderEncoder.suffixLength : 0;
String formSuffix = form.substring(startSymbol);
Long aLong = RussianSuffixDecoderEncoder.encode(formSuffix);
all.incrementAndGet();
if (map.containsKey(aLong)) c.incrementAndGet();
}
}
}
);
NewModel newModel = new NewModel();
dictonaryReader.proccess(newModel);
newModel.printInfo();
System.out.println("Ankown words " + all.longValue());
System.out.println("Ankown words " + c.longValue());
// StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read());
// dictonaryReader.proccess(statiticsCollectors);
// Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values();
// Object[] objects = counterCollection.toArray();
// Arrays.sort(objects);
// System.out.println("Length " + objects.length + " ingored words " + statiticsCollectors.getIgnoredCount());
// for (int i = 0; i < 10; i++) {
// System.out.println(objects[i]);
// }
//
// final HeuristicBySuffixLegth heuristic = new HeuristicBySuffixLegth();
// for (int i = 0; i < objects.length; i++) {
// heuristic.addHeuristic(((SuffixCounter) objects[i]).getSuffixHeuristic());
// }
//
// System.out.println("Single suffix " + heuristic.getSingleSuffixes().size());
// System.out.println("diffiren morgh " + heuristic.getWordWithMorphology().size());
// System.out.println("Ononims " + heuristic.getOnonyms().size());
// final Map<Long, Set<SimpleSuffixHeuristic>> map = heuristic.getUnkowns();
// System.out.println("Unknow suffix " + map.size());
// int cont = 0;
// for (Set<SimpleSuffixHeuristic> st : map.values()) {
//
// if (cont > 50) break;
// if (st.size() < 3) {
// System.out.println(st);
// cont++;
// }
// }
// //final RussianSuffixDecoderEncoder decoderEncoder = new RussianSuffixDecoderEncoder(6);
// final AtomicLong c = new AtomicLong(0L);
// final AtomicLong all = new AtomicLong(0L);
// dictonaryReader.proccess(
// new WordProccessor() {
// public void proccess(WordCard wordCard) throws IOException {
// for (FlexiaModel fm : wordCard.getWordsFroms()) {
// String form = fm.create(wordCard.getBase());
// if(form.startsWith("ïðèê") && form.endsWith("üÿ")) System.out.println(form);
//
//
// int startSymbol = form.length() > RussianSuffixDecoderEncoder.suffixLength ? form.length() - RussianSuffixDecoderEncoder.suffixLength : 0;
// String formSuffix = form.substring(startSymbol);
// Long aLong = RussianSuffixDecoderEncoder.encode(formSuffix);
// all.incrementAndGet();
// if (map.containsKey(aLong)) c.incrementAndGet();
// }
// }
// }
// );
//
//
// System.out.println("Ankown words " + all.longValue());
// System.out.println("Ankown words " + c.longValue());
}
}

View File

@ -0,0 +1,113 @@
package org.apache.lucene.russian.morphology;
import org.apache.lucene.russian.morphology.heuristic.SimpleSuffixHeuristic;
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
import org.apache.lucene.russian.morphology.dictonary.WordCard;
import org.apache.lucene.russian.morphology.dictonary.FlexiaModel;
import java.util.TreeMap;
import java.util.Set;
import java.util.HashSet;
import java.io.IOException;
public class NewModel implements WordProccessor{
private TreeMap<String, Set<Heuristic>> inversIndex = new TreeMap<String,Set<Heuristic>>();
public void proccess(WordCard wordCard) throws IOException {
String normalStringMorph = wordCard.getWordsFroms().get(0).getCode();
for (FlexiaModel fm : wordCard.getWordsFroms()) {
Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph);
String form = revertWord(fm.create(wordCard.getBase()));
Set<Heuristic> suffixHeuristics = inversIndex.get(form);
if(suffixHeuristics == null){
suffixHeuristics = new HashSet<Heuristic>();
inversIndex.put(form,suffixHeuristics);
}
suffixHeuristics.add(heuristic);
}
}
public void printInfo(){
System.out.println("All ivers words " + inversIndex.size());
Set<Heuristic> prevSet = null;
int count = 0;
for(Set<Heuristic> currentSet:inversIndex.values()){
if(!currentSet.equals(prevSet)){
prevSet = currentSet;
count++;
}
}
System.out.println("Word with diffirent rules " + count);
}
private String revertWord(String s){
String result = "";
for (int i = 1; i <= s.length(); i++) {
result += s.charAt(s.length() - i);
}
return result;
}
private Heuristic createEvristic(String wordBase, String canonicalSuffix, FlexiaModel fm, String normalSuffixForm) {
String form = fm.create(wordBase);
String normalForm = wordBase + canonicalSuffix;
Integer length = getCommonLength(form, normalForm);
Integer actualSuffixLengh = form.length() - length;
String actualNormalSuffix = normalForm.substring(length);
return new Heuristic(actualSuffixLengh, actualNormalSuffix, fm.getCode(), normalSuffixForm);
}
public static Integer getCommonLength(String s1, String s2) {
Integer length = Math.min(s1.length(), s2.length());
for (int i = 0; i < length; i++) {
if (s1.charAt(i) != s2.charAt(i)) return i;
}
return length;
}
private class Heuristic{
Integer actualSuffixLengh;
String actualNormalSuffix;
String formMorphInfo;
String normalSuffixForm;
private Heuristic(Integer actualSuffixLengh, String actualNormalSuffix, String formMorphInfo, String normalSuffixForm) {
this.actualSuffixLengh = actualSuffixLengh;
this.actualNormalSuffix = actualNormalSuffix;
this.formMorphInfo = formMorphInfo;
this.normalSuffixForm = normalSuffixForm;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Heuristic heuristic = (Heuristic) o;
if (actualNormalSuffix != null ? !actualNormalSuffix.equals(heuristic.actualNormalSuffix) : heuristic.actualNormalSuffix != null)
return false;
if (actualSuffixLengh != null ? !actualSuffixLengh.equals(heuristic.actualSuffixLengh) : heuristic.actualSuffixLengh != null)
return false;
if (formMorphInfo != null ? !formMorphInfo.equals(heuristic.formMorphInfo) : heuristic.formMorphInfo != null)
return false;
if (normalSuffixForm != null ? !normalSuffixForm.equals(heuristic.normalSuffixForm) : heuristic.normalSuffixForm != null)
return false;
return true;
}
@Override
public int hashCode() {
int result = actualSuffixLengh != null ? actualSuffixLengh.hashCode() : 0;
result = 31 * result + (actualNormalSuffix != null ? actualNormalSuffix.hashCode() : 0);
result = 31 * result + (formMorphInfo != null ? formMorphInfo.hashCode() : 0);
result = 31 * result + (normalSuffixForm != null ? normalSuffixForm.hashCode() : 0);
return result;
}
}
}