fixed bug with statistics

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@39 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
alexander.a.kuznetsov 2009-08-11 21:16:02 +00:00
parent 70842ecfb7
commit 422f5c3e44
2 changed files with 44 additions and 29 deletions

View File

@ -18,12 +18,15 @@ package org.apache.lucene.russian.morphology;
import org.apache.lucene.russian.morphology.dictonary.*; import org.apache.lucene.russian.morphology.dictonary.*;
import org.apache.lucene.russian.morphology.heuristic.HeuristicBySuffixLegth; import org.apache.lucene.russian.morphology.heuristic.HeuristicBySuffixLegth;
import org.apache.lucene.russian.morphology.heuristic.SimpleSuffixHeuristic;
import org.apache.lucene.russian.morphology.heuristic.StatiticsCollectors; import org.apache.lucene.russian.morphology.heuristic.StatiticsCollectors;
import org.apache.lucene.russian.morphology.heuristic.SuffixCounter; import org.apache.lucene.russian.morphology.heuristic.SuffixCounter;
import org.apache.lucene.russian.morphology.heuristic.SimpleSuffixHeuristic;
import java.io.IOException; import java.io.IOException;
import java.util.*; import java.util.Arrays;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicLong;
@ -52,13 +55,25 @@ public class HeuristicBuilder {
heuristic.addHeuristic(((SuffixCounter) objects[i]).getSuffixHeuristic()); heuristic.addHeuristic(((SuffixCounter) objects[i]).getSuffixHeuristic());
} }
final Map<Long,Set<SimpleSuffixHeuristic>> map = heuristic.getUnkowns(); System.out.println("Single suffix " + heuristic.getSingleSuffixes().size());
System.out.println("diffiren morgh " + heuristic.getWordWithMorphology().size());
System.out.println("Ononims " + heuristic.getOnonyms().size());
final Map<Long, Set<SimpleSuffixHeuristic>> map = heuristic.getUnkowns();
System.out.println("Unknow suffix " + map.size());
int cont = 0;
for (Set<SimpleSuffixHeuristic> st : map.values()) {
if (cont > 20) break;
if (st.size() < 3) {
System.out.println(st);
cont++;
}
}
//final RussianSuffixDecoderEncoder decoderEncoder = new RussianSuffixDecoderEncoder(6); //final RussianSuffixDecoderEncoder decoderEncoder = new RussianSuffixDecoderEncoder(6);
final AtomicLong c = new AtomicLong(0L); final AtomicLong c = new AtomicLong(0L);
final AtomicLong all = new AtomicLong(0L); final AtomicLong all = new AtomicLong(0L);
dictonaryReader.proccess( dictonaryReader.proccess(
new WordProccessor(){ new WordProccessor() {
public void proccess(WordCard wordCard) throws IOException { public void proccess(WordCard wordCard) throws IOException {
for (FlexiaModel fm : wordCard.getWordsFroms()) { for (FlexiaModel fm : wordCard.getWordsFroms()) {
String form = fm.create(wordCard.getBase()); String form = fm.create(wordCard.getBase());
@ -66,7 +81,7 @@ public class HeuristicBuilder {
String formSuffix = form.substring(startSymbol); String formSuffix = form.substring(startSymbol);
Long aLong = RussianSuffixDecoderEncoder.encode(formSuffix); Long aLong = RussianSuffixDecoderEncoder.encode(formSuffix);
all.incrementAndGet(); all.incrementAndGet();
if(map.containsKey(aLong)) c.incrementAndGet(); if (map.containsKey(aLong)) c.incrementAndGet();
} }
} }
} }

View File

@ -25,47 +25,47 @@ public class HeuristicBySuffixLegth {
return heuristics; return heuristics;
} }
public Map<Long,SimpleSuffixHeuristic> getSingleSuffixes(){ public Map<Long, SimpleSuffixHeuristic> getSingleSuffixes() {
HashMap<Long, SimpleSuffixHeuristic> result = new HashMap<Long, SimpleSuffixHeuristic>(); HashMap<Long, SimpleSuffixHeuristic> result = new HashMap<Long, SimpleSuffixHeuristic>();
for(Long st:heuristics.keySet()){ for (Long st : heuristics.keySet()) {
if(heuristics.get(st).size() == 1){ if (heuristics.get(st).size() == 1) {
result.put(st,heuristics.get(st).iterator().next()); result.put(st, heuristics.get(st).iterator().next());
} }
} }
return result; return result;
} }
public Map<Long,Set<SimpleSuffixHeuristic>> getWordWithMorphology(){ public Map<Long, Set<SimpleSuffixHeuristic>> getWordWithMorphology() {
HashMap<Long, Set<SimpleSuffixHeuristic>> result = new HashMap<Long, Set<SimpleSuffixHeuristic>>(); HashMap<Long, Set<SimpleSuffixHeuristic>> result = new HashMap<Long, Set<SimpleSuffixHeuristic>>();
for(Long st:heuristics.keySet()){ for (Long st : heuristics.keySet()) {
if(heuristics.get(st).size() == 1) continue; if (heuristics.get(st).size() == 1) continue;
if(checkSetOnSuffix(heuristics.get(st))) { if (checkSetOnSuffix(heuristics.get(st))) {
result.put(st,heuristics.get(st)); result.put(st, heuristics.get(st));
} }
} }
return result; return result;
} }
public Map<Long,Set<SimpleSuffixHeuristic>> getOnonyms(){ public Map<Long, Set<SimpleSuffixHeuristic>> getOnonyms() {
HashMap<Long, Set<SimpleSuffixHeuristic>> result = new HashMap<Long, Set<SimpleSuffixHeuristic>>(); HashMap<Long, Set<SimpleSuffixHeuristic>> result = new HashMap<Long, Set<SimpleSuffixHeuristic>>();
for(Long st:heuristics.keySet()){ for (Long st : heuristics.keySet()) {
if(heuristics.get(st).size() == 1) continue; if (heuristics.get(st).size() == 1) continue;
if(checkSetOnSuffix(heuristics.get(st))) continue; if (checkSetOnSuffix(heuristics.get(st))) continue;
if(heuristics.get(st).iterator().next().getFormSuffix().length() < 6){ if (heuristics.get(st).iterator().next().getFormSuffix().length() < 6) {
result.put(st,heuristics.get(st)); result.put(st, heuristics.get(st));
} }
} }
return result; return result;
} }
public Map<Long,Set<SimpleSuffixHeuristic>> getUnkowns(){ public Map<Long, Set<SimpleSuffixHeuristic>> getUnkowns() {
HashMap<Long, Set<SimpleSuffixHeuristic>> result = new HashMap<Long, Set<SimpleSuffixHeuristic>>(); HashMap<Long, Set<SimpleSuffixHeuristic>> result = new HashMap<Long, Set<SimpleSuffixHeuristic>>();
for(Long st:heuristics.keySet()){ for (Long st : heuristics.keySet()) {
if(heuristics.get(st).size() == 1) continue; if (heuristics.get(st).size() == 1) continue;
if(checkSetOnSuffix(heuristics.get(st))) continue; if (checkSetOnSuffix(heuristics.get(st))) continue;
if(heuristics.get(st).iterator().next().getFormSuffix().length() >= 6){ if (heuristics.get(st).iterator().next().getFormSuffix().length() >= 6) {
result.put(st,heuristics.get(st)); result.put(st, heuristics.get(st));
} }
} }
return result; return result;
@ -73,10 +73,10 @@ public class HeuristicBySuffixLegth {
private Boolean checkSetOnSuffix(Set<SimpleSuffixHeuristic> sshs) { private Boolean checkSetOnSuffix(Set<SimpleSuffixHeuristic> sshs) {
SimpleSuffixHeuristic heuristic = sshs.iterator().next(); SimpleSuffixHeuristic heuristic = sshs.iterator().next();
String normalSuffix = heuristic.getFormSuffix(); String normalSuffix = heuristic.getNormalSuffix();
Integer suffixLenght = heuristic.getActualSuffixLength(); Integer suffixLenght = heuristic.getActualSuffixLength();
Boolean result = true; Boolean result = true;
for(SimpleSuffixHeuristic ssh:sshs){ for (SimpleSuffixHeuristic ssh : sshs) {
result = result && ssh.getActualSuffixLength().equals(suffixLenght) && ssh.getNormalSuffix().endsWith(normalSuffix); result = result && ssh.getActualSuffixLength().equals(suffixLenght) && ssh.getNormalSuffix().endsWith(normalSuffix);
} }
return result; return result;