Fixed bug with statistics

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@39 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
alexander.a.kuznetsov 2009-08-11 21:16:02 +00:00
parent 70842ecfb7
commit 422f5c3e44
2 changed files with 44 additions and 29 deletions

View File

@ -18,12 +18,15 @@ package org.apache.lucene.russian.morphology;
import org.apache.lucene.russian.morphology.dictonary.*;
import org.apache.lucene.russian.morphology.heuristic.HeuristicBySuffixLegth;
import org.apache.lucene.russian.morphology.heuristic.SimpleSuffixHeuristic;
import org.apache.lucene.russian.morphology.heuristic.StatiticsCollectors;
import org.apache.lucene.russian.morphology.heuristic.SuffixCounter;
import org.apache.lucene.russian.morphology.heuristic.SimpleSuffixHeuristic;
import java.io.IOException;
import java.util.*;
import java.util.Arrays;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
@ -52,13 +55,25 @@ public class HeuristicBuilder {
heuristic.addHeuristic(((SuffixCounter) objects[i]).getSuffixHeuristic());
}
final Map<Long,Set<SimpleSuffixHeuristic>> map = heuristic.getUnkowns();
System.out.println("Single suffix " + heuristic.getSingleSuffixes().size());
System.out.println("diffiren morgh " + heuristic.getWordWithMorphology().size());
System.out.println("Ononims " + heuristic.getOnonyms().size());
final Map<Long, Set<SimpleSuffixHeuristic>> map = heuristic.getUnkowns();
System.out.println("Unknow suffix " + map.size());
int cont = 0;
for (Set<SimpleSuffixHeuristic> st : map.values()) {
if (cont > 20) break;
if (st.size() < 3) {
System.out.println(st);
cont++;
}
}
//final RussianSuffixDecoderEncoder decoderEncoder = new RussianSuffixDecoderEncoder(6);
final AtomicLong c = new AtomicLong(0L);
final AtomicLong all = new AtomicLong(0L);
dictonaryReader.proccess(
new WordProccessor(){
new WordProccessor() {
public void proccess(WordCard wordCard) throws IOException {
for (FlexiaModel fm : wordCard.getWordsFroms()) {
String form = fm.create(wordCard.getBase());
@ -66,7 +81,7 @@ public class HeuristicBuilder {
String formSuffix = form.substring(startSymbol);
Long aLong = RussianSuffixDecoderEncoder.encode(formSuffix);
all.incrementAndGet();
if(map.containsKey(aLong)) c.incrementAndGet();
if (map.containsKey(aLong)) c.incrementAndGet();
}
}
}

View File

@ -25,47 +25,47 @@ public class HeuristicBySuffixLegth {
return heuristics;
}
public Map<Long,SimpleSuffixHeuristic> getSingleSuffixes(){
public Map<Long, SimpleSuffixHeuristic> getSingleSuffixes() {
HashMap<Long, SimpleSuffixHeuristic> result = new HashMap<Long, SimpleSuffixHeuristic>();
for(Long st:heuristics.keySet()){
if(heuristics.get(st).size() == 1){
result.put(st,heuristics.get(st).iterator().next());
for (Long st : heuristics.keySet()) {
if (heuristics.get(st).size() == 1) {
result.put(st, heuristics.get(st).iterator().next());
}
}
return result;
}
public Map<Long,Set<SimpleSuffixHeuristic>> getWordWithMorphology(){
public Map<Long, Set<SimpleSuffixHeuristic>> getWordWithMorphology() {
HashMap<Long, Set<SimpleSuffixHeuristic>> result = new HashMap<Long, Set<SimpleSuffixHeuristic>>();
for(Long st:heuristics.keySet()){
if(heuristics.get(st).size() == 1) continue;
if(checkSetOnSuffix(heuristics.get(st))) {
result.put(st,heuristics.get(st));
for (Long st : heuristics.keySet()) {
if (heuristics.get(st).size() == 1) continue;
if (checkSetOnSuffix(heuristics.get(st))) {
result.put(st, heuristics.get(st));
}
}
return result;
}
public Map<Long,Set<SimpleSuffixHeuristic>> getOnonyms(){
public Map<Long, Set<SimpleSuffixHeuristic>> getOnonyms() {
HashMap<Long, Set<SimpleSuffixHeuristic>> result = new HashMap<Long, Set<SimpleSuffixHeuristic>>();
for(Long st:heuristics.keySet()){
if(heuristics.get(st).size() == 1) continue;
if(checkSetOnSuffix(heuristics.get(st))) continue;
if(heuristics.get(st).iterator().next().getFormSuffix().length() < 6){
result.put(st,heuristics.get(st));
for (Long st : heuristics.keySet()) {
if (heuristics.get(st).size() == 1) continue;
if (checkSetOnSuffix(heuristics.get(st))) continue;
if (heuristics.get(st).iterator().next().getFormSuffix().length() < 6) {
result.put(st, heuristics.get(st));
}
}
return result;
}
public Map<Long,Set<SimpleSuffixHeuristic>> getUnkowns(){
public Map<Long, Set<SimpleSuffixHeuristic>> getUnkowns() {
HashMap<Long, Set<SimpleSuffixHeuristic>> result = new HashMap<Long, Set<SimpleSuffixHeuristic>>();
for(Long st:heuristics.keySet()){
if(heuristics.get(st).size() == 1) continue;
if(checkSetOnSuffix(heuristics.get(st))) continue;
if(heuristics.get(st).iterator().next().getFormSuffix().length() >= 6){
result.put(st,heuristics.get(st));
for (Long st : heuristics.keySet()) {
if (heuristics.get(st).size() == 1) continue;
if (checkSetOnSuffix(heuristics.get(st))) continue;
if (heuristics.get(st).iterator().next().getFormSuffix().length() >= 6) {
result.put(st, heuristics.get(st));
}
}
return result;
@ -73,10 +73,10 @@ public class HeuristicBySuffixLegth {
private Boolean checkSetOnSuffix(Set<SimpleSuffixHeuristic> sshs) {
SimpleSuffixHeuristic heuristic = sshs.iterator().next();
String normalSuffix = heuristic.getFormSuffix();
String normalSuffix = heuristic.getNormalSuffix();
Integer suffixLenght = heuristic.getActualSuffixLength();
Boolean result = true;
for(SimpleSuffixHeuristic ssh:sshs){
for (SimpleSuffixHeuristic ssh : sshs) {
result = result && ssh.getActualSuffixLength().equals(suffixLenght) && ssh.getNormalSuffix().endsWith(normalSuffix);
}
return result;