diff --git a/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java b/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java index 542a5a9..ab7c32a 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java +++ b/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java @@ -18,12 +18,15 @@ package org.apache.lucene.russian.morphology; import org.apache.lucene.russian.morphology.dictonary.*; import org.apache.lucene.russian.morphology.heuristic.HeuristicBySuffixLegth; +import org.apache.lucene.russian.morphology.heuristic.SimpleSuffixHeuristic; import org.apache.lucene.russian.morphology.heuristic.StatiticsCollectors; import org.apache.lucene.russian.morphology.heuristic.SuffixCounter; -import org.apache.lucene.russian.morphology.heuristic.SimpleSuffixHeuristic; import java.io.IOException; -import java.util.*; +import java.util.Arrays; +import java.util.Collection; +import java.util.Map; +import java.util.Set; import java.util.concurrent.atomic.AtomicLong; @@ -52,13 +55,25 @@ public class HeuristicBuilder { heuristic.addHeuristic(((SuffixCounter) objects[i]).getSuffixHeuristic()); } - final Map> map = heuristic.getUnkowns(); + System.out.println("Single suffix " + heuristic.getSingleSuffixes().size()); + System.out.println("diffiren morgh " + heuristic.getWordWithMorphology().size()); + System.out.println("Ononims " + heuristic.getOnonyms().size()); + final Map> map = heuristic.getUnkowns(); + System.out.println("Unknow suffix " + map.size()); + int cont = 0; + for (Set st : map.values()) { + if (cont > 20) break; + if (st.size() < 3) { + System.out.println(st); + cont++; + } + } //final RussianSuffixDecoderEncoder decoderEncoder = new RussianSuffixDecoderEncoder(6); final AtomicLong c = new AtomicLong(0L); - final AtomicLong all = new AtomicLong(0L); + final AtomicLong all = new AtomicLong(0L); dictonaryReader.proccess( - new WordProccessor(){ + new WordProccessor() { public void proccess(WordCard wordCard) throws IOException { for (FlexiaModel fm : wordCard.getWordsFroms()) { String form = fm.create(wordCard.getBase()); @@ -66,7 +81,7 @@ public class HeuristicBuilder { String formSuffix = form.substring(startSymbol); Long aLong = RussianSuffixDecoderEncoder.encode(formSuffix); all.incrementAndGet(); - if(map.containsKey(aLong)) c.incrementAndGet(); + if (map.containsKey(aLong)) c.incrementAndGet(); } } } diff --git a/src/main/java/org/apache/lucene/russian/morphology/heuristic/HeuristicBySuffixLegth.java b/src/main/java/org/apache/lucene/russian/morphology/heuristic/HeuristicBySuffixLegth.java index 4432e3b..25caedb 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/heuristic/HeuristicBySuffixLegth.java +++ b/src/main/java/org/apache/lucene/russian/morphology/heuristic/HeuristicBySuffixLegth.java @@ -25,47 +25,47 @@ public class HeuristicBySuffixLegth { return heuristics; } - public Map getSingleSuffixes(){ + public Map getSingleSuffixes() { HashMap result = new HashMap(); - for(Long st:heuristics.keySet()){ - if(heuristics.get(st).size() == 1){ - result.put(st,heuristics.get(st).iterator().next()); + for (Long st : heuristics.keySet()) { + if (heuristics.get(st).size() == 1) { + result.put(st, heuristics.get(st).iterator().next()); } } return result; } - public Map> getWordWithMorphology(){ + public Map> getWordWithMorphology() { HashMap> result = new HashMap>(); - for(Long st:heuristics.keySet()){ - if(heuristics.get(st).size() == 1) continue; - if(checkSetOnSuffix(heuristics.get(st))) { - result.put(st,heuristics.get(st)); + for (Long st : heuristics.keySet()) { + if (heuristics.get(st).size() == 1) continue; + if (checkSetOnSuffix(heuristics.get(st))) { + result.put(st, heuristics.get(st)); } } return result; } - public Map> getOnonyms(){ + public Map> getOnonyms() { HashMap> result = new HashMap>(); - for(Long st:heuristics.keySet()){ - if(heuristics.get(st).size() == 1) continue; - if(checkSetOnSuffix(heuristics.get(st))) continue; - if(heuristics.get(st).iterator().next().getFormSuffix().length() < 6){ - result.put(st,heuristics.get(st)); + for (Long st : heuristics.keySet()) { + if (heuristics.get(st).size() == 1) continue; + if (checkSetOnSuffix(heuristics.get(st))) continue; + if (heuristics.get(st).iterator().next().getFormSuffix().length() < 6) { + result.put(st, heuristics.get(st)); } } return result; } - public Map> getUnkowns(){ + public Map> getUnkowns() { HashMap> result = new HashMap>(); - for(Long st:heuristics.keySet()){ - if(heuristics.get(st).size() == 1) continue; - if(checkSetOnSuffix(heuristics.get(st))) continue; - if(heuristics.get(st).iterator().next().getFormSuffix().length() >= 6){ - result.put(st,heuristics.get(st)); + for (Long st : heuristics.keySet()) { + if (heuristics.get(st).size() == 1) continue; + if (checkSetOnSuffix(heuristics.get(st))) continue; + if (heuristics.get(st).iterator().next().getFormSuffix().length() >= 6) { + result.put(st, heuristics.get(st)); } } return result; @@ -73,10 +73,10 @@ public class HeuristicBySuffixLegth { private Boolean checkSetOnSuffix(Set sshs) { SimpleSuffixHeuristic heuristic = sshs.iterator().next(); - String normalSuffix = heuristic.getFormSuffix(); + String normalSuffix = heuristic.getNormalSuffix(); Integer suffixLenght = heuristic.getActualSuffixLength(); Boolean result = true; - for(SimpleSuffixHeuristic ssh:sshs){ + for (SimpleSuffixHeuristic ssh : sshs) { result = result && ssh.getActualSuffixLength().equals(suffixLenght) && ssh.getNormalSuffix().endsWith(normalSuffix); } return result;