diff --git a/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java b/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java index 94e4c2b..5bae4b8 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java +++ b/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java @@ -39,56 +39,63 @@ public class HeuristicBuilder { GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab"); DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form); - - StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read(), grammaInfo); - dictonaryReader.proccess(statiticsCollectors); - Collection counterCollection = statiticsCollectors.getStatititics().values(); - Object[] objects = counterCollection.toArray(); - Arrays.sort(objects); - System.out.println("Length " + objects.length + " ingored words " + statiticsCollectors.getIgnoredCount()); - for (int i = 0; i < 10; i++) { - System.out.println(objects[i]); - } - - final HeuristicBySuffixLegth heuristic = new HeuristicBySuffixLegth(); - for (int i = 0; i < objects.length; i++) { - heuristic.addHeuristic(((SuffixCounter) objects[i]).getSuffixHeuristic()); - } - - System.out.println("Single suffix " + heuristic.getSingleSuffixes().size()); - System.out.println("diffiren morgh " + heuristic.getWordWithMorphology().size()); - System.out.println("Ononims " + heuristic.getOnonyms().size()); - final Map> map = heuristic.getUnkowns(); - System.out.println("Unknow suffix " + map.size()); - int cont = 0; - for (Set st : map.values()) { - - if (cont > 50) break; - if (st.size() < 3) { - System.out.println(st); - cont++; - } - } - //final RussianSuffixDecoderEncoder decoderEncoder = new RussianSuffixDecoderEncoder(6); - final AtomicLong c = new AtomicLong(0L); - final AtomicLong all = new AtomicLong(0L); - dictonaryReader.proccess( - new WordProccessor() { - public void proccess(WordCard wordCard) throws IOException { - for (FlexiaModel fm : wordCard.getWordsFroms()) { - String form = fm.create(wordCard.getBase()); - int startSymbol = form.length() > RussianSuffixDecoderEncoder.suffixLength ? form.length() - RussianSuffixDecoderEncoder.suffixLength : 0; - String formSuffix = form.substring(startSymbol); - Long aLong = RussianSuffixDecoderEncoder.encode(formSuffix); - all.incrementAndGet(); - if (map.containsKey(aLong)) c.incrementAndGet(); - } - } - } - ); + NewModel newModel = new NewModel(); + dictonaryReader.proccess(newModel); + newModel.printInfo(); - System.out.println("Ankown words " + all.longValue()); - System.out.println("Ankown words " + c.longValue()); +// StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read()); +// dictonaryReader.proccess(statiticsCollectors); +// Collection counterCollection = statiticsCollectors.getStatititics().values(); +// Object[] objects = counterCollection.toArray(); +// Arrays.sort(objects); +// System.out.println("Length " + objects.length + " ingored words " + statiticsCollectors.getIgnoredCount()); +// for (int i = 0; i < 10; i++) { +// System.out.println(objects[i]); +// } +// +// final HeuristicBySuffixLegth heuristic = new HeuristicBySuffixLegth(); +// for (int i = 0; i < objects.length; i++) { +// heuristic.addHeuristic(((SuffixCounter) objects[i]).getSuffixHeuristic()); +// } +// +// System.out.println("Single suffix " + heuristic.getSingleSuffixes().size()); +// System.out.println("diffiren morgh " + heuristic.getWordWithMorphology().size()); +// System.out.println("Ononims " + heuristic.getOnonyms().size()); +// final Map> map = heuristic.getUnkowns(); +// System.out.println("Unknow suffix " + map.size()); +// int cont = 0; +// for (Set st : map.values()) { +// +// if (cont > 50) break; +// if (st.size() < 3) { +// System.out.println(st); +// cont++; +// } +// } +// //final RussianSuffixDecoderEncoder decoderEncoder = new RussianSuffixDecoderEncoder(6); +// final AtomicLong c = new AtomicLong(0L); +// final AtomicLong all = new AtomicLong(0L); +// dictonaryReader.proccess( +// new WordProccessor() { +// public void proccess(WordCard wordCard) throws IOException { +// for (FlexiaModel fm : wordCard.getWordsFroms()) { +// String form = fm.create(wordCard.getBase()); +// if(form.startsWith("прик") && form.endsWith("ья")) System.out.println(form); +// +// +// int startSymbol = form.length() > RussianSuffixDecoderEncoder.suffixLength ? form.length() - RussianSuffixDecoderEncoder.suffixLength : 0; +// String formSuffix = form.substring(startSymbol); +// Long aLong = RussianSuffixDecoderEncoder.encode(formSuffix); +// all.incrementAndGet(); +// if (map.containsKey(aLong)) c.incrementAndGet(); +// } +// } +// } +// ); +// +// +// System.out.println("Ankown words " + all.longValue()); +// System.out.println("Ankown words " + c.longValue()); } } diff --git a/src/main/java/org/apache/lucene/russian/morphology/NewModel.java b/src/main/java/org/apache/lucene/russian/morphology/NewModel.java new file mode 100644 index 0000000..cd8e991 --- /dev/null +++ b/src/main/java/org/apache/lucene/russian/morphology/NewModel.java @@ -0,0 +1,113 @@ +package org.apache.lucene.russian.morphology; + +import org.apache.lucene.russian.morphology.heuristic.SimpleSuffixHeuristic; +import org.apache.lucene.russian.morphology.dictonary.WordProccessor; +import org.apache.lucene.russian.morphology.dictonary.WordCard; +import org.apache.lucene.russian.morphology.dictonary.FlexiaModel; + +import java.util.TreeMap; +import java.util.Set; +import java.util.HashSet; +import java.io.IOException; + + +public class NewModel implements WordProccessor{ + private TreeMap> inversIndex = new TreeMap>(); + + public void proccess(WordCard wordCard) throws IOException { + String normalStringMorph = wordCard.getWordsFroms().get(0).getCode(); + for (FlexiaModel fm : wordCard.getWordsFroms()) { + Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph); + String form = revertWord(fm.create(wordCard.getBase())); + Set suffixHeuristics = inversIndex.get(form); + if(suffixHeuristics == null){ + suffixHeuristics = new HashSet(); + inversIndex.put(form,suffixHeuristics); + } + suffixHeuristics.add(heuristic); + } + } + + + public void printInfo(){ + System.out.println("All ivers words " + inversIndex.size()); + Set prevSet = null; + int count = 0; + for(Set currentSet:inversIndex.values()){ + if(!currentSet.equals(prevSet)){ + prevSet = currentSet; + count++; + } + } + System.out.println("Word with diffirent rules " + count); + } + + private String revertWord(String s){ + String result = ""; + for (int i = 1; i <= s.length(); i++) { + result += s.charAt(s.length() - i); + } + return result; + } + + + private Heuristic createEvristic(String wordBase, String canonicalSuffix, FlexiaModel fm, String normalSuffixForm) { + String form = fm.create(wordBase); + String normalForm = wordBase + canonicalSuffix; + Integer length = getCommonLength(form, normalForm); + Integer actualSuffixLengh = form.length() - length; + String actualNormalSuffix = normalForm.substring(length); + return new Heuristic(actualSuffixLengh, actualNormalSuffix, fm.getCode(), normalSuffixForm); + } + + public static Integer getCommonLength(String s1, String s2) { + Integer length = Math.min(s1.length(), s2.length()); + for (int i = 0; i < length; i++) { + if (s1.charAt(i) != s2.charAt(i)) return i; + } + return length; + } + + + private class Heuristic{ + Integer actualSuffixLengh; + String actualNormalSuffix; + String formMorphInfo; + String normalSuffixForm; + + private Heuristic(Integer actualSuffixLengh, String actualNormalSuffix, String formMorphInfo, String normalSuffixForm) { + this.actualSuffixLengh = actualSuffixLengh; + this.actualNormalSuffix = actualNormalSuffix; + this.formMorphInfo = formMorphInfo; + this.normalSuffixForm = normalSuffixForm; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Heuristic heuristic = (Heuristic) o; + + if (actualNormalSuffix != null ? !actualNormalSuffix.equals(heuristic.actualNormalSuffix) : heuristic.actualNormalSuffix != null) + return false; + if (actualSuffixLengh != null ? !actualSuffixLengh.equals(heuristic.actualSuffixLengh) : heuristic.actualSuffixLengh != null) + return false; + if (formMorphInfo != null ? !formMorphInfo.equals(heuristic.formMorphInfo) : heuristic.formMorphInfo != null) + return false; + if (normalSuffixForm != null ? !normalSuffixForm.equals(heuristic.normalSuffixForm) : heuristic.normalSuffixForm != null) + return false; + + return true; + } + + @Override + public int hashCode() { + int result = actualSuffixLengh != null ? actualSuffixLengh.hashCode() : 0; + result = 31 * result + (actualNormalSuffix != null ? actualNormalSuffix.hashCode() : 0); + result = 31 * result + (formMorphInfo != null ? formMorphInfo.hashCode() : 0); + result = 31 * result + (normalSuffixForm != null ? normalSuffixForm.hashCode() : 0); + return result; + } + } +}