diff --git a/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java b/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java index ab7c32a..94e4c2b 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java +++ b/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java @@ -63,7 +63,7 @@ public class HeuristicBuilder { int cont = 0; for (Set st : map.values()) { - if (cont > 20) break; + if (cont > 50) break; if (st.size() < 3) { System.out.println(st); cont++; diff --git a/src/main/java/org/apache/lucene/russian/morphology/heuristic/HeuristicBySuffixLegth.java b/src/main/java/org/apache/lucene/russian/morphology/heuristic/HeuristicBySuffixLegth.java index 25caedb..b2175f8 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/heuristic/HeuristicBySuffixLegth.java +++ b/src/main/java/org/apache/lucene/russian/morphology/heuristic/HeuristicBySuffixLegth.java @@ -75,9 +75,13 @@ public class HeuristicBySuffixLegth { SimpleSuffixHeuristic heuristic = sshs.iterator().next(); String normalSuffix = heuristic.getNormalSuffix(); Integer suffixLenght = heuristic.getActualSuffixLength(); + String normalFormMorphInfo = heuristic.getNormalFormMorphInfo(); Boolean result = true; for (SimpleSuffixHeuristic ssh : sshs) { - result = result && ssh.getActualSuffixLength().equals(suffixLenght) && ssh.getNormalSuffix().endsWith(normalSuffix); + result = result && + ssh.getActualSuffixLength().equals(suffixLenght) && + ssh.getNormalSuffix().equals(normalSuffix) && + ssh.getNormalFormMorphInfo().equals(normalFormMorphInfo); } return result; } diff --git a/src/main/java/org/apache/lucene/russian/morphology/heuristic/SimpleSuffixHeuristic.java b/src/main/java/org/apache/lucene/russian/morphology/heuristic/SimpleSuffixHeuristic.java index 95c6b70..815299a 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/heuristic/SimpleSuffixHeuristic.java +++ b/src/main/java/org/apache/lucene/russian/morphology/heuristic/SimpleSuffixHeuristic.java @@ -27,12 +27,14 @@ public class SimpleSuffixHeuristic { private Integer actualSuffixLength; private String normalSuffix; private String morphInfoCode; + private String normalFormMorphInfo; - public SimpleSuffixHeuristic(String formSuffix, Integer actualSuffixLength, String normalSuffix, String morphInfoCode) { + public SimpleSuffixHeuristic(String formSuffix, Integer actualSuffixLength, String normalSuffix, String morphInfoCode, String normalFormMorphInfo) { this.formSuffix = formSuffix; this.actualSuffixLength = actualSuffixLength; this.normalSuffix = normalSuffix; this.morphInfoCode = morphInfoCode; + this.normalFormMorphInfo = normalFormMorphInfo; } public String getFormSuffix() { @@ -51,6 +53,14 @@ public class SimpleSuffixHeuristic { return morphInfoCode; } + public String getNormalFormMorphInfo() { + return normalFormMorphInfo; + } + + public void setNormalFormMorphInfo(String normalFormMorphInfo) { + this.normalFormMorphInfo = normalFormMorphInfo; + } + @Override public boolean equals(Object o) { if (this == o) return true; @@ -63,7 +73,8 @@ public class SimpleSuffixHeuristic { if (formSuffix != null ? !formSuffix.equals(that.formSuffix) : that.formSuffix != null) return false; if (morphInfoCode != null ? !morphInfoCode.equals(that.morphInfoCode) : that.morphInfoCode != null) return false; - if (normalSuffix != null ? !normalSuffix.equals(that.normalSuffix) : that.normalSuffix != null) + if (normalSuffix != null ? !normalSuffix.equals(that.normalSuffix) : that.normalSuffix != null) return false; + if (normalFormMorphInfo != null ? !normalFormMorphInfo.equals(that.normalFormMorphInfo) : that.normalFormMorphInfo != null) return false; return true; @@ -75,11 +86,12 @@ public class SimpleSuffixHeuristic { result = 31 * result + (actualSuffixLength != null ? actualSuffixLength.hashCode() : 0); result = 31 * result + (normalSuffix != null ? normalSuffix.hashCode() : 0); result = 31 * result + (morphInfoCode != null ? morphInfoCode.hashCode() : 0); + result = 31 * result + (normalFormMorphInfo != null ? normalFormMorphInfo.hashCode() : 0); return result; } @Override public String toString() { - return formSuffix + " " + actualSuffixLength + " " + normalSuffix + " " + morphInfoCode; + return formSuffix + " " + actualSuffixLength + " " + normalSuffix + " " + morphInfoCode + " nf " + normalFormMorphInfo; } } diff --git a/src/main/java/org/apache/lucene/russian/morphology/heuristic/StatiticsCollectors.java b/src/main/java/org/apache/lucene/russian/morphology/heuristic/StatiticsCollectors.java index b9dc025..3e66ba2 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/heuristic/StatiticsCollectors.java +++ b/src/main/java/org/apache/lucene/russian/morphology/heuristic/StatiticsCollectors.java @@ -39,8 +39,9 @@ public class StatiticsCollectors implements WordProccessor { private Integer ignoredCount = 0; public void proccess(WordCard wordCard) { + String normalStringMorph = wordCard.getWordsFroms().get(0).getCode(); for (FlexiaModel fm : wordCard.getWordsFroms()) { - SimpleSuffixHeuristic simpleSuffixHeuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm); + SimpleSuffixHeuristic simpleSuffixHeuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph); if (simpleSuffixHeuristic == null) continue; SuffixCounter suffixCounter = statititics.get(simpleSuffixHeuristic); if (suffixCounter == null) { @@ -61,13 +62,13 @@ public class StatiticsCollectors implements WordProccessor { return statititics; } - private SimpleSuffixHeuristic createEvristic(String wordBase, String canonicalSuffix, FlexiaModel fm) { + private SimpleSuffixHeuristic createEvristic(String wordBase, String canonicalSuffix, FlexiaModel fm, String normalSuffixForm) { String form = fm.create(wordBase); int startSymbol = form.length() > RussianSuffixDecoderEncoder.suffixLength ? form.length() - RussianSuffixDecoderEncoder.suffixLength : 0; String formSuffix = form.substring(startSymbol); String actualSuffix = fm.getSuffix(); Integer actualSuffixLengh = actualSuffix.length(); - return new SimpleSuffixHeuristic(formSuffix, actualSuffixLengh, canonicalSuffix, fm.getCode()); + return new SimpleSuffixHeuristic(formSuffix, actualSuffixLengh, canonicalSuffix, fm.getCode(), normalSuffixForm); }