diff --git a/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java b/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java index 3588c9b..bbe8e32 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java +++ b/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java @@ -18,15 +18,18 @@ package org.apache.lucene.russian.morphology; import org.apache.lucene.russian.morphology.dictonary.DictonaryReader; import org.apache.lucene.russian.morphology.dictonary.FrequentyReader; +import org.apache.lucene.russian.morphology.dictonary.GrammaReader; import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader; -import org.apache.lucene.russian.morphology.heuristic.Heuristic; +import org.apache.lucene.russian.morphology.heuristic.HeuristicBySuffixLegth; import org.apache.lucene.russian.morphology.heuristic.StatiticsCollectors; import org.apache.lucene.russian.morphology.heuristic.SuffixCounter; +import org.apache.lucene.russian.morphology.heuristic.SuffixHeuristic; import java.io.IOException; import java.util.Arrays; import java.util.Collection; import java.util.Set; +import java.util.TreeMap; public class HeuristicBuilder { @@ -35,10 +38,11 @@ public class HeuristicBuilder { Set form = formReader.getIngnoredFroms(); FrequentyReader frequentyReader = new FrequentyReader("data/lemma.num"); - + GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab"); DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form); - StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read()); + + StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read(), grammaInfo); dictonaryReader.proccess(statiticsCollectors); Collection counterCollection = statiticsCollectors.getStatititics().values(); Object[] objects = counterCollection.toArray(); @@ -48,11 +52,52 @@ public class HeuristicBuilder { System.out.println(objects[i]); } - final Heuristic heuristic = new Heuristic(); + final HeuristicBySuffixLegth heuristic = new HeuristicBySuffixLegth(); for (int i = 0; i < objects.length; i++) { heuristic.addHeuristic(((SuffixCounter) objects[i]).getSuffixHeuristic()); } - heuristic.writeToFile("russianSuffixesHeuristic.txt"); + TreeMap map = new TreeMap(); + + int ct = 0; + for (Set s : heuristic.getHeuristics().values()) { + Integer d = map.get(s.size()); + map.put(s.size(), 1 + (d == null ? 0 : d)); + if (s.size() == 1) { + ct++; + continue; + } + SuffixHeuristic heuristic1 = s.iterator().next(); + Integer sufixSize = heuristic1.getActualSuffixLength(); + String normalSuffix = heuristic1.getNormalFromSuffix(); + if (heuristic1.getFormSuffix().length() < 6) { + ct++; + continue; + } + Boolean flag = true; + if (sufixSize > 3) continue; + for (SuffixHeuristic sh : s) { + flag = flag && (sufixSize.equals(sh.getActualSuffixLength())) + && (normalSuffix.equals(sh.getNormalFromSuffix())); + } + if (flag) { + System.out.println(s); + ct++; + } + //HashSet integers = new HashSet(); +// for(SuffixHeuristic sh:s){ +// integers.add(sh.getMorphInfoCode()); +// } +// if(s.size() == integers.size()){ +// ct++; +// }else{ +// if(s.size() == 2) System.out.println(s); +// } + } + System.out.println(objects.length); + System.out.println(heuristic.getHeuristics().size()); + System.out.println(ct); + System.out.println(map); + //heuristic.writeToFile("russianSuffixesHeuristic.txt"); } } diff --git a/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java b/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java index 985ce5e..3802050 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java +++ b/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java @@ -24,7 +24,7 @@ package org.apache.lucene.russian.morphology; */ public class RussianSuffixDecoderEncoder { public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071; - public static final int SUFFIX_LENGTH = 7; + public static final int SUFFIX_LENGTH = 6; public static final int EE_CHAR = 34; public static final int E_CHAR = 6; public static final int DASH_CHAR = 45; diff --git a/src/main/java/org/apache/lucene/russian/morphology/Test.java b/src/main/java/org/apache/lucene/russian/morphology/Test.java new file mode 100644 index 0000000..3badcdd --- /dev/null +++ b/src/main/java/org/apache/lucene/russian/morphology/Test.java @@ -0,0 +1,13 @@ +package org.apache.lucene.russian.morphology; + +import org.apache.lucene.russian.morphology.dictonary.GrammaReader; + +import java.io.IOException; + + +public class Test { + public static void main(String[] args) throws IOException { + GrammaReader grammaReader = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab"); + System.out.println(grammaReader.getInversIndex().size()); + } +} diff --git a/src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java b/src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java index 42a2b11..d0e55b6 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java +++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java @@ -63,17 +63,18 @@ public class DictonaryReader { int count = Integer.valueOf(s); for (int i = 0; i < count; i++) { s = reader.readLine(); - if (i % 10000 == 0) System.out.println("Proccess " + i + " word of " + count); + if (i % 10000 == 0) System.out.println("Proccess " + i + " wordBase of " + count); String[] wd = s.split(" "); - String word = wd[0].toLowerCase(); - if (word.startsWith("-")) continue; - word = "#".equals(word) ? "" : word; + String wordBase = wd[0].toLowerCase(); + if (wordBase.startsWith("-")) continue; + wordBase = "#".equals(wordBase) ? "" : wordBase; List models = wordsFlexias.get(Integer.valueOf(wd[1])); - if (models.size() > 0 && !ingnoredForm.contains(models.get(0).getCode())) { - WordCard card = new WordCard(cleanString(models.get(0).create(word))); + FlexiaModel flexiaModel = models.get(0); + if (models.size() > 0 && !ingnoredForm.contains(flexiaModel.getCode())) { + WordCard card = new WordCard(cleanString(flexiaModel.create(wordBase)), cleanString(wordBase), flexiaModel.getSuffix()); for (FlexiaModel fm : models) { - card.addFrom(cleanString(fm.create(word))); + card.addFlexia(fm); } wordProccessor.proccess(card); } @@ -118,9 +119,10 @@ public class DictonaryReader { private void addFlexia(ArrayList flexiaModelArrayList, String line) { String[] fl = line.split("\\*"); // we inored all forms thats - // if (fl.length == 3) - // flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase())); - if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), "")); + if (fl.length == 3) { + flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), cleanString(fl[2].toLowerCase()))); + } + if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), "")); } } diff --git a/src/main/java/org/apache/lucene/russian/morphology/dictonary/GrammaReader.java b/src/main/java/org/apache/lucene/russian/morphology/dictonary/GrammaReader.java new file mode 100644 index 0000000..038a737 --- /dev/null +++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/GrammaReader.java @@ -0,0 +1,58 @@ +package org.apache.lucene.russian.morphology.dictonary; + +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.HashMap; +import java.util.Map; + +//todo spleet this class on two. +public class GrammaReader { + private String fileName; + private String fileEncoding = "windows-1251"; + private Map grammaInfo = new HashMap(); + private Map inversIndex = new HashMap(); + + public GrammaReader(String fileName) throws IOException { + this.fileName = fileName; + setUp(); + } + + public GrammaReader(String fileName, String fileEncoding) throws IOException { + this.fileName = fileName; + this.fileEncoding = fileEncoding; + setUp(); + } + + private void setUp() throws IOException { + BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), fileEncoding)); + String line = bufferedReader.readLine(); + while (line != null) { + line = line.trim(); + if (!line.startsWith("//") && line.length() > 0) { + String[] strings = line.split(" ", 2); + Integer i = grammaInfo.size(); + inversIndex.put(strings[0], i); + grammaInfo.put(i, strings[1]); + } + line = bufferedReader.readLine(); + } + } + + public Map getGrammaInfo() { + return grammaInfo; + } + + public void setGrammaInfo(Map grammaInfo) { + this.grammaInfo = grammaInfo; + } + + public Map getInversIndex() { + return inversIndex; + } + + public void setInversIndex(Map inversIndex) { + this.inversIndex = inversIndex; + } +} diff --git a/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordCard.java b/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordCard.java index 770bca3..881595a 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordCard.java +++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordCard.java @@ -24,21 +24,33 @@ import java.util.List; */ public class WordCard { private String canonicalFrom; - private List wordsFroms = new ArrayList(); + private String base; + private String canonicalSuffix; + private List wordsFroms = new ArrayList(); - protected WordCard(String canonicalFrom) { + public WordCard(String canonicalFrom, String base, String canonicalSuffix) { this.canonicalFrom = canonicalFrom; + this.canonicalSuffix = canonicalSuffix; + this.base = base; } - protected void addFrom(String word) { - wordsFroms.add(word); + public void addFlexia(FlexiaModel flexiaModel) { + wordsFroms.add(flexiaModel); } public String getCanonicalFrom() { return canonicalFrom; } - public List getWordsFroms() { + public String getCanonicalSuffix() { + return canonicalSuffix; + } + + public String getBase() { + return base; + } + + public List getWordsFroms() { return wordsFroms; } } diff --git a/src/main/java/org/apache/lucene/russian/morphology/heuristic/Heuristic.java b/src/main/java/org/apache/lucene/russian/morphology/heuristic/Heuristic.java index 9718bcc..0463511 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/heuristic/Heuristic.java +++ b/src/main/java/org/apache/lucene/russian/morphology/heuristic/Heuristic.java @@ -29,11 +29,11 @@ public class Heuristic { private TreeMap encodedSuffixesPairs = new TreeMap(); public void addHeuristic(SuffixHeuristic suffixHeuristic) { - Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix()); - Long longs = encodedSuffixesPairs.get(suffix); - if (longs == null) { - encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encode(suffixHeuristic.getNormalSuffix())); - } +// Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix()); +// Long longs = encodedSuffixesPairs.get(suffix); +// if (longs == null) { +// encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encode(suffixHeuristic.getNormalSuffix())); +// } } public String getNormalForm(String form) { @@ -49,6 +49,10 @@ public class Heuristic { return form; } + public Integer getAmount() { + return encodedSuffixesPairs.size(); + } + public void readFromFile(String file) throws IOException { BufferedReader reader = new BufferedReader(new FileReader(file)); String s = reader.readLine(); diff --git a/src/main/java/org/apache/lucene/russian/morphology/heuristic/HeuristicBySuffixLegth.java b/src/main/java/org/apache/lucene/russian/morphology/heuristic/HeuristicBySuffixLegth.java new file mode 100644 index 0000000..d6d736e --- /dev/null +++ b/src/main/java/org/apache/lucene/russian/morphology/heuristic/HeuristicBySuffixLegth.java @@ -0,0 +1,27 @@ +package org.apache.lucene.russian.morphology.heuristic; + +import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + + +public class HeuristicBySuffixLegth { + private Map> heuristics = new HashMap>(); + + public void addHeuristic(SuffixHeuristic suffixHeuristic) { + Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix()); + Set suffixHeuristics = heuristics.get(suffix); + if (suffixHeuristics == null) { + suffixHeuristics = new HashSet(); + heuristics.put(suffix, suffixHeuristics); + } + suffixHeuristics.add(suffixHeuristic); + } + + public Map> getHeuristics() { + return heuristics; + } +} diff --git a/src/main/java/org/apache/lucene/russian/morphology/heuristic/StatiticsCollectors.java b/src/main/java/org/apache/lucene/russian/morphology/heuristic/StatiticsCollectors.java index f3557fb..db9782b 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/heuristic/StatiticsCollectors.java +++ b/src/main/java/org/apache/lucene/russian/morphology/heuristic/StatiticsCollectors.java @@ -17,6 +17,8 @@ package org.apache.lucene.russian.morphology.heuristic; import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder; +import org.apache.lucene.russian.morphology.dictonary.FlexiaModel; +import org.apache.lucene.russian.morphology.dictonary.GrammaReader; import org.apache.lucene.russian.morphology.dictonary.WordCard; import org.apache.lucene.russian.morphology.dictonary.WordProccessor; @@ -27,16 +29,18 @@ import java.util.Map; public class StatiticsCollectors implements WordProccessor { Map statititics = new HashMap(); private Map wordsFreq; + private GrammaReader grammaInfo; - public StatiticsCollectors(Map wordsFreq) { + public StatiticsCollectors(Map wordsFreq, GrammaReader grammaInfo) { this.wordsFreq = wordsFreq; + this.grammaInfo = grammaInfo; } private Integer ignoredCount = 0; public void proccess(WordCard wordCard) { - for (String form : wordCard.getWordsFroms()) { - SuffixHeuristic suffixHeuristic = createEvristic(wordCard.getCanonicalFrom(), form); + for (FlexiaModel fm : wordCard.getWordsFroms()) { + SuffixHeuristic suffixHeuristic = createEvristic(wordCard.getCanonicalFrom(), wordCard.getCanonicalSuffix(), fm); if (suffixHeuristic == null) continue; SuffixCounter suffixCounter = statititics.get(suffixHeuristic); if (suffixCounter == null) { @@ -57,19 +61,23 @@ public class StatiticsCollectors implements WordProccessor { return statititics; } - private SuffixHeuristic createEvristic(String word, String form) { + private SuffixHeuristic createEvristic(String wordBase, String canonicalSuffix, FlexiaModel fm) { + String form = fm.create(wordBase); int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0; String formSuffix = form.substring(startSymbol); - if (word.length() < startSymbol) { - ignoredCount++; - return null; - } - String wordSuffix = word.length() > startSymbol ? word.substring(startSymbol) : ""; - if (wordSuffix.length() > 12) { - System.out.println(word + " " + form); - return null; - } - return new SuffixHeuristic(formSuffix, wordSuffix); + String actualSuffix = fm.getSuffix(); + Integer actualSuffixLengh = actualSuffix.length(); +// if (word.length() < startSymbol) { +// ignoredCount++; +// return null; +// } +// String wordSuffix = word.length() > startSymbol ? word.substring(startSymbol) : ""; +// if (wordSuffix.length() > 12) { +// System.out.println(word + " " + form); +// return null; +// } +// return new SuffixHeuristic(formSuffix, wordSuffix); + return new SuffixHeuristic(formSuffix, actualSuffixLengh, canonicalSuffix, fm.getCode()); } diff --git a/src/main/java/org/apache/lucene/russian/morphology/heuristic/SuffixHeuristic.java b/src/main/java/org/apache/lucene/russian/morphology/heuristic/SuffixHeuristic.java index df84bc0..8dfadc8 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/heuristic/SuffixHeuristic.java +++ b/src/main/java/org/apache/lucene/russian/morphology/heuristic/SuffixHeuristic.java @@ -24,27 +24,31 @@ package org.apache.lucene.russian.morphology.heuristic; */ public class SuffixHeuristic { private String formSuffix; - private String normalSuffix; + private Integer actualSuffixLength; + private String normalFromSuffix; + private String morphInfoCode; - public SuffixHeuristic(String formSuffix, String normalSuffix) { + public SuffixHeuristic(String formSuffix, Integer actualSuffixLength, String normalFromSuffix, String morphInfoCode) { this.formSuffix = formSuffix; - this.normalSuffix = normalSuffix; + this.actualSuffixLength = actualSuffixLength; + this.normalFromSuffix = normalFromSuffix; + this.morphInfoCode = morphInfoCode; } public String getFormSuffix() { return formSuffix; } - public void setFormSuffix(String formSuffix) { - this.formSuffix = formSuffix; + public Integer getActualSuffixLength() { + return actualSuffixLength; } - public String getNormalSuffix() { - return normalSuffix; + public String getNormalFromSuffix() { + return normalFromSuffix; } - public void setNormalSuffix(String normalSuffix) { - this.normalSuffix = normalSuffix; + public String getMorphInfoCode() { + return morphInfoCode; } @Override @@ -54,24 +58,28 @@ public class SuffixHeuristic { SuffixHeuristic that = (SuffixHeuristic) o; - if (!formSuffix.equals(that.formSuffix)) return false; - if (!normalSuffix.equals(that.normalSuffix)) return false; + if (actualSuffixLength != null ? !actualSuffixLength.equals(that.actualSuffixLength) : that.actualSuffixLength != null) + return false; + if (formSuffix != null ? !formSuffix.equals(that.formSuffix) : that.formSuffix != null) return false; + if (morphInfoCode != null ? !morphInfoCode.equals(that.morphInfoCode) : that.morphInfoCode != null) + return false; + if (normalFromSuffix != null ? !normalFromSuffix.equals(that.normalFromSuffix) : that.normalFromSuffix != null) + return false; return true; } @Override public int hashCode() { - int result = formSuffix.hashCode(); - result = 31 * result + normalSuffix.hashCode(); + int result = formSuffix != null ? formSuffix.hashCode() : 0; + result = 31 * result + (actualSuffixLength != null ? actualSuffixLength.hashCode() : 0); + result = 31 * result + (normalFromSuffix != null ? normalFromSuffix.hashCode() : 0); + result = 31 * result + (morphInfoCode != null ? morphInfoCode.hashCode() : 0); return result; } @Override public String toString() { - return "SuffixHeuristic{" + - "formSuffix='" + formSuffix + '\'' + - ", normalSuffix='" + normalSuffix + '\'' + - '}'; + return formSuffix + " " + actualSuffixLength + " " + normalFromSuffix + " " + morphInfoCode; } } diff --git a/src/main/java/org/apache/lucene/russian/morphology/heuristic/SuffixHeuristicMerger.java b/src/main/java/org/apache/lucene/russian/morphology/heuristic/SuffixHeuristicMerger.java new file mode 100644 index 0000000..cbaac4a --- /dev/null +++ b/src/main/java/org/apache/lucene/russian/morphology/heuristic/SuffixHeuristicMerger.java @@ -0,0 +1,13 @@ +package org.apache.lucene.russian.morphology.heuristic; + + +public class SuffixHeuristicMerger { + + public SuffixHeuristic merge(SuffixHeuristic one, SuffixHeuristic two) { + if (!one.getMorphInfoCode().equals(two.getMorphInfoCode())) + return null; + SuffixHeuristic min = one.getActualSuffixLength() > two.getActualSuffixLength() ? two : one; + + return null; + } +} diff --git a/src/test/resources/org/apache/lucene/russian/morphology/analayzer/suffix-heuristic-test-data.txt b/src/test/resources/org/apache/lucene/russian/morphology/analayzer/suffix-heuristic-test-data.txt index b04b419..cd7f12c 100644 --- a/src/test/resources/org/apache/lucene/russian/morphology/analayzer/suffix-heuristic-test-data.txt +++ b/src/test/resources/org/apache/lucene/russian/morphology/analayzer/suffix-heuristic-test-data.txt @@ -5,4 +5,5 @@ поэтическая поэтический произошло произойти test test -ананасов ананас \ No newline at end of file +ананасов ананас +встовашего встовать \ No newline at end of file