From 58e0d4c49cefa82909947c703859b71f925b34b3 Mon Sep 17 00:00:00 2001 From: "alexander.a.kuznetsov" Date: Sat, 15 Aug 2009 20:10:49 +0000 Subject: [PATCH] working on new model for morphology git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@44 d817d54c-26ab-11de-abc9-2f7d1455ff7a --- .../russian/morphology/HeuristicBuilder.java | 2 +- .../morphology/StatiticsCollector.java | 77 ++++++------------- .../lucene/russian/morphology/Test.java | 15 +++- .../morphology/dictonary/DictonaryReader.java | 3 +- .../morphology/informations/Heuristic.java | 43 +++++++++++ .../{Splitter.java => Morph.java} | 7 +- 6 files changed, 86 insertions(+), 61 deletions(-) create mode 100644 src/main/java/org/apache/lucene/russian/morphology/informations/Heuristic.java rename src/main/java/org/apache/lucene/russian/morphology/informations/{Splitter.java => Morph.java} (88%) diff --git a/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java b/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java index 14452f6..744aa0b 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java +++ b/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java @@ -34,7 +34,7 @@ public class HeuristicBuilder { GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab"); DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form); - StatiticsCollector statiticsCollector = new StatiticsCollector(); + StatiticsCollector statiticsCollector = new StatiticsCollector(grammaInfo); dictonaryReader.proccess(statiticsCollector); statiticsCollector.printInfo(); diff --git a/src/main/java/org/apache/lucene/russian/morphology/StatiticsCollector.java b/src/main/java/org/apache/lucene/russian/morphology/StatiticsCollector.java index 73ec3d3..9d9b199 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/StatiticsCollector.java +++ b/src/main/java/org/apache/lucene/russian/morphology/StatiticsCollector.java @@ -18,17 +18,27 @@ package org.apache.lucene.russian.morphology; import org.apache.lucene.russian.morphology.dictonary.FlexiaModel; +import org.apache.lucene.russian.morphology.dictonary.GrammaReader; import org.apache.lucene.russian.morphology.dictonary.WordCard; import org.apache.lucene.russian.morphology.dictonary.WordProccessor; -import org.apache.lucene.russian.morphology.informations.Splitter; +import org.apache.lucene.russian.morphology.informations.Heuristic; +import org.apache.lucene.russian.morphology.informations.Morph; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.ObjectOutputStream; import java.util.*; public class StatiticsCollector implements WordProccessor { private TreeMap> inversIndex = new TreeMap>(); private Set noramlSuffix = new HashSet(); + private Set> ds = new HashSet>(); + private GrammaReader grammaReader; + + public StatiticsCollector(GrammaReader grammaReader) { + this.grammaReader = grammaReader; + } public void proccess(WordCard wordCard) throws IOException { String normalStringMorph = wordCard.getWordsFroms().get(0).getCode(); @@ -60,6 +70,7 @@ public class StatiticsCollector implements WordProccessor { dist.put(key.length(), 1 + (d == null ? 0 : d)); prevSet = currentSet; count++; + ds.add(currentSet); for (Heuristic h : currentSet) { noramlSuffix.add(h); } @@ -69,14 +80,11 @@ public class StatiticsCollector implements WordProccessor { System.out.println("All ivers words " + inversIndex.size()); System.out.println(dist); System.out.println("Diffirent suffix counts " + noramlSuffix.size()); - - int maxLegth = Integer.MIN_VALUE; - for (Heuristic n : noramlSuffix) { - if (n.actualNormalSuffix.length() > maxLegth) maxLegth = n.actualNormalSuffix.length(); - } + System.out.println("diffirent rule count " + ds.size()); + ObjectOutputStream objectOutputStream = new ObjectOutputStream(new FileOutputStream("suffixes")); ArrayList list = new ArrayList(noramlSuffix); - //new FileWriter() - System.out.println("Max lenght " + maxLegth); + objectOutputStream.writeObject(list); + objectOutputStream.close(); int[][] ints = new int[count][]; count = 0; @@ -89,8 +97,8 @@ public class StatiticsCollector implements WordProccessor { prevSet = currentSet; } } - Splitter splitter = new Splitter(ints); - splitter.writeToFile("sep.txt"); + Morph morph = new Morph(ints); + morph.writeToFile("sep.txt"); } @@ -109,7 +117,11 @@ public class StatiticsCollector implements WordProccessor { Integer length = getCommonLength(form, normalForm); Integer actualSuffixLengh = form.length() - length; String actualNormalSuffix = normalForm.substring(length); - return new Heuristic(actualSuffixLengh, actualNormalSuffix, fm.getCode(), normalSuffixForm); + Integer integer = grammaReader.getGrammInversIndex().get(fm.getCode().substring(0, 2)); + //System.out.println(fm.getCode() + " " + integer); + Integer nf = grammaReader.getGrammInversIndex().get(normalSuffixForm.substring(0, 2)); + //System.out.println(normalSuffixForm + " " + nf); + return new Heuristic((byte) actualSuffixLengh.intValue(), actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue()); } public static Integer getCommonLength(String s1, String s2) { @@ -119,47 +131,4 @@ public class StatiticsCollector implements WordProccessor { } return length; } - - - private class Heuristic { - Integer actualSuffixLengh; - String actualNormalSuffix; - String formMorphInfo; - String normalSuffixForm; - - private Heuristic(Integer actualSuffixLengh, String actualNormalSuffix, String formMorphInfo, String normalSuffixForm) { - this.actualSuffixLengh = actualSuffixLengh; - this.actualNormalSuffix = actualNormalSuffix; - this.formMorphInfo = formMorphInfo; - this.normalSuffixForm = normalSuffixForm; - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - Heuristic heuristic = (Heuristic) o; - - if (actualNormalSuffix != null ? !actualNormalSuffix.equals(heuristic.actualNormalSuffix) : heuristic.actualNormalSuffix != null) - return false; - if (actualSuffixLengh != null ? !actualSuffixLengh.equals(heuristic.actualSuffixLengh) : heuristic.actualSuffixLengh != null) - return false; - if (formMorphInfo != null ? !formMorphInfo.equals(heuristic.formMorphInfo) : heuristic.formMorphInfo != null) - return false; - if (normalSuffixForm != null ? !normalSuffixForm.equals(heuristic.normalSuffixForm) : heuristic.normalSuffixForm != null) - return false; - - return true; - } - - @Override - public int hashCode() { - int result = actualSuffixLengh != null ? actualSuffixLengh.hashCode() : 0; - result = 31 * result + (actualNormalSuffix != null ? actualNormalSuffix.hashCode() : 0); - result = 31 * result + (formMorphInfo != null ? formMorphInfo.hashCode() : 0); - result = 31 * result + (normalSuffixForm != null ? normalSuffixForm.hashCode() : 0); - return result; - } - } } diff --git a/src/main/java/org/apache/lucene/russian/morphology/Test.java b/src/main/java/org/apache/lucene/russian/morphology/Test.java index f6ebb7f..e955099 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/Test.java +++ b/src/main/java/org/apache/lucene/russian/morphology/Test.java @@ -15,7 +15,13 @@ */ package org.apache.lucene.russian.morphology; +import org.apache.lucene.russian.morphology.informations.Heuristic; +import org.apache.lucene.russian.morphology.informations.Morph; + +import java.io.FileInputStream; import java.io.IOException; +import java.io.ObjectInputStream; +import java.util.ArrayList; /** * Created by IntelliJ IDEA. @@ -26,8 +32,13 @@ import java.io.IOException; */ public class Test { - public static void main(String[] args) throws IOException { - //Splitter splitter = new Splitter("sep.txt"); + public static void main(String[] args) throws IOException, ClassNotFoundException { + // + Morph splitter = new Morph("sep.txt"); + ObjectInputStream inputStream = new ObjectInputStream(new FileInputStream("suffixes")); + ArrayList hr = (ArrayList) inputStream.readObject(); + System.gc(); + System.out.println("Ready"); System.in.read(); } } diff --git a/src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java b/src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java index d0e55b6..d0b8350 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java +++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java @@ -120,7 +120,8 @@ public class DictonaryReader { String[] fl = line.split("\\*"); // we inored all forms thats if (fl.length == 3) { - flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), cleanString(fl[2].toLowerCase()))); + System.out.println(line); + // flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), cleanString(fl[2].toLowerCase()))); } if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), "")); } diff --git a/src/main/java/org/apache/lucene/russian/morphology/informations/Heuristic.java b/src/main/java/org/apache/lucene/russian/morphology/informations/Heuristic.java new file mode 100644 index 0000000..a4670ad --- /dev/null +++ b/src/main/java/org/apache/lucene/russian/morphology/informations/Heuristic.java @@ -0,0 +1,43 @@ +package org.apache.lucene.russian.morphology.informations; + +import java.io.Serializable; + + +public class Heuristic implements Serializable { + byte actualSuffixLengh; + String actualNormalSuffix; + short formMorphInfo; + short normalSuffixForm; + + public Heuristic(byte actualSuffixLengh, String actualNormalSuffix, short formMorphInfo, short normalSuffixForm) { + this.actualSuffixLengh = actualSuffixLengh; + this.actualNormalSuffix = actualNormalSuffix; + this.formMorphInfo = formMorphInfo; + this.normalSuffixForm = normalSuffixForm; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Heuristic heuristic = (Heuristic) o; + + if (actualSuffixLengh != heuristic.actualSuffixLengh) return false; + if (formMorphInfo != heuristic.formMorphInfo) return false; + if (normalSuffixForm != heuristic.normalSuffixForm) return false; + if (actualNormalSuffix != null ? !actualNormalSuffix.equals(heuristic.actualNormalSuffix) : heuristic.actualNormalSuffix != null) + return false; + + return true; + } + + @Override + public int hashCode() { + int result = (int) actualSuffixLengh; + result = 31 * result + (actualNormalSuffix != null ? actualNormalSuffix.hashCode() : 0); + result = 31 * result + (int) formMorphInfo; + result = 31 * result + (int) normalSuffixForm; + return result; + } +} diff --git a/src/main/java/org/apache/lucene/russian/morphology/informations/Splitter.java b/src/main/java/org/apache/lucene/russian/morphology/informations/Morph.java similarity index 88% rename from src/main/java/org/apache/lucene/russian/morphology/informations/Splitter.java rename to src/main/java/org/apache/lucene/russian/morphology/informations/Morph.java index 1134ae8..6bf9c7d 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/informations/Splitter.java +++ b/src/main/java/org/apache/lucene/russian/morphology/informations/Morph.java @@ -6,14 +6,15 @@ import java.io.FileWriter; import java.io.IOException; -public class Splitter { +public class Morph { int[][] separators; - public Splitter(String fileName) throws IOException { + + public Morph(String fileName) throws IOException { readFromFile(fileName); } - public Splitter(int[][] separators) { + public Morph(int[][] separators) { this.separators = separators; }