diff --git a/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java b/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java index 744aa0b..b56ecd8 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java +++ b/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java @@ -36,7 +36,7 @@ public class HeuristicBuilder { StatiticsCollector statiticsCollector = new StatiticsCollector(grammaInfo); dictonaryReader.proccess(statiticsCollector); - statiticsCollector.printInfo(); + statiticsCollector.saveHeuristic(); // StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read()); diff --git a/src/main/java/org/apache/lucene/russian/morphology/StatiticsCollector.java b/src/main/java/org/apache/lucene/russian/morphology/StatiticsCollector.java index 9d9b199..c70a04e 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/StatiticsCollector.java +++ b/src/main/java/org/apache/lucene/russian/morphology/StatiticsCollector.java @@ -23,19 +23,19 @@ import org.apache.lucene.russian.morphology.dictonary.WordCard; import org.apache.lucene.russian.morphology.dictonary.WordProccessor; import org.apache.lucene.russian.morphology.informations.Heuristic; import org.apache.lucene.russian.morphology.informations.Morph; +import org.apache.lucene.russian.morphology.informations.RuleInfo; -import java.io.FileOutputStream; import java.io.IOException; -import java.io.ObjectOutputStream; import java.util.*; public class StatiticsCollector implements WordProccessor { private TreeMap> inversIndex = new TreeMap>(); - private Set noramlSuffix = new HashSet(); - private Set> ds = new HashSet>(); + private Map, Integer> ruleInverIndex = new HashMap, Integer>(); + private List> rules = new ArrayList>(); private GrammaReader grammaReader; + public StatiticsCollector(GrammaReader grammaReader) { this.grammaReader = grammaReader; } @@ -44,7 +44,7 @@ public class StatiticsCollector implements WordProccessor { String normalStringMorph = wordCard.getWordsFroms().get(0).getCode(); String word = wordCard.getBase() + wordCard.getCanonicalSuffix(); if (word.contains("-")) return; - //if(wordCard.getBase()+) + for (FlexiaModel fm : wordCard.getWordsFroms()) { Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph); String form = revertWord(fm.create(wordCard.getBase())); @@ -58,7 +58,7 @@ public class StatiticsCollector implements WordProccessor { } - public void printInfo() throws IOException { + public void saveHeuristic() throws IOException { Map dist = new TreeMap(); Set prevSet = null; @@ -70,36 +70,43 @@ public class StatiticsCollector implements WordProccessor { dist.put(key.length(), 1 + (d == null ? 0 : d)); prevSet = currentSet; count++; - ds.add(currentSet); - for (Heuristic h : currentSet) { - noramlSuffix.add(h); + if (!ruleInverIndex.containsKey(currentSet)) { + ruleInverIndex.put(currentSet, rules.size()); + rules.add(currentSet); } } } System.out.println("Word with diffirent rules " + count); System.out.println("All ivers words " + inversIndex.size()); System.out.println(dist); - System.out.println("Diffirent suffix counts " + noramlSuffix.size()); - System.out.println("diffirent rule count " + ds.size()); - ObjectOutputStream objectOutputStream = new ObjectOutputStream(new FileOutputStream("suffixes")); - ArrayList list = new ArrayList(noramlSuffix); - objectOutputStream.writeObject(list); - objectOutputStream.close(); + System.out.println("diffirent rule count " + ruleInverIndex.size()); + Heuristic[][] heuristics = new Heuristic[ruleInverIndex.size()][]; + int index = 0; + for (Set hs : rules) { + heuristics[index] = new Heuristic[hs.size()]; + int indexj = 0; + for (Heuristic h : hs) { + heuristics[index][indexj] = h; + indexj++; + } + index++; + } int[][] ints = new int[count][]; + short[] rulesId = new short[count]; count = 0; prevSet = null; for (String key : inversIndex.keySet()) { Set currentSet = inversIndex.get(key); if (!currentSet.equals(prevSet)) { ints[count] = RussianSuffixDecoderEncoder.encodeToArray(key); + rulesId[count] = (short) ruleInverIndex.get(currentSet).intValue(); count++; prevSet = currentSet; } } - Morph morph = new Morph(ints); + Morph morph = new Morph(ints, rulesId, heuristics, grammaReader.getGrammaInfoAsArray()); morph.writeToFile("sep.txt"); - } private String revertWord(String s) { @@ -118,9 +125,7 @@ public class StatiticsCollector implements WordProccessor { Integer actualSuffixLengh = form.length() - length; String actualNormalSuffix = normalForm.substring(length); Integer integer = grammaReader.getGrammInversIndex().get(fm.getCode().substring(0, 2)); - //System.out.println(fm.getCode() + " " + integer); Integer nf = grammaReader.getGrammInversIndex().get(normalSuffixForm.substring(0, 2)); - //System.out.println(normalSuffixForm + " " + nf); return new Heuristic((byte) actualSuffixLengh.intValue(), actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue()); } @@ -131,4 +136,8 @@ public class StatiticsCollector implements WordProccessor { } return length; } + + public RuleInfo getRuleInfo() { + return null; + } } diff --git a/src/main/java/org/apache/lucene/russian/morphology/Test.java b/src/main/java/org/apache/lucene/russian/morphology/Test.java index e955099..e6c0f53 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/Test.java +++ b/src/main/java/org/apache/lucene/russian/morphology/Test.java @@ -15,13 +15,9 @@ */ package org.apache.lucene.russian.morphology; -import org.apache.lucene.russian.morphology.informations.Heuristic; import org.apache.lucene.russian.morphology.informations.Morph; -import java.io.FileInputStream; import java.io.IOException; -import java.io.ObjectInputStream; -import java.util.ArrayList; /** * Created by IntelliJ IDEA. @@ -35,8 +31,6 @@ public class Test { public static void main(String[] args) throws IOException, ClassNotFoundException { // Morph splitter = new Morph("sep.txt"); - ObjectInputStream inputStream = new ObjectInputStream(new FileInputStream("suffixes")); - ArrayList hr = (ArrayList) inputStream.readObject(); System.gc(); System.out.println("Ready"); System.in.read(); diff --git a/src/main/java/org/apache/lucene/russian/morphology/dictonary/GrammaReader.java b/src/main/java/org/apache/lucene/russian/morphology/dictonary/GrammaReader.java index afaf366..f249c71 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/GrammaReader.java +++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/GrammaReader.java @@ -62,6 +62,10 @@ public class GrammaReader { return grammaInfo; } + public String[] getGrammaInfoAsArray() { + return grammaInfo.toArray(new String[grammaInfo.size()]); + } + public Map getGrammInversIndex() { return inversIndex; } diff --git a/src/main/java/org/apache/lucene/russian/morphology/informations/Heuristic.java b/src/main/java/org/apache/lucene/russian/morphology/informations/Heuristic.java index a4670ad..8f0c89b 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/informations/Heuristic.java +++ b/src/main/java/org/apache/lucene/russian/morphology/informations/Heuristic.java @@ -7,13 +7,21 @@ public class Heuristic implements Serializable { byte actualSuffixLengh; String actualNormalSuffix; short formMorphInfo; - short normalSuffixForm; + short normalFormMorphInfo; - public Heuristic(byte actualSuffixLengh, String actualNormalSuffix, short formMorphInfo, short normalSuffixForm) { + public Heuristic(String s) { + String[] strings = s.split("\\|"); + actualSuffixLengh = Byte.valueOf(strings[0]); + actualNormalSuffix = strings[1]; + formMorphInfo = Short.valueOf(strings[2]); + normalFormMorphInfo = Short.valueOf(strings[3]); + } + + public Heuristic(byte actualSuffixLengh, String actualNormalSuffix, short formMorphInfo, short normalFormMorphInfo) { this.actualSuffixLengh = actualSuffixLengh; this.actualNormalSuffix = actualNormalSuffix; this.formMorphInfo = formMorphInfo; - this.normalSuffixForm = normalSuffixForm; + this.normalFormMorphInfo = normalFormMorphInfo; } @Override @@ -25,7 +33,7 @@ public class Heuristic implements Serializable { if (actualSuffixLengh != heuristic.actualSuffixLengh) return false; if (formMorphInfo != heuristic.formMorphInfo) return false; - if (normalSuffixForm != heuristic.normalSuffixForm) return false; + if (normalFormMorphInfo != heuristic.normalFormMorphInfo) return false; if (actualNormalSuffix != null ? !actualNormalSuffix.equals(heuristic.actualNormalSuffix) : heuristic.actualNormalSuffix != null) return false; @@ -37,7 +45,12 @@ public class Heuristic implements Serializable { int result = (int) actualSuffixLengh; result = 31 * result + (actualNormalSuffix != null ? actualNormalSuffix.hashCode() : 0); result = 31 * result + (int) formMorphInfo; - result = 31 * result + (int) normalSuffixForm; + result = 31 * result + (int) normalFormMorphInfo; return result; } + + @Override + public String toString() { + return "" + actualSuffixLengh + "|" + actualNormalSuffix + "|" + formMorphInfo + "|" + normalFormMorphInfo; + } } diff --git a/src/main/java/org/apache/lucene/russian/morphology/informations/Morph.java b/src/main/java/org/apache/lucene/russian/morphology/informations/Morph.java index 6bf9c7d..2ac081e 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/informations/Morph.java +++ b/src/main/java/org/apache/lucene/russian/morphology/informations/Morph.java @@ -8,14 +8,20 @@ import java.io.IOException; public class Morph { int[][] separators; + short[] rulesId; + Heuristic[][] rules; + String[] grammaInfo; public Morph(String fileName) throws IOException { readFromFile(fileName); } - public Morph(int[][] separators) { + public Morph(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) { this.separators = separators; + this.rulesId = rulesId; + this.rules = rules; + this.grammaInfo = grammaInfo; } public void writeToFile(String fileName) throws IOException { @@ -27,6 +33,20 @@ public class Morph { writer.write(j + "\n"); } } + for (short i : rulesId) { + writer.write(i + "\n"); + } + writer.write(rules.length + "\n"); + for (Heuristic[] heuristics : rules) { + writer.write(heuristics.length + "\n"); + for (Heuristic heuristic : heuristics) { + writer.write(heuristic.toString() + "\n"); + } + } + writer.write(grammaInfo.length + "\n"); + for (String s : grammaInfo) { + writer.write(s + "\n"); + } writer.close(); } @@ -43,6 +63,28 @@ public class Morph { separators[i][j] = Integer.valueOf(bufferedReader.readLine()); } } + rulesId = new short[amount]; + for (int i = 0; i < amount; i++) { + String s1 = bufferedReader.readLine(); + rulesId[i] = Short.valueOf(s1); + } + s = bufferedReader.readLine(); + amount = Integer.valueOf(s); + rules = new Heuristic[amount][]; + for (int i = 0; i < amount; i++) { + String s1 = bufferedReader.readLine(); + Integer ruleLenght = Integer.valueOf(s1); + rules[i] = new Heuristic[ruleLenght]; + for (int j = 0; j < ruleLenght; j++) { + rules[i][j] = new Heuristic(bufferedReader.readLine()); + } + } + s = bufferedReader.readLine(); + amount = Integer.valueOf(s); + grammaInfo = new String[amount]; + for (int i = 0; i < amount; i++) { + grammaInfo[i] = bufferedReader.readLine(); + } bufferedReader.close(); } } diff --git a/src/main/java/org/apache/lucene/russian/morphology/informations/NormalSuffixCollection.java b/src/main/java/org/apache/lucene/russian/morphology/informations/NormalSuffixCollection.java deleted file mode 100644 index db38b79..0000000 --- a/src/main/java/org/apache/lucene/russian/morphology/informations/NormalSuffixCollection.java +++ /dev/null @@ -1,32 +0,0 @@ -/** - * Copyright 2009 Alexander Kuznetsov - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.russian.morphology.informations; - -import java.io.Serializable; - - -public class NormalSuffixCollection implements Serializable { - private String[] normalSuffixes; - - public NormalSuffixCollection(String[] normalSuffixes) { - this.normalSuffixes = normalSuffixes; - } - - public String getSuffix(Integer index) { - return normalSuffixes[index]; - } -} diff --git a/src/main/java/org/apache/lucene/russian/morphology/informations/RuleInfo.java b/src/main/java/org/apache/lucene/russian/morphology/informations/RuleInfo.java new file mode 100644 index 0000000..94f13ed --- /dev/null +++ b/src/main/java/org/apache/lucene/russian/morphology/informations/RuleInfo.java @@ -0,0 +1,16 @@ +package org.apache.lucene.russian.morphology.informations; + +import java.io.Serializable; + + +public class RuleInfo implements Serializable { + private Heuristic[][] rules; + + public RuleInfo(Heuristic[][] rules) { + this.rules = rules; + } + + public Heuristic[] getRule(short ruleId) { + return rules[ruleId]; + } +}