diff --git a/src/main/java/org/apache/lucene/russian/morphology/StatiticsCollector.java b/src/main/java/org/apache/lucene/russian/morphology/StatiticsCollector.java index c70a04e..9f533e0 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/StatiticsCollector.java +++ b/src/main/java/org/apache/lucene/russian/morphology/StatiticsCollector.java @@ -23,7 +23,6 @@ import org.apache.lucene.russian.morphology.dictonary.WordCard; import org.apache.lucene.russian.morphology.dictonary.WordProccessor; import org.apache.lucene.russian.morphology.informations.Heuristic; import org.apache.lucene.russian.morphology.informations.Morph; -import org.apache.lucene.russian.morphology.informations.RuleInfo; import java.io.IOException; import java.util.*; @@ -137,7 +136,5 @@ public class StatiticsCollector implements WordProccessor { return length; } - public RuleInfo getRuleInfo() { - return null; - } + } diff --git a/src/main/java/org/apache/lucene/russian/morphology/Test.java b/src/main/java/org/apache/lucene/russian/morphology/Test.java index 1406f69..ff63cb4 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/Test.java +++ b/src/main/java/org/apache/lucene/russian/morphology/Test.java @@ -15,12 +15,13 @@ */ package org.apache.lucene.russian.morphology; +import org.apache.lucene.russian.morphology.informations.Heuristic; import org.apache.lucene.russian.morphology.informations.Morph; -import java.io.BufferedReader; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; +import java.util.Arrays; +import java.util.TreeMap; +import java.util.TreeSet; /** * Created by IntelliJ IDEA. @@ -34,12 +35,26 @@ public class Test { public static void main(String[] args) throws IOException, ClassNotFoundException { // Morph splitter = new Morph("sep.txt"); - InputStream stream = Test.class.getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/russian-text.txt"); - BufferedReader stream1 = new BufferedReader(new InputStreamReader(stream)); - String s = stream1.readLine().trim().toLowerCase(); - for (String w : s.split(" ")) { - System.out.println(splitter.getMorhInfo(w)); + TreeSet shorts = new TreeSet(); + int count = 0; + TreeMap rulesStat = new TreeMap(); + for (Heuristic[] heuristics : splitter.getRules()) { + Integer d = rulesStat.get(heuristics.length); + rulesStat.put(heuristics.length, 1 + (d == null ? 0 : d)); + boolean flag = true; + short actualSuffixLenght = heuristics[0].getActualSuffixLengh(); + String normalSuffix = heuristics[0].getActualNormalSuffix(); + for (Heuristic heuristic : heuristics) { + flag = flag && (heuristic.getActualSuffixLengh() == actualSuffixLenght) + && normalSuffix.equals(heuristic.getActualNormalSuffix()); + } + if (!flag) { + System.out.println(Arrays.asList(heuristics)); + count++; + } } + System.out.println(count); + System.out.println(rulesStat); System.gc(); System.out.println("Ready"); System.in.read(); diff --git a/src/main/java/org/apache/lucene/russian/morphology/informations/GrammaInfo.java b/src/main/java/org/apache/lucene/russian/morphology/informations/GrammaInfo.java deleted file mode 100644 index 911767c..0000000 --- a/src/main/java/org/apache/lucene/russian/morphology/informations/GrammaInfo.java +++ /dev/null @@ -1,32 +0,0 @@ -/** - * Copyright 2009 Alexander Kuznetsov - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.russian.morphology.informations; - -import java.io.Serializable; - - -public class GrammaInfo implements Serializable { - private String[] grammaInfo; - - public GrammaInfo(String[] grammaInfo) { - this.grammaInfo = grammaInfo; - } - - public String getInfo(Integer index) { - return grammaInfo[index]; - } -} diff --git a/src/main/java/org/apache/lucene/russian/morphology/informations/Heuristic.java b/src/main/java/org/apache/lucene/russian/morphology/informations/Heuristic.java index 570b308..10a9e80 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/informations/Heuristic.java +++ b/src/main/java/org/apache/lucene/russian/morphology/informations/Heuristic.java @@ -1,3 +1,18 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.lucene.russian.morphology.informations; import java.io.Serializable; diff --git a/src/main/java/org/apache/lucene/russian/morphology/informations/Morph.java b/src/main/java/org/apache/lucene/russian/morphology/informations/Morph.java index b6dd5f9..f4b7376 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/informations/Morph.java +++ b/src/main/java/org/apache/lucene/russian/morphology/informations/Morph.java @@ -1,3 +1,18 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.lucene.russian.morphology.informations; import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder; @@ -28,6 +43,22 @@ public class Morph { this.grammaInfo = grammaInfo; } + public int[][] getSeparators() { + return separators; + } + + public short[] getRulesId() { + return rulesId; + } + + public Heuristic[][] getRules() { + return rules; + } + + public String[] getGrammaInfo() { + return grammaInfo; + } + public List getMorhInfo(String s) { ArrayList result = new ArrayList(); int[] ints = RussianSuffixDecoderEncoder.encodeToArray(revertWord(s)); @@ -58,7 +89,7 @@ public class Morph { if (compareToInts(ints, separators[mid]) >= 0) { return mid; } else { - return mid + 1; + return mid - 1; } } diff --git a/src/main/java/org/apache/lucene/russian/morphology/informations/RuleInfo.java b/src/main/java/org/apache/lucene/russian/morphology/informations/RuleInfo.java deleted file mode 100644 index 94f13ed..0000000 --- a/src/main/java/org/apache/lucene/russian/morphology/informations/RuleInfo.java +++ /dev/null @@ -1,16 +0,0 @@ -package org.apache.lucene.russian.morphology.informations; - -import java.io.Serializable; - - -public class RuleInfo implements Serializable { - private Heuristic[][] rules; - - public RuleInfo(Heuristic[][] rules) { - this.rules = rules; - } - - public Heuristic[] getRule(short ruleId) { - return rules[ruleId]; - } -} diff --git a/src/test/resources/org/apache/lucene/russian/morphology/analayzer/russian-text.txt b/src/test/resources/org/apache/lucene/russian/morphology/analayzer/russian-text.txt index 88e3e54..128d153 100644 --- a/src/test/resources/org/apache/lucene/russian/morphology/analayzer/russian-text.txt +++ b/src/test/resources/org/apache/lucene/russian/morphology/analayzer/russian-text.txt @@ -1,3 +1,8 @@ +пушке А бутявка волит за напушкой Сяпала Калуша по напушке и увазила бутявку И волит Калушата калушаточки Бутявка Калушата присяпали и бутявку стрямкали И подудонились А Калуша волит Бутявка то некузявая Калушата бутявку вычучили Бутявка вздребезнулась сопритюкнулась и усяпала с напушки +А Калуша волит: +— Бутявок не трямкают. Бутявки дюбые и зюмо-зюмо некузявые. От бутявок дудонятся. +А бутявка волит за напушкой: +— Калушата подудонились! Калушата подудонились! Зюмо некузявые! Пуськи бятые! В условиях нарастающей пурги было сделано 4 успешных захода на посадку. "Все нормально, будем рекомендовать систему к внедрению". Рейсы из Кейптауна (ЮАР) на станцию "Новолазаревская" (Антарктида) совершаются примерно один раз в две недели. \ No newline at end of file