working on new model

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@47 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
alexander.a.kuznetsov 2009-08-18 07:55:26 +00:00
parent c03babf709
commit 394fb6a621
7 changed files with 76 additions and 61 deletions

View File

@ -23,7 +23,6 @@ import org.apache.lucene.russian.morphology.dictonary.WordCard;
import org.apache.lucene.russian.morphology.dictonary.WordProccessor; import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
import org.apache.lucene.russian.morphology.informations.Heuristic; import org.apache.lucene.russian.morphology.informations.Heuristic;
import org.apache.lucene.russian.morphology.informations.Morph; import org.apache.lucene.russian.morphology.informations.Morph;
import org.apache.lucene.russian.morphology.informations.RuleInfo;
import java.io.IOException; import java.io.IOException;
import java.util.*; import java.util.*;
@ -137,7 +136,5 @@ public class StatiticsCollector implements WordProccessor {
return length; return length;
} }
/**
 * Stub accessor: no rule information is produced here.
 *
 * @return always {@code null}
 */
public RuleInfo getRuleInfo() {
return null;
}
} }

View File

@ -15,12 +15,13 @@
*/ */
package org.apache.lucene.russian.morphology; package org.apache.lucene.russian.morphology;
import org.apache.lucene.russian.morphology.informations.Heuristic;
import org.apache.lucene.russian.morphology.informations.Morph; import org.apache.lucene.russian.morphology.informations.Morph;
import java.io.BufferedReader;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.util.Arrays;
import java.io.InputStreamReader; import java.util.TreeMap;
import java.util.TreeSet;
/** /**
* Created by IntelliJ IDEA. * Created by IntelliJ IDEA.
@ -34,12 +35,26 @@ public class Test {
public static void main(String[] args) throws IOException, ClassNotFoundException { public static void main(String[] args) throws IOException, ClassNotFoundException {
// //
Morph splitter = new Morph("sep.txt"); Morph splitter = new Morph("sep.txt");
InputStream stream = Test.class.getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/russian-text.txt"); TreeSet<Short> shorts = new TreeSet<Short>();
BufferedReader stream1 = new BufferedReader(new InputStreamReader(stream)); int count = 0;
String s = stream1.readLine().trim().toLowerCase(); TreeMap<Integer, Integer> rulesStat = new TreeMap<Integer, Integer>();
for (String w : s.split(" ")) { for (Heuristic[] heuristics : splitter.getRules()) {
System.out.println(splitter.getMorhInfo(w)); Integer d = rulesStat.get(heuristics.length);
rulesStat.put(heuristics.length, 1 + (d == null ? 0 : d));
boolean flag = true;
short actualSuffixLenght = heuristics[0].getActualSuffixLengh();
String normalSuffix = heuristics[0].getActualNormalSuffix();
for (Heuristic heuristic : heuristics) {
flag = flag && (heuristic.getActualSuffixLengh() == actualSuffixLenght)
&& normalSuffix.equals(heuristic.getActualNormalSuffix());
}
if (!flag) {
System.out.println(Arrays.asList(heuristics));
count++;
}
} }
System.out.println(count);
System.out.println(rulesStat);
System.gc(); System.gc();
System.out.println("Ready"); System.out.println("Ready");
System.in.read(); System.in.read();

View File

@ -1,32 +0,0 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.informations;
import java.io.Serializable;
/**
 * Serializable holder for an array of strings that are looked up by position.
 */
public class GrammaInfo implements Serializable {
    private String[] grammaInfo;

    /**
     * Wraps the given array. The reference is stored as-is; no copy is made,
     * so later changes to the caller's array are visible through this object.
     *
     * @param grammaInfo the strings to expose through {@link #getInfo(Integer)}
     */
    public GrammaInfo(String[] grammaInfo) {
        this.grammaInfo = grammaInfo;
    }

    /**
     * Looks up the string stored at the given position.
     *
     * @param index position in the wrapped array; it is unboxed before use
     * @return the element found at {@code index}
     */
    public String getInfo(Integer index) {
        final int position = index;
        final String info = this.grammaInfo[position];
        return info;
    }
}

View File

@ -1,3 +1,18 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.informations; package org.apache.lucene.russian.morphology.informations;
import java.io.Serializable; import java.io.Serializable;

View File

@ -1,3 +1,18 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.informations; package org.apache.lucene.russian.morphology.informations;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder; import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
@ -28,6 +43,22 @@ public class Morph {
this.grammaInfo = grammaInfo; this.grammaInfo = grammaInfo;
} }
/**
 * Exposes the {@code separators} array.
 *
 * @return the same array reference that is held internally (not a copy)
 */
public int[][] getSeparators() {
    return this.separators;
}
/**
 * Exposes the {@code rulesId} array.
 *
 * @return the same array reference that is held internally (not a copy)
 */
public short[] getRulesId() {
    return this.rulesId;
}
/**
 * Exposes the {@code rules} array.
 *
 * @return the same array reference that is held internally (not a copy)
 */
public Heuristic[][] getRules() {
    return this.rules;
}
/**
 * Exposes the {@code grammaInfo} array.
 *
 * @return the same array reference that is held internally (not a copy)
 */
public String[] getGrammaInfo() {
    return this.grammaInfo;
}
public List<String> getMorhInfo(String s) { public List<String> getMorhInfo(String s) {
ArrayList<String> result = new ArrayList<String>(); ArrayList<String> result = new ArrayList<String>();
int[] ints = RussianSuffixDecoderEncoder.encodeToArray(revertWord(s)); int[] ints = RussianSuffixDecoderEncoder.encodeToArray(revertWord(s));
@ -58,7 +89,7 @@ public class Morph {
if (compareToInts(ints, separators[mid]) >= 0) { if (compareToInts(ints, separators[mid]) >= 0) {
return mid; return mid;
} else { } else {
return mid + 1; return mid - 1;
} }
} }

View File

@ -1,16 +0,0 @@
package org.apache.lucene.russian.morphology.informations;
import java.io.Serializable;
/**
 * Serializable holder for a two-dimensional {@code Heuristic} array whose
 * first dimension is addressed by rule id.
 */
public class RuleInfo implements Serializable {
    private Heuristic[][] rules;

    /**
     * Wraps the given array. The reference is stored as-is; no copy is made.
     *
     * @param rules the heuristics, one row per rule id
     */
    public RuleInfo(Heuristic[][] rules) {
        this.rules = rules;
    }

    /**
     * Selects the row of heuristics stored under the given rule id.
     *
     * @param ruleId index into the first dimension of the wrapped array
     * @return the row found at {@code ruleId}
     */
    public Heuristic[] getRule(short ruleId) {
        final Heuristic[] selected = this.rules[ruleId];
        return selected;
    }
}

View File

@ -1,3 +1,8 @@
пушке А бутявка волит за напушкой Сяпала Калуша по напушке и увазила бутявку И волит Калушата калушаточки Бутявка Калушата присяпали и бутявку стрямкали И подудонились А Калуша волит Бутявка то некузявая Калушата бутявку вычучили Бутявка вздребезнулась сопритюкнулась и усяпала с напушки
А Калуша волит:
— Бутявок не трямкают. Бутявки дюбые и зюмо-зюмо некузявые. От бутявок дудонятся.
А бутявка волит за напушкой:
— Калушата подудонились! Калушата подудонились! Зюмо некузявые! Пуськи бятые!
В условиях нарастающей пурги было сделано 4 успешных захода на посадку. "Все нормально, будем рекомендовать систему к внедрению". В условиях нарастающей пурги было сделано 4 успешных захода на посадку. "Все нормально, будем рекомендовать систему к внедрению".
Рейсы из Кейптауна (ЮАР) на станцию "Новолазаревская" (Антарктида) совершаются Рейсы из Кейптауна (ЮАР) на станцию "Новолазаревская" (Антарктида) совершаются
примерно один раз в две недели. примерно один раз в две недели.