working on new model
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@47 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
parent
c03babf709
commit
394fb6a621
@ -23,7 +23,6 @@ import org.apache.lucene.russian.morphology.dictonary.WordCard;
|
||||
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
|
||||
import org.apache.lucene.russian.morphology.informations.Heuristic;
|
||||
import org.apache.lucene.russian.morphology.informations.Morph;
|
||||
import org.apache.lucene.russian.morphology.informations.RuleInfo;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
@ -137,7 +136,5 @@ public class StatiticsCollector implements WordProccessor {
|
||||
return length;
|
||||
}
|
||||
|
||||
public RuleInfo getRuleInfo() {
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -15,12 +15,13 @@
|
||||
*/
|
||||
package org.apache.lucene.russian.morphology;
|
||||
|
||||
import org.apache.lucene.russian.morphology.informations.Heuristic;
|
||||
import org.apache.lucene.russian.morphology.informations.Morph;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.Arrays;
|
||||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
@ -34,12 +35,26 @@ public class Test {
|
||||
public static void main(String[] args) throws IOException, ClassNotFoundException {
|
||||
//
|
||||
Morph splitter = new Morph("sep.txt");
|
||||
InputStream stream = Test.class.getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/russian-text.txt");
|
||||
BufferedReader stream1 = new BufferedReader(new InputStreamReader(stream));
|
||||
String s = stream1.readLine().trim().toLowerCase();
|
||||
for (String w : s.split(" ")) {
|
||||
System.out.println(splitter.getMorhInfo(w));
|
||||
TreeSet<Short> shorts = new TreeSet<Short>();
|
||||
int count = 0;
|
||||
TreeMap<Integer, Integer> rulesStat = new TreeMap<Integer, Integer>();
|
||||
for (Heuristic[] heuristics : splitter.getRules()) {
|
||||
Integer d = rulesStat.get(heuristics.length);
|
||||
rulesStat.put(heuristics.length, 1 + (d == null ? 0 : d));
|
||||
boolean flag = true;
|
||||
short actualSuffixLenght = heuristics[0].getActualSuffixLengh();
|
||||
String normalSuffix = heuristics[0].getActualNormalSuffix();
|
||||
for (Heuristic heuristic : heuristics) {
|
||||
flag = flag && (heuristic.getActualSuffixLengh() == actualSuffixLenght)
|
||||
&& normalSuffix.equals(heuristic.getActualNormalSuffix());
|
||||
}
|
||||
if (!flag) {
|
||||
System.out.println(Arrays.asList(heuristics));
|
||||
count++;
|
||||
}
|
||||
}
|
||||
System.out.println(count);
|
||||
System.out.println(rulesStat);
|
||||
System.gc();
|
||||
System.out.println("Ready");
|
||||
System.in.read();
|
||||
|
@ -1,32 +0,0 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.russian.morphology.informations;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
|
||||
public class GrammaInfo implements Serializable {
|
||||
private String[] grammaInfo;
|
||||
|
||||
public GrammaInfo(String[] grammaInfo) {
|
||||
this.grammaInfo = grammaInfo;
|
||||
}
|
||||
|
||||
public String getInfo(Integer index) {
|
||||
return grammaInfo[index];
|
||||
}
|
||||
}
|
@ -1,3 +1,18 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.russian.morphology.informations;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
@ -1,3 +1,18 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.russian.morphology.informations;
|
||||
|
||||
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
||||
@ -28,6 +43,22 @@ public class Morph {
|
||||
this.grammaInfo = grammaInfo;
|
||||
}
|
||||
|
||||
public int[][] getSeparators() {
|
||||
return separators;
|
||||
}
|
||||
|
||||
public short[] getRulesId() {
|
||||
return rulesId;
|
||||
}
|
||||
|
||||
public Heuristic[][] getRules() {
|
||||
return rules;
|
||||
}
|
||||
|
||||
public String[] getGrammaInfo() {
|
||||
return grammaInfo;
|
||||
}
|
||||
|
||||
public List<String> getMorhInfo(String s) {
|
||||
ArrayList<String> result = new ArrayList<String>();
|
||||
int[] ints = RussianSuffixDecoderEncoder.encodeToArray(revertWord(s));
|
||||
@ -58,7 +89,7 @@ public class Morph {
|
||||
if (compareToInts(ints, separators[mid]) >= 0) {
|
||||
return mid;
|
||||
} else {
|
||||
return mid + 1;
|
||||
return mid - 1;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,16 +0,0 @@
|
||||
package org.apache.lucene.russian.morphology.informations;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
|
||||
public class RuleInfo implements Serializable {
|
||||
private Heuristic[][] rules;
|
||||
|
||||
public RuleInfo(Heuristic[][] rules) {
|
||||
this.rules = rules;
|
||||
}
|
||||
|
||||
public Heuristic[] getRule(short ruleId) {
|
||||
return rules[ruleId];
|
||||
}
|
||||
}
|
@ -1,3 +1,8 @@
|
||||
пушке А бутявка волит за напушкой Сяпала Калуша по напушке и увазила бутявку И волит Калушата калушаточки Бутявка Калушата присяпали и бутявку стрямкали И подудонились А Калуша волит Бутявка то некузявая Калушата бутявку вычучили Бутявка вздребезнулась сопритюкнулась и усяпала с напушки
|
||||
А Калуша волит:
|
||||
— Бутявок не трямкают. Бутявки дюбые и зюмо-зюмо некузявые. От бутявок дудонятся.
|
||||
А бутявка волит за напушкой:
|
||||
— Калушата подудонились! Калушата подудонились! Зюмо некузявые! Пуськи бятые!
|
||||
В условиях нарастающей пурги было сделано 4 успешных захода на посадку. "Все нормально, будем рекомендовать систему к внедрению".
|
||||
Рейсы из Кейптауна (ЮАР) на станцию "Новолазаревская" (Антарктида) совершаются
|
||||
примерно один раз в две недели.
|
Loading…
x
Reference in New Issue
Block a user