working on new model
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@47 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
parent
c03babf709
commit
394fb6a621
@ -23,7 +23,6 @@ import org.apache.lucene.russian.morphology.dictonary.WordCard;
|
|||||||
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
|
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
|
||||||
import org.apache.lucene.russian.morphology.informations.Heuristic;
|
import org.apache.lucene.russian.morphology.informations.Heuristic;
|
||||||
import org.apache.lucene.russian.morphology.informations.Morph;
|
import org.apache.lucene.russian.morphology.informations.Morph;
|
||||||
import org.apache.lucene.russian.morphology.informations.RuleInfo;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
@ -137,7 +136,5 @@ public class StatiticsCollector implements WordProccessor {
|
|||||||
return length;
|
return length;
|
||||||
}
|
}
|
||||||
|
|
||||||
public RuleInfo getRuleInfo() {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -15,12 +15,13 @@
|
|||||||
*/
|
*/
|
||||||
package org.apache.lucene.russian.morphology;
|
package org.apache.lucene.russian.morphology;
|
||||||
|
|
||||||
|
import org.apache.lucene.russian.morphology.informations.Heuristic;
|
||||||
import org.apache.lucene.russian.morphology.informations.Morph;
|
import org.apache.lucene.russian.morphology.informations.Morph;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.util.Arrays;
|
||||||
import java.io.InputStreamReader;
|
import java.util.TreeMap;
|
||||||
|
import java.util.TreeSet;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Created by IntelliJ IDEA.
|
* Created by IntelliJ IDEA.
|
||||||
@ -34,12 +35,26 @@ public class Test {
|
|||||||
public static void main(String[] args) throws IOException, ClassNotFoundException {
|
public static void main(String[] args) throws IOException, ClassNotFoundException {
|
||||||
//
|
//
|
||||||
Morph splitter = new Morph("sep.txt");
|
Morph splitter = new Morph("sep.txt");
|
||||||
InputStream stream = Test.class.getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/russian-text.txt");
|
TreeSet<Short> shorts = new TreeSet<Short>();
|
||||||
BufferedReader stream1 = new BufferedReader(new InputStreamReader(stream));
|
int count = 0;
|
||||||
String s = stream1.readLine().trim().toLowerCase();
|
TreeMap<Integer, Integer> rulesStat = new TreeMap<Integer, Integer>();
|
||||||
for (String w : s.split(" ")) {
|
for (Heuristic[] heuristics : splitter.getRules()) {
|
||||||
System.out.println(splitter.getMorhInfo(w));
|
Integer d = rulesStat.get(heuristics.length);
|
||||||
|
rulesStat.put(heuristics.length, 1 + (d == null ? 0 : d));
|
||||||
|
boolean flag = true;
|
||||||
|
short actualSuffixLenght = heuristics[0].getActualSuffixLengh();
|
||||||
|
String normalSuffix = heuristics[0].getActualNormalSuffix();
|
||||||
|
for (Heuristic heuristic : heuristics) {
|
||||||
|
flag = flag && (heuristic.getActualSuffixLengh() == actualSuffixLenght)
|
||||||
|
&& normalSuffix.equals(heuristic.getActualNormalSuffix());
|
||||||
|
}
|
||||||
|
if (!flag) {
|
||||||
|
System.out.println(Arrays.asList(heuristics));
|
||||||
|
count++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
System.out.println(count);
|
||||||
|
System.out.println(rulesStat);
|
||||||
System.gc();
|
System.gc();
|
||||||
System.out.println("Ready");
|
System.out.println("Ready");
|
||||||
System.in.read();
|
System.in.read();
|
||||||
|
@ -1,32 +0,0 @@
|
|||||||
/**
|
|
||||||
* Copyright 2009 Alexander Kuznetsov
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.apache.lucene.russian.morphology.informations;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
|
|
||||||
|
|
||||||
public class GrammaInfo implements Serializable {
|
|
||||||
private String[] grammaInfo;
|
|
||||||
|
|
||||||
public GrammaInfo(String[] grammaInfo) {
|
|
||||||
this.grammaInfo = grammaInfo;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getInfo(Integer index) {
|
|
||||||
return grammaInfo[index];
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,3 +1,18 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
package org.apache.lucene.russian.morphology.informations;
|
package org.apache.lucene.russian.morphology.informations;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
@ -1,3 +1,18 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
package org.apache.lucene.russian.morphology.informations;
|
package org.apache.lucene.russian.morphology.informations;
|
||||||
|
|
||||||
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
||||||
@ -28,6 +43,22 @@ public class Morph {
|
|||||||
this.grammaInfo = grammaInfo;
|
this.grammaInfo = grammaInfo;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public int[][] getSeparators() {
|
||||||
|
return separators;
|
||||||
|
}
|
||||||
|
|
||||||
|
public short[] getRulesId() {
|
||||||
|
return rulesId;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Heuristic[][] getRules() {
|
||||||
|
return rules;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String[] getGrammaInfo() {
|
||||||
|
return grammaInfo;
|
||||||
|
}
|
||||||
|
|
||||||
public List<String> getMorhInfo(String s) {
|
public List<String> getMorhInfo(String s) {
|
||||||
ArrayList<String> result = new ArrayList<String>();
|
ArrayList<String> result = new ArrayList<String>();
|
||||||
int[] ints = RussianSuffixDecoderEncoder.encodeToArray(revertWord(s));
|
int[] ints = RussianSuffixDecoderEncoder.encodeToArray(revertWord(s));
|
||||||
@ -58,7 +89,7 @@ public class Morph {
|
|||||||
if (compareToInts(ints, separators[mid]) >= 0) {
|
if (compareToInts(ints, separators[mid]) >= 0) {
|
||||||
return mid;
|
return mid;
|
||||||
} else {
|
} else {
|
||||||
return mid + 1;
|
return mid - 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1,16 +0,0 @@
|
|||||||
package org.apache.lucene.russian.morphology.informations;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
|
|
||||||
|
|
||||||
public class RuleInfo implements Serializable {
|
|
||||||
private Heuristic[][] rules;
|
|
||||||
|
|
||||||
public RuleInfo(Heuristic[][] rules) {
|
|
||||||
this.rules = rules;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Heuristic[] getRule(short ruleId) {
|
|
||||||
return rules[ruleId];
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,3 +1,8 @@
|
|||||||
|
пушке А бутявка волит за напушкой Сяпала Калуша по напушке и увазила бутявку И волит Калушата калушаточки Бутявка Калушата присяпали и бутявку стрямкали И подудонились А Калуша волит Бутявка то некузявая Калушата бутявку вычучили Бутявка вздребезнулась сопритюкнулась и усяпала с напушки
|
||||||
|
А Калуша волит:
|
||||||
|
— Бутявок не трямкают. Бутявки дюбые и зюмо-зюмо некузявые. От бутявок дудонятся.
|
||||||
|
А бутявка волит за напушкой:
|
||||||
|
— Калушата подудонились! Калушата подудонились! Зюмо некузявые! Пуськи бятые!
|
||||||
В условиях нарастающей пурги было сделано 4 успешных захода на посадку. "Все нормально, будем рекомендовать систему к внедрению".
|
В условиях нарастающей пурги было сделано 4 успешных захода на посадку. "Все нормально, будем рекомендовать систему к внедрению".
|
||||||
Рейсы из Кейптауна (ЮАР) на станцию "Новолазаревская" (Антарктида) совершаются
|
Рейсы из Кейптауна (ЮАР) на станцию "Новолазаревская" (Антарктида) совершаются
|
||||||
примерно один раз в две недели.
|
примерно один раз в две недели.
|
Loading…
x
Reference in New Issue
Block a user