Adding name and middle name to Russian morphology
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@53 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
@ -16,20 +16,21 @@
|
||||
|
||||
package org.apache.lucene.morphology.russian;
|
||||
|
||||
import org.apache.lucene.morpholgy.dictionary.*;
|
||||
import org.apache.lucene.morpholgy.dictionary.DictonaryReader;
|
||||
import org.apache.lucene.morpholgy.dictionary.GrammaReader;
|
||||
import org.apache.lucene.morpholgy.dictionary.StatiticsCollector;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Set;
|
||||
import java.util.HashSet;
|
||||
|
||||
|
||||
public class HeuristicBuilder {
|
||||
public static void main(String[] args) throws IOException {
|
||||
IgnoredFormReader formReader = new IgnoredFormReader("data/igoredFrom.txt");
|
||||
Set<String> form = formReader.getIngnoredFroms();
|
||||
//IgnoredFormReader formReader = new IgnoredFormReader("data/igoredFrom.txt");
|
||||
//Set<String> form = formReader.getIngnoredFroms();
|
||||
|
||||
FrequentyReader frequentyReader = new FrequentyReader("data/lemma.num");
|
||||
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
|
||||
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form);
|
||||
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());
|
||||
|
||||
RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
|
||||
StatiticsCollector statiticsCollector = new StatiticsCollector(grammaInfo, decoderEncoder);
|
||||
|
@ -16,13 +16,11 @@
|
||||
package org.apache.lucene.morphology.russian;
|
||||
|
||||
|
||||
import org.apache.lucene.morphology.Heuristic;
|
||||
import org.apache.lucene.morphology.Morph;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
@ -35,27 +33,16 @@ public class Test {
|
||||
|
||||
public static void main(String[] args) throws IOException, ClassNotFoundException {
|
||||
//
|
||||
Morph splitter = new Morph("sep.txt",new RussianLetterDecoderEncoder());
|
||||
TreeSet<Short> shorts = new TreeSet<Short>();
|
||||
int count = 0;
|
||||
TreeMap<Integer, Integer> rulesStat = new TreeMap<Integer, Integer>();
|
||||
for (Heuristic[] heuristics : splitter.getRules()) {
|
||||
Integer d = rulesStat.get(heuristics.length);
|
||||
rulesStat.put(heuristics.length, 1 + (d == null ? 0 : d));
|
||||
boolean flag = true;
|
||||
short actualSuffixLenght = heuristics[0].getActualSuffixLengh();
|
||||
String normalSuffix = heuristics[0].getActualNormalSuffix();
|
||||
for (Heuristic heuristic : heuristics) {
|
||||
flag = flag && (heuristic.getActualSuffixLengh() == actualSuffixLenght)
|
||||
&& normalSuffix.equals(heuristic.getActualNormalSuffix());
|
||||
}
|
||||
if (!flag) {
|
||||
System.out.println(Arrays.asList(heuristics));
|
||||
count++;
|
||||
}
|
||||
Morph splitter = new Morph("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info", new RussianLetterDecoderEncoder());
|
||||
FileReader fileReader = new FileReader("russian/src/main/resources/for.test.txt");
|
||||
BufferedReader bufferedReader = new BufferedReader(fileReader);
|
||||
String s = bufferedReader.readLine();
|
||||
while (s != null) {
|
||||
System.out.println(splitter.getMorhInfo(s));
|
||||
s = bufferedReader.readLine();
|
||||
}
|
||||
System.out.println(count);
|
||||
System.out.println(rulesStat);
|
||||
|
||||
fileReader.close();
|
||||
System.gc();
|
||||
System.in.read();
|
||||
}
|
||||
|
22
russian/src/main/resources/for.test.txt
Normal file
22
russian/src/main/resources/for.test.txt
Normal file
@ -0,0 +1,22 @@
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD>
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,13 @@
|
||||
тест тест
|
||||
ёж еж
|
||||
естера естера
|
||||
что-то что-то
|
||||
а а
|
||||
яяяяяя яяяяяя
|
||||
яяяя яяяя
|
||||
аа аа
|
||||
аааааа аааааа
|
||||
аааааааааааа аааааааааааа
|
||||
аааааааааааааааааа аааааааааааааааааа
|
||||
ааааааааааааааааа ааааааааааааааааа
|
||||
йфячыцувс йфячыцувс
|
@ -0,0 +1,8 @@
|
||||
тест тест
|
||||
ёж еж
|
||||
естера естера
|
||||
что-то что-то
|
||||
а а
|
||||
яяяяяя яяяяяя
|
||||
яяяя яяяя
|
||||
аа аа
|
@ -0,0 +1,9 @@
|
||||
<EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
test test
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
Reference in New Issue
Block a user