working on new model

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@46 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
alexander.a.kuznetsov 2009-08-16 22:50:15 +00:00
parent d9494de751
commit c03babf709
4 changed files with 101 additions and 0 deletions

View File

@ -17,7 +17,10 @@ package org.apache.lucene.russian.morphology;
import org.apache.lucene.russian.morphology.informations.Morph;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
/**
* Created by IntelliJ IDEA.
@ -31,6 +34,12 @@ public class Test {
public static void main(String[] args) throws IOException, ClassNotFoundException {
//
Morph splitter = new Morph("sep.txt");
InputStream stream = Test.class.getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/russian-text.txt");
BufferedReader stream1 = new BufferedReader(new InputStreamReader(stream));
String s = stream1.readLine().trim().toLowerCase();
for (String w : s.split(" ")) {
System.out.println(splitter.getMorhInfo(w));
}
System.gc();
System.out.println("Ready");
System.in.read();

View File

@ -24,6 +24,26 @@ public class Heuristic implements Serializable {
this.normalFormMorphInfo = normalFormMorphInfo;
}
/**
 * Rewrites the ending of a word: drops the last {@code actualSuffixLengh}
 * characters of {@code w} and appends {@code actualNormalSuffix} in their place.
 *
 * @param w the word whose ending is replaced
 * @return the leading part of {@code w} followed by {@code actualNormalSuffix}
 * @throws StringIndexOutOfBoundsException if {@code w} is shorter than
 *         {@code actualSuffixLengh}
 */
public String transofrmWord(String w) {
    int stemLength = w.length() - actualSuffixLengh;
    String stem = w.substring(0, stemLength);
    return stem + actualNormalSuffix;
}
/**
 * Returns the {@code actualSuffixLengh} field, i.e. the number of trailing
 * characters that {@code transofrmWord} strips from a word.
 *
 * @return the stored suffix length
 */
public byte getActualSuffixLengh() {
    return this.actualSuffixLengh;
}
/**
 * Returns the {@code actualNormalSuffix} field, i.e. the text that
 * {@code transofrmWord} appends after stripping the old ending.
 *
 * @return the stored replacement suffix
 */
public String getActualNormalSuffix() {
    return this.actualNormalSuffix;
}
/**
 * Returns the {@code formMorphInfo} field of this heuristic.
 *
 * @return the stored {@code formMorphInfo} value
 */
public short getFormMorphInfo() {
    return this.formMorphInfo;
}
/**
 * Returns the {@code normalFormMorphInfo} field of this heuristic.
 *
 * @return the stored {@code normalFormMorphInfo} value
 */
public short getNormalFormMorphInfo() {
    return this.normalFormMorphInfo;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;

View File

@ -1,9 +1,13 @@
package org.apache.lucene.russian.morphology.informations;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class Morph {
@ -24,6 +28,50 @@ public class Morph {
this.grammaInfo = grammaInfo;
}
/**
 * Applies every heuristic registered for the given word and collects the
 * transformed words.
 *
 * The word is reversed with {@code revertWord}, encoded with
 * {@code RussianSuffixDecoderEncoder.encodeToArray}, and the encoded form is
 * looked up with {@code findRuleId}. Each {@code Heuristic} in
 * {@code rules[rulesId[ruleId]]} is then applied to the original word via
 * {@code transofrmWord}.
 *
 * This method produces no console output; callers that want to inspect the
 * result should print the returned list themselves.
 *
 * @param s the word to analyse
 * @return the results of {@code transofrmWord(s)}, one per matching heuristic,
 *         in the order the heuristics are stored
 */
public List<String> getMorhInfo(String s) {
    List<String> result = new ArrayList<String>();
    int[] ints = RussianSuffixDecoderEncoder.encodeToArray(revertWord(s));
    int ruleId = findRuleId(ints);
    for (Heuristic h : rules[rulesId[ruleId]]) {
        result.add(h.transofrmWord(s));
    }
    return result;
}
/**
 * Binary-searches {@code separators} (ordered according to
 * {@code compareToInts}) for the entry that covers {@code ints}.
 *
 * Returns the index of the separator equal to {@code ints} if there is one,
 * otherwise the index of the last separator that compares below
 * {@code ints}. The result therefore depends only on where {@code ints}
 * falls between the separators, not on which element the search happened to
 * probe last.
 *
 * @param ints the encoded key to locate
 * @return the index of the greatest separator that is not above
 *         {@code ints}, or -1 if {@code ints} compares below every
 *         separator (including when {@code separators} is empty)
 */
private int findRuleId(int[] ints) {
    int low = 0;
    int high = separators.length - 1;
    while (low <= high) {
        // Unsigned shift keeps the midpoint correct even if low + high overflows.
        int mid = (low + high) >>> 1;
        int comResult = compareToInts(ints, separators[mid]);
        if (comResult > 0) {
            low = mid + 1;
        } else if (comResult < 0) {
            high = mid - 1;
        } else {
            return mid;
        }
    }
    // Loop invariant: everything at or below 'high' compares below the key and
    // everything at or above 'low' compares above it, so 'high' is the floor.
    return high;
}
/**
 * Compares two int arrays element by element.
 *
 * The first position at which the arrays differ decides the result: -1 if
 * the element of {@code i1} is smaller, 1 if it is larger. If the arrays
 * agree on every position they share, the result is
 * {@code i2.length - i1.length}; note that this makes the shorter array
 * compare as the greater one.
 *
 * @param i1 the left-hand array
 * @param i2 the right-hand array
 * @return a negative value, zero, or a positive value as described above
 */
private int compareToInts(int[] i1, int[] i2) {
    int shared = i1.length < i2.length ? i1.length : i2.length;
    for (int pos = 0; pos < shared; pos++) {
        if (i1[pos] < i2[pos]) {
            return -1;
        }
        if (i1[pos] > i2[pos]) {
            return 1;
        }
    }
    // Common prefix is identical: fall back to the length difference.
    return i2.length - i1.length;
}
public void writeToFile(String fileName) throws IOException {
FileWriter writer = new FileWriter(fileName);
writer.write(separators.length + "\n");
@ -87,4 +135,12 @@ public class Morph {
}
bufferedReader.close();
}
/**
 * Returns the characters of {@code s} in reverse order.
 *
 * The reversal is done one {@code char} at a time, so the output is exactly
 * the input's {@code char} sequence read backwards. A presized
 * {@code StringBuilder} is used so the work is linear in the length of
 * {@code s} rather than re-copying the partial result on every step.
 *
 * @param s the string to reverse
 * @return a new string containing the chars of {@code s} from last to first
 */
private String revertWord(String s) {
    StringBuilder reversed = new StringBuilder(s.length());
    for (int i = s.length() - 1; i >= 0; i--) {
        reversed.append(s.charAt(i));
    }
    return reversed.toString();
}
}

View File

@ -15,15 +15,31 @@
*/
package org.apache.lucene.russian.morphology;
import org.apache.lucene.russian.morphology.informations.Morph;
import org.junit.Test;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
public class SpeedTest {
@Test
public void getTestOfSpeed() throws IOException {
Morph splitter = new Morph("sep.txt");
InputStream stream = Test.class.getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/russian-text.txt");
BufferedReader stream1 = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
String s = stream1.readLine().trim().toLowerCase();
for (String w : s.split(" ")) {
try {
System.out.println(w);
System.out.println(splitter.getMorhInfo(w));
} catch (WrongCharaterException e) {
}
}
// Long startTime = System.currentTimeMillis();
// RussianMorphlogyAnalayzer morphlogyAnalayzer = new RussianMorphlogyAnalayzer();
// System.out.println("To build analayzer take " + (System.currentTimeMillis() - startTime) + " ms.");