working on new model
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@46 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
parent
d9494de751
commit
c03babf709
@ -17,7 +17,10 @@ package org.apache.lucene.russian.morphology;
|
||||
|
||||
import org.apache.lucene.russian.morphology.informations.Morph;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
@ -31,6 +34,12 @@ public class Test {
|
||||
public static void main(String[] args) throws IOException, ClassNotFoundException {
|
||||
//
|
||||
Morph splitter = new Morph("sep.txt");
|
||||
InputStream stream = Test.class.getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/russian-text.txt");
|
||||
BufferedReader stream1 = new BufferedReader(new InputStreamReader(stream));
|
||||
String s = stream1.readLine().trim().toLowerCase();
|
||||
for (String w : s.split(" ")) {
|
||||
System.out.println(splitter.getMorhInfo(w));
|
||||
}
|
||||
System.gc();
|
||||
System.out.println("Ready");
|
||||
System.in.read();
|
||||
|
@ -24,6 +24,26 @@ public class Heuristic implements Serializable {
|
||||
this.normalFormMorphInfo = normalFormMorphInfo;
|
||||
}
|
||||
|
||||
public String transofrmWord(String w) {
|
||||
return w.substring(0, w.length() - actualSuffixLengh) + actualNormalSuffix;
|
||||
}
|
||||
|
||||
public byte getActualSuffixLengh() {
|
||||
return actualSuffixLengh;
|
||||
}
|
||||
|
||||
public String getActualNormalSuffix() {
|
||||
return actualNormalSuffix;
|
||||
}
|
||||
|
||||
public short getFormMorphInfo() {
|
||||
return formMorphInfo;
|
||||
}
|
||||
|
||||
public short getNormalFormMorphInfo() {
|
||||
return normalFormMorphInfo;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
|
@ -1,9 +1,13 @@
|
||||
package org.apache.lucene.russian.morphology.informations;
|
||||
|
||||
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileReader;
|
||||
import java.io.FileWriter;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class Morph {
|
||||
@ -24,6 +28,50 @@ public class Morph {
|
||||
this.grammaInfo = grammaInfo;
|
||||
}
|
||||
|
||||
public List<String> getMorhInfo(String s) {
|
||||
ArrayList<String> result = new ArrayList<String>();
|
||||
int[] ints = RussianSuffixDecoderEncoder.encodeToArray(revertWord(s));
|
||||
int ruleId = findRuleId(ints);
|
||||
for (Heuristic h : rules[rulesId[ruleId]]) {
|
||||
System.out.println(h);
|
||||
result.add(h.transofrmWord(s));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private int findRuleId(int[] ints) {
|
||||
int low = 0;
|
||||
int high = separators.length - 1;
|
||||
int mid = 0;
|
||||
while (low <= high) {
|
||||
mid = (low + high) >>> 1;
|
||||
int[] midVal = separators[mid];
|
||||
|
||||
int comResult = compareToInts(ints, midVal);
|
||||
if (comResult > 0)
|
||||
low = mid + 1;
|
||||
else if (comResult < 0)
|
||||
high = mid - 1;
|
||||
else
|
||||
break;
|
||||
}
|
||||
if (compareToInts(ints, separators[mid]) >= 0) {
|
||||
return mid;
|
||||
} else {
|
||||
return mid + 1;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private int compareToInts(int[] i1, int[] i2) {
|
||||
int minLength = Math.min(i1.length, i2.length);
|
||||
for (int i = 0; i < minLength; i++) {
|
||||
int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1);
|
||||
if (i3 != 0) return i3;
|
||||
}
|
||||
return i2.length - i1.length;
|
||||
}
|
||||
|
||||
public void writeToFile(String fileName) throws IOException {
|
||||
FileWriter writer = new FileWriter(fileName);
|
||||
writer.write(separators.length + "\n");
|
||||
@ -87,4 +135,12 @@ public class Morph {
|
||||
}
|
||||
bufferedReader.close();
|
||||
}
|
||||
|
||||
private String revertWord(String s) {
|
||||
String result = "";
|
||||
for (int i = 1; i <= s.length(); i++) {
|
||||
result += s.charAt(s.length() - i);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
@ -15,15 +15,31 @@
|
||||
*/
|
||||
package org.apache.lucene.russian.morphology;
|
||||
|
||||
import org.apache.lucene.russian.morphology.informations.Morph;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
|
||||
|
||||
public class SpeedTest {
|
||||
|
||||
@Test
|
||||
public void getTestOfSpeed() throws IOException {
|
||||
Morph splitter = new Morph("sep.txt");
|
||||
InputStream stream = Test.class.getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/russian-text.txt");
|
||||
BufferedReader stream1 = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||
String s = stream1.readLine().trim().toLowerCase();
|
||||
for (String w : s.split(" ")) {
|
||||
try {
|
||||
System.out.println(w);
|
||||
System.out.println(splitter.getMorhInfo(w));
|
||||
} catch (WrongCharaterException e) {
|
||||
|
||||
}
|
||||
}
|
||||
// Long startTime = System.currentTimeMillis();
|
||||
// RussianMorphlogyAnalayzer morphlogyAnalayzer = new RussianMorphlogyAnalayzer();
|
||||
// System.out.println("To build analayzer take " + (System.currentTimeMillis() - startTime) + " ms.");
|
||||
|
Loading…
x
Reference in New Issue
Block a user