working on new model
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@46 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
parent
d9494de751
commit
c03babf709
@ -17,7 +17,10 @@ package org.apache.lucene.russian.morphology;
|
|||||||
|
|
||||||
import org.apache.lucene.russian.morphology.informations.Morph;
|
import org.apache.lucene.russian.morphology.informations.Morph;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Created by IntelliJ IDEA.
|
* Created by IntelliJ IDEA.
|
||||||
@ -31,6 +34,12 @@ public class Test {
|
|||||||
public static void main(String[] args) throws IOException, ClassNotFoundException {
|
public static void main(String[] args) throws IOException, ClassNotFoundException {
|
||||||
//
|
//
|
||||||
Morph splitter = new Morph("sep.txt");
|
Morph splitter = new Morph("sep.txt");
|
||||||
|
InputStream stream = Test.class.getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/russian-text.txt");
|
||||||
|
BufferedReader stream1 = new BufferedReader(new InputStreamReader(stream));
|
||||||
|
String s = stream1.readLine().trim().toLowerCase();
|
||||||
|
for (String w : s.split(" ")) {
|
||||||
|
System.out.println(splitter.getMorhInfo(w));
|
||||||
|
}
|
||||||
System.gc();
|
System.gc();
|
||||||
System.out.println("Ready");
|
System.out.println("Ready");
|
||||||
System.in.read();
|
System.in.read();
|
||||||
|
@ -24,6 +24,26 @@ public class Heuristic implements Serializable {
|
|||||||
this.normalFormMorphInfo = normalFormMorphInfo;
|
this.normalFormMorphInfo = normalFormMorphInfo;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String transofrmWord(String w) {
|
||||||
|
return w.substring(0, w.length() - actualSuffixLengh) + actualNormalSuffix;
|
||||||
|
}
|
||||||
|
|
||||||
|
public byte getActualSuffixLengh() {
|
||||||
|
return actualSuffixLengh;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getActualNormalSuffix() {
|
||||||
|
return actualNormalSuffix;
|
||||||
|
}
|
||||||
|
|
||||||
|
public short getFormMorphInfo() {
|
||||||
|
return formMorphInfo;
|
||||||
|
}
|
||||||
|
|
||||||
|
public short getNormalFormMorphInfo() {
|
||||||
|
return normalFormMorphInfo;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean equals(Object o) {
|
public boolean equals(Object o) {
|
||||||
if (this == o) return true;
|
if (this == o) return true;
|
||||||
|
@ -1,9 +1,13 @@
|
|||||||
package org.apache.lucene.russian.morphology.informations;
|
package org.apache.lucene.russian.morphology.informations;
|
||||||
|
|
||||||
|
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.FileReader;
|
import java.io.FileReader;
|
||||||
import java.io.FileWriter;
|
import java.io.FileWriter;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
|
||||||
public class Morph {
|
public class Morph {
|
||||||
@ -24,6 +28,50 @@ public class Morph {
|
|||||||
this.grammaInfo = grammaInfo;
|
this.grammaInfo = grammaInfo;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public List<String> getMorhInfo(String s) {
|
||||||
|
ArrayList<String> result = new ArrayList<String>();
|
||||||
|
int[] ints = RussianSuffixDecoderEncoder.encodeToArray(revertWord(s));
|
||||||
|
int ruleId = findRuleId(ints);
|
||||||
|
for (Heuristic h : rules[rulesId[ruleId]]) {
|
||||||
|
System.out.println(h);
|
||||||
|
result.add(h.transofrmWord(s));
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int findRuleId(int[] ints) {
|
||||||
|
int low = 0;
|
||||||
|
int high = separators.length - 1;
|
||||||
|
int mid = 0;
|
||||||
|
while (low <= high) {
|
||||||
|
mid = (low + high) >>> 1;
|
||||||
|
int[] midVal = separators[mid];
|
||||||
|
|
||||||
|
int comResult = compareToInts(ints, midVal);
|
||||||
|
if (comResult > 0)
|
||||||
|
low = mid + 1;
|
||||||
|
else if (comResult < 0)
|
||||||
|
high = mid - 1;
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (compareToInts(ints, separators[mid]) >= 0) {
|
||||||
|
return mid;
|
||||||
|
} else {
|
||||||
|
return mid + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private int compareToInts(int[] i1, int[] i2) {
|
||||||
|
int minLength = Math.min(i1.length, i2.length);
|
||||||
|
for (int i = 0; i < minLength; i++) {
|
||||||
|
int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1);
|
||||||
|
if (i3 != 0) return i3;
|
||||||
|
}
|
||||||
|
return i2.length - i1.length;
|
||||||
|
}
|
||||||
|
|
||||||
public void writeToFile(String fileName) throws IOException {
|
public void writeToFile(String fileName) throws IOException {
|
||||||
FileWriter writer = new FileWriter(fileName);
|
FileWriter writer = new FileWriter(fileName);
|
||||||
writer.write(separators.length + "\n");
|
writer.write(separators.length + "\n");
|
||||||
@ -87,4 +135,12 @@ public class Morph {
|
|||||||
}
|
}
|
||||||
bufferedReader.close();
|
bufferedReader.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private String revertWord(String s) {
|
||||||
|
String result = "";
|
||||||
|
for (int i = 1; i <= s.length(); i++) {
|
||||||
|
result += s.charAt(s.length() - i);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -15,15 +15,31 @@
|
|||||||
*/
|
*/
|
||||||
package org.apache.lucene.russian.morphology;
|
package org.apache.lucene.russian.morphology;
|
||||||
|
|
||||||
|
import org.apache.lucene.russian.morphology.informations.Morph;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
|
||||||
|
|
||||||
public class SpeedTest {
|
public class SpeedTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void getTestOfSpeed() throws IOException {
|
public void getTestOfSpeed() throws IOException {
|
||||||
|
Morph splitter = new Morph("sep.txt");
|
||||||
|
InputStream stream = Test.class.getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/russian-text.txt");
|
||||||
|
BufferedReader stream1 = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||||
|
String s = stream1.readLine().trim().toLowerCase();
|
||||||
|
for (String w : s.split(" ")) {
|
||||||
|
try {
|
||||||
|
System.out.println(w);
|
||||||
|
System.out.println(splitter.getMorhInfo(w));
|
||||||
|
} catch (WrongCharaterException e) {
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
// Long startTime = System.currentTimeMillis();
|
// Long startTime = System.currentTimeMillis();
|
||||||
// RussianMorphlogyAnalayzer morphlogyAnalayzer = new RussianMorphlogyAnalayzer();
|
// RussianMorphlogyAnalayzer morphlogyAnalayzer = new RussianMorphlogyAnalayzer();
|
||||||
// System.out.println("To build analayzer take " + (System.currentTimeMillis() - startTime) + " ms.");
|
// System.out.println("To build analayzer take " + (System.currentTimeMillis() - startTime) + " ms.");
|
||||||
|
Loading…
x
Reference in New Issue
Block a user