diff --git a/src/main/java/org/apache/lucene/russian/morphology/Test.java b/src/main/java/org/apache/lucene/russian/morphology/Test.java
index e6c0f53..1406f69 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/Test.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/Test.java
@@ -17,7 +17,10 @@ package org.apache.lucene.russian.morphology;
 
 import org.apache.lucene.russian.morphology.informations.Morph;
 
+import java.io.BufferedReader;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
 
 /**
  * Created by IntelliJ IDEA.
@@ -31,6 +34,17 @@ public class Test {
 
     public static void main(String[] args) throws IOException, ClassNotFoundException {
-//        Morph splitter = new Morph("sep.txt");
+        Morph splitter = new Morph("sep.txt");
+        InputStream stream = Test.class.getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/russian-text.txt");
+        // Decode explicitly as UTF-8 (as SpeedTest does) instead of relying on the platform default charset.
+        BufferedReader stream1 = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
+        try {
+            String s = stream1.readLine().trim().toLowerCase();
+            for (String w : s.split(" ")) {
+                System.out.println(splitter.getMorhInfo(w));
+            }
+        } finally {
+            stream1.close();
+        }
         System.gc();
         System.out.println("Ready");
         System.in.read();
diff --git a/src/main/java/org/apache/lucene/russian/morphology/informations/Heuristic.java b/src/main/java/org/apache/lucene/russian/morphology/informations/Heuristic.java
index 8f0c89b..570b308 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/informations/Heuristic.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/informations/Heuristic.java
@@ -24,6 +24,37 @@ public class Heuristic implements Serializable {
         this.normalFormMorphInfo = normalFormMorphInfo;
     }
 
+    /**
+     * Builds another form of {@code w} by dropping its last {@code actualSuffixLengh}
+     * characters and appending {@code actualNormalSuffix}.
+     *
+     * @param w word to transform; must be at least {@code actualSuffixLengh} characters long
+     * @return the shortened {@code w} followed by {@code actualNormalSuffix}
+     */
+    public String transofrmWord(String w) {
+        return w.substring(0, w.length() - actualSuffixLengh) + actualNormalSuffix;
+    }
+
+    /** Returns the number of trailing characters {@code transofrmWord} removes. */
+    public byte getActualSuffixLengh() {
+        return actualSuffixLengh;
+    }
+
+    /** Returns the suffix {@code transofrmWord} appends. */
+    public String getActualNormalSuffix() {
+        return actualNormalSuffix;
+    }
+
+    /** Returns the stored {@code formMorphInfo} value. */
+    public short getFormMorphInfo() {
+        return formMorphInfo;
+    }
+
+    /** Returns the stored {@code normalFormMorphInfo} value. */
+    public short getNormalFormMorphInfo() {
+        return normalFormMorphInfo;
+    }
+
     @Override
     public boolean equals(Object o) {
         if (this == o) return true;
diff --git a/src/main/java/org/apache/lucene/russian/morphology/informations/Morph.java b/src/main/java/org/apache/lucene/russian/morphology/informations/Morph.java
index 2ac081e..b6dd5f9 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/informations/Morph.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/informations/Morph.java
@@ -1,9 +1,13 @@
 package org.apache.lucene.russian.morphology.informations;
 
+import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
+
 import java.io.BufferedReader;
 import java.io.FileReader;
 import java.io.FileWriter;
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
 
 public class Morph {
 
@@ -24,6 +28,76 @@ public class Morph {
         this.grammaInfo = grammaInfo;
     }
 
+    /**
+     * Looks up the heuristics registered for {@code s} and applies each of them to the word.
+     * The word is reversed and encoded with {@code RussianSuffixDecoderEncoder.encodeToArray}
+     * before the rule lookup.
+     *
+     * @param s word to analyse
+     * @return one transformed word per heuristic of the matching rule; empty when {@code s}
+     *         sorts before every separator and therefore matches no rule
+     */
+    public List<String> getMorhInfo(String s) {
+        List<String> result = new ArrayList<String>();
+        int[] ints = RussianSuffixDecoderEncoder.encodeToArray(revertWord(s));
+        int ruleId = findRuleId(ints);
+        if (ruleId < 0) {
+            return result;
+        }
+        for (Heuristic h : rules[rulesId[ruleId]]) {
+            result.add(h.transofrmWord(s));
+        }
+        return result;
+    }
+
+    /**
+     * Binary-searches {@code separators} for the last entry that is not greater than
+     * {@code ints}, using the ordering defined by {@code compareToInts}.
+     * NOTE(review): assumes {@code separators} is sorted ascending in that ordering and that
+     * {@code rulesId[i]} describes the range starting at {@code separators[i]} - confirm
+     * against the code that writes the data file.
+     *
+     * @param ints encoded word to locate
+     * @return index of the greatest separator not greater than {@code ints}, or {@code -1}
+     *         when {@code ints} sorts before every separator or there are no separators
+     */
+    private int findRuleId(int[] ints) {
+        int low = 0;
+        int high = separators.length - 1;
+        while (low <= high) {
+            int mid = (low + high) >>> 1;
+            int comResult = compareToInts(ints, separators[mid]);
+            if (comResult > 0) {
+                low = mid + 1;
+            } else if (comResult < 0) {
+                high = mid - 1;
+            } else {
+                return mid;
+            }
+        }
+        // Everything left of low is smaller than ints and everything right of high is larger,
+        // so high (== low - 1) is the floor entry.
+        return high;
+    }
+
+    /**
+     * Compares two encoded words lexicographically, element by element.
+     *
+     * @param i1 left operand
+     * @param i2 right operand
+     * @return a negative value, zero or a positive value when {@code i1} sorts before, equal
+     *         to or after {@code i2}; an array that is a proper prefix of the other sorts first
+     */
+    private int compareToInts(int[] i1, int[] i2) {
+        int minLength = Math.min(i1.length, i2.length);
+        for (int i = 0; i < minLength; i++) {
+            if (i1[i] != i2[i]) {
+                return i1[i] < i2[i] ? -1 : 1;
+            }
+        }
+        return i1.length - i2.length;
+    }
+
     public void writeToFile(String fileName) throws IOException {
         FileWriter writer = new FileWriter(fileName);
         writer.write(separators.length + "\n");
@@ -87,4 +161,15 @@ public class Morph {
         }
         bufferedReader.close();
     }
+
+    /**
+     * Returns the characters of {@code s} in reverse order.
+     */
+    private String revertWord(String s) {
+        StringBuilder result = new StringBuilder(s.length());
+        for (int i = s.length() - 1; i >= 0; i--) {
+            result.append(s.charAt(i));
+        }
+        return result.toString();
+    }
 }
diff --git a/src/test/java/org/apache/lucene/russian/morphology/SpeedTest.java b/src/test/java/org/apache/lucene/russian/morphology/SpeedTest.java
index b840d5d..03382df 100644
--- a/src/test/java/org/apache/lucene/russian/morphology/SpeedTest.java
+++ b/src/test/java/org/apache/lucene/russian/morphology/SpeedTest.java
@@ -15,15 +15,36 @@
  */
 package org.apache.lucene.russian.morphology;
 
+import org.apache.lucene.russian.morphology.informations.Morph;
 import org.junit.Test;
 
+import java.io.BufferedReader;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
 
 
 public class SpeedTest {
 
     @Test
     public void getTestOfSpeed() throws IOException {
+        Morph splitter = new Morph("sep.txt");
+        // Resolve the resource through this class; the bare name Test refers to org.junit.Test here.
+        InputStream stream = SpeedTest.class.getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/russian-text.txt");
+        BufferedReader stream1 = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
+        try {
+            String s = stream1.readLine().trim().toLowerCase();
+            for (String w : s.split(" ")) {
+                try {
+                    System.out.println(w);
+                    System.out.println(splitter.getMorhInfo(w));
+                } catch (WrongCharaterException ignored) {
+                    // Best effort: skip words the morphology lookup rejects and keep going.
+                }
+            }
+        } finally {
+            stream1.close();
+        }
 //        Long startTime = System.currentTimeMillis();
 //        RussianMorphlogyAnalayzer morphlogyAnalayzer = new RussianMorphlogyAnalayzer();
 //        System.out.println("To build analayzer take " + (System.currentTimeMillis() - startTime) + " ms.");