Some performance tuning
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@122 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
parent
43136e0de1
commit
d46651f2ba
@ -16,7 +16,7 @@
|
|||||||
|
|
||||||
package org.apache.lucene.morphology.generator;
|
package org.apache.lucene.morphology.generator;
|
||||||
|
|
||||||
import org.apache.lucene.morphology.EnglishLetterDecoderEncoder;
|
import org.apache.lucene.morphology.english.EnglishLetterDecoderEncoder;
|
||||||
import org.apache.lucene.morphology.dictionary.*;
|
import org.apache.lucene.morphology.dictionary.*;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
@ -17,6 +17,9 @@ package org.apache.lucene;
|
|||||||
|
|
||||||
import org.apache.lucene.morphology.*;
|
import org.apache.lucene.morphology.*;
|
||||||
import org.apache.lucene.morphology.dictionary.*;
|
import org.apache.lucene.morphology.dictionary.*;
|
||||||
|
import org.apache.lucene.morphology.english.EnglishLetterDecoderEncoder;
|
||||||
|
import org.apache.lucene.morphology.english.EnglishLuceneMorphology;
|
||||||
|
import org.apache.lucene.morphology.english.EnglishMorphology;
|
||||||
import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder;
|
import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder;
|
||||||
import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
|
import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
|
||||||
import org.apache.lucene.morphology.russian.RussianMorphology;
|
import org.apache.lucene.morphology.russian.RussianMorphology;
|
||||||
@ -46,7 +49,7 @@ public class TestAllWords {
|
|||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void shouldEnglishMorphologyIncludeAllWordsFormsWithMorphInfo() throws IOException {
|
public void shouldEnglishMorphologyIncludeAllWordsFormsWithMorphInfo() throws IOException {
|
||||||
final Morphology morphology = new EnglishMorphology();
|
final MorphologyImpl morphology = new EnglishMorphology();
|
||||||
LetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
|
LetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
|
||||||
String pathToGramma = prefix + "dictonary/Dicts/Morph/egramtab.tab";
|
String pathToGramma = prefix + "dictonary/Dicts/Morph/egramtab.tab";
|
||||||
String pathToDict = prefix + "dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd";
|
String pathToDict = prefix + "dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd";
|
||||||
@ -57,7 +60,7 @@ public class TestAllWords {
|
|||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void shouldRussianMorphologyIncludeAllWordsFormsWithMorphInfo() throws IOException {
|
public void shouldRussianMorphologyIncludeAllWordsFormsWithMorphInfo() throws IOException {
|
||||||
final Morphology morphology = new RussianMorphology();
|
final MorphologyImpl morphology = new RussianMorphology();
|
||||||
LetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
|
LetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
|
||||||
String pathToGramma = prefix + "dictonary/Dicts/Morph/rgramtab.tab";
|
String pathToGramma = prefix + "dictonary/Dicts/Morph/rgramtab.tab";
|
||||||
String pathToDict = prefix + "dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd";
|
String pathToDict = prefix + "dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd";
|
||||||
@ -65,7 +68,7 @@ public class TestAllWords {
|
|||||||
testFullGramma(morphology, decoderEncoder, pathToGramma, pathToDict);
|
testFullGramma(morphology, decoderEncoder, pathToGramma, pathToDict);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void testFullGramma(final Morphology morphology, LetterDecoderEncoder decoderEncoder, String pathToGramma, String pathToDict) throws IOException {
|
private void testFullGramma(final MorphologyImpl morphology, LetterDecoderEncoder decoderEncoder, String pathToGramma, String pathToDict) throws IOException {
|
||||||
GrammarReader grammarInfo = new GrammarReader(pathToGramma);
|
GrammarReader grammarInfo = new GrammarReader(pathToGramma);
|
||||||
final List<String> morphInfo = grammarInfo.getGrammarInfo();
|
final List<String> morphInfo = grammarInfo.getGrammarInfo();
|
||||||
final Map<String, Integer> inversIndex = grammarInfo.getGrammarInverseIndex();
|
final Map<String, Integer> inversIndex = grammarInfo.getGrammarInverseIndex();
|
||||||
@ -92,7 +95,6 @@ public class TestAllWords {
|
|||||||
WordStringCleaner wordStringCleaner = new WordStringCleaner(decoderEncoder, wordCleaner);
|
WordStringCleaner wordStringCleaner = new WordStringCleaner(decoderEncoder, wordCleaner);
|
||||||
RemoveFlexiaWithPrefixes removeFlexiaWithPrefixes = new RemoveFlexiaWithPrefixes(wordStringCleaner);
|
RemoveFlexiaWithPrefixes removeFlexiaWithPrefixes = new RemoveFlexiaWithPrefixes(wordStringCleaner);
|
||||||
dictionaryReader.process(removeFlexiaWithPrefixes);
|
dictionaryReader.process(removeFlexiaWithPrefixes);
|
||||||
|
|
||||||
long time = System.currentTimeMillis() - startTime;
|
long time = System.currentTimeMillis() - startTime;
|
||||||
System.out.println("Done " + wordCount.get() + " in " + time + " ms. " + wordCount.get() / (time / 1000L) + " word per second");
|
System.out.println("Done " + wordCount.get() + " in " + time + " ms. " + wordCount.get() / (time / 1000L) + " word per second");
|
||||||
}
|
}
|
||||||
|
@ -16,6 +16,7 @@
|
|||||||
package org.apache.lucene.morphology;
|
package org.apache.lucene.morphology;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.morphology.english.EnglishAnalyzer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
import org.apache.lucene.morphology.russian.RussianAnalyzer;
|
import org.apache.lucene.morphology.russian.RussianAnalyzer;
|
||||||
|
@ -16,6 +16,7 @@
|
|||||||
package org.apache.lucene.morphology;
|
package org.apache.lucene.morphology;
|
||||||
|
|
||||||
import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
|
import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
|
||||||
|
import org.apache.lucene.morphology.english.EnglishLuceneMorphology;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
|
@ -1,29 +0,0 @@
|
|||||||
/**
|
|
||||||
* Copyright 2009 Alexander Kuznetsov
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.morphology;
|
|
||||||
|
|
||||||
import org.apache.lucene.morphology.analyzer.MorphologyAnalyzer;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
|
|
||||||
public class EnglishAnalyzer extends MorphologyAnalyzer {
|
|
||||||
|
|
||||||
public EnglishAnalyzer() throws IOException {
|
|
||||||
super(new EnglishLuceneMorphology());
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,111 +0,0 @@
|
|||||||
/**
|
|
||||||
* Copyright 2009 Alexander Kuznetsov
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.morphology;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
|
|
||||||
|
|
||||||
public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder {
|
|
||||||
public static final int ENGLISH_SMALL_LETTER_OFFSET = 96;
|
|
||||||
static public int SUFFIX_LENGTH = 6;
|
|
||||||
public static final int DASH_CHAR = 45;
|
|
||||||
public static final int DASH_CODE = 27;
|
|
||||||
|
|
||||||
public Integer encode(String string) {
|
|
||||||
if (string.length() > 6) throw new SuffixToLongException("Suffix length should not be greater then " + 12);
|
|
||||||
int result = 0;
|
|
||||||
for (int i = 0; i < string.length(); i++) {
|
|
||||||
int c = 0 + string.charAt(i) - ENGLISH_SMALL_LETTER_OFFSET;
|
|
||||||
if (c == 45 - ENGLISH_SMALL_LETTER_OFFSET) {
|
|
||||||
c = DASH_CODE;
|
|
||||||
}
|
|
||||||
if (c < 0 || c > 27)
|
|
||||||
throw new WrongCharaterException("Symblo " + string.charAt(i) + " is not small cirillic letter");
|
|
||||||
result = result * 28 + c;
|
|
||||||
}
|
|
||||||
for (int i = string.length(); i < 6; i++) {
|
|
||||||
result *= 28;
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int[] encodeToArray(String s) {
|
|
||||||
|
|
||||||
ArrayList<Integer> integers = new ArrayList<Integer>();
|
|
||||||
while (s.length() > 6) {
|
|
||||||
integers.add(encode(s.substring(0, 6)));
|
|
||||||
s = s.substring(6);
|
|
||||||
}
|
|
||||||
integers.add(encode(s));
|
|
||||||
int[] ints = new int[integers.size()];
|
|
||||||
int pos = 0;
|
|
||||||
for (Integer i : integers) {
|
|
||||||
ints[pos] = i;
|
|
||||||
pos++;
|
|
||||||
}
|
|
||||||
return ints;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String decodeArray(int[] array) {
|
|
||||||
String result = "";
|
|
||||||
for (int i : array) {
|
|
||||||
result += decode(i);
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public String decode(Integer suffixN) {
|
|
||||||
String result = "";
|
|
||||||
while (suffixN > 27) {
|
|
||||||
int c = suffixN % 28 + ENGLISH_SMALL_LETTER_OFFSET;
|
|
||||||
if (c == ENGLISH_SMALL_LETTER_OFFSET) {
|
|
||||||
suffixN /= 28;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR;
|
|
||||||
result = (char) c + result;
|
|
||||||
suffixN /= 28;
|
|
||||||
}
|
|
||||||
long c = suffixN + ENGLISH_SMALL_LETTER_OFFSET;
|
|
||||||
if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR;
|
|
||||||
result = (char) c + result;
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean checkCharacter(char c) {
|
|
||||||
int code = 0 + c;
|
|
||||||
if (code == 45) return true;
|
|
||||||
code -= ENGLISH_SMALL_LETTER_OFFSET;
|
|
||||||
if (code > 0 && code < 27) return true;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public boolean checkString(String word) {
|
|
||||||
for (int i = 0; i < word.length(); i++) {
|
|
||||||
if (!checkCharacter(word.charAt(i))) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String cleanString(String s) {
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,26 +0,0 @@
|
|||||||
/**
|
|
||||||
* Copyright 2009 Alexander Kuznetsov
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.morphology;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
|
|
||||||
public class EnglishLuceneMorphology extends LuceneMorphology {
|
|
||||||
|
|
||||||
public EnglishLuceneMorphology() throws IOException {
|
|
||||||
super(EnglishLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder());
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,26 +0,0 @@
|
|||||||
/**
|
|
||||||
* Copyright 2009 Alexander Kuznetsov
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.morphology;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
|
|
||||||
public class EnglishMorphology extends MorphologyImpl {
|
|
||||||
|
|
||||||
public EnglishMorphology() throws IOException {
|
|
||||||
super(EnglishLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder());
|
|
||||||
}
|
|
||||||
}
|
|
@ -39,9 +39,9 @@ public class Heuristic implements Serializable {
|
|||||||
this.normalFormMorphInfo = normalFormMorphInfo;
|
this.normalFormMorphInfo = normalFormMorphInfo;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String transformWord(String w) {
|
public StringBuilder transformWord(String w) {
|
||||||
if (w.length() - actualSuffixLength < 0) return w;
|
if (w.length() - actualSuffixLength < 0) return new StringBuilder(w);
|
||||||
return w.substring(0, w.length() - actualSuffixLength) + actualNormalSuffix;
|
return new StringBuilder(w.substring(0, w.length() - actualSuffixLength)).append(actualNormalSuffix);
|
||||||
}
|
}
|
||||||
|
|
||||||
public byte getActualSuffixLength() {
|
public byte getActualSuffixLength() {
|
||||||
|
@ -51,7 +51,7 @@ public class MorphologyImpl implements Morphology {
|
|||||||
int[] ints = decoderEncoder.encodeToArray(revertWord(s));
|
int[] ints = decoderEncoder.encodeToArray(revertWord(s));
|
||||||
int ruleId = findRuleId(ints);
|
int ruleId = findRuleId(ints);
|
||||||
for (Heuristic h : rules[rulesId[ruleId]]) {
|
for (Heuristic h : rules[rulesId[ruleId]]) {
|
||||||
result.add(h.transformWord(s));
|
result.add(h.transformWord(s).toString());
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
@ -61,7 +61,7 @@ public class MorphologyImpl implements Morphology {
|
|||||||
int[] ints = decoderEncoder.encodeToArray(revertWord(s));
|
int[] ints = decoderEncoder.encodeToArray(revertWord(s));
|
||||||
int ruleId = findRuleId(ints);
|
int ruleId = findRuleId(ints);
|
||||||
for (Heuristic h : rules[rulesId[ruleId]]) {
|
for (Heuristic h : rules[rulesId[ruleId]]) {
|
||||||
result.add(h.transformWord(s) + "|" + grammarInfo[h.getFormMorphInfo()]);
|
result.add(h.transformWord(s).append("|").append(grammarInfo[h.getFormMorphInfo()]).toString());
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
@ -192,10 +192,10 @@ public class MorphologyImpl implements Morphology {
|
|||||||
}
|
}
|
||||||
|
|
||||||
protected String revertWord(String s) {
|
protected String revertWord(String s) {
|
||||||
String result = "";
|
StringBuilder result = new StringBuilder();
|
||||||
for (int i = 1; i <= s.length(); i++) {
|
for (int i = 1; i <= s.length(); i++) {
|
||||||
result += s.charAt(s.length() - i);
|
result.append(s.charAt(s.length() - i));
|
||||||
}
|
}
|
||||||
return result;
|
return result.toString();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -21,6 +21,7 @@ import org.apache.lucene.morphology.SuffixToLongException;
|
|||||||
import org.apache.lucene.morphology.WrongCharaterException;
|
import org.apache.lucene.morphology.WrongCharaterException;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This helper class allow encode suffix of russian word
|
* This helper class allow encode suffix of russian word
|
||||||
@ -57,7 +58,7 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public int[] encodeToArray(String s) {
|
public int[] encodeToArray(String s) {
|
||||||
ArrayList<Integer> integers = new ArrayList<Integer>();
|
LinkedList<Integer> integers = new LinkedList<Integer>();
|
||||||
while (s.length() > WORD_PART_LENGHT) {
|
while (s.length() > WORD_PART_LENGHT) {
|
||||||
integers.add(encode(s.substring(0, WORD_PART_LENGHT)));
|
integers.add(encode(s.substring(0, WORD_PART_LENGHT)));
|
||||||
s = s.substring(WORD_PART_LENGHT);
|
s = s.substring(WORD_PART_LENGHT);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user