some performance tuning

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@122 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
Alexander.A.Kuznetsov 2010-10-18 12:24:50 +00:00
parent 43136e0de1
commit d46651f2ba
11 changed files with 19 additions and 206 deletions

View File

@ -16,7 +16,7 @@
package org.apache.lucene.morphology.generator; package org.apache.lucene.morphology.generator;
import org.apache.lucene.morphology.EnglishLetterDecoderEncoder; import org.apache.lucene.morphology.english.EnglishLetterDecoderEncoder;
import org.apache.lucene.morphology.dictionary.*; import org.apache.lucene.morphology.dictionary.*;
import java.io.IOException; import java.io.IOException;

View File

@ -17,6 +17,9 @@ package org.apache.lucene;
import org.apache.lucene.morphology.*; import org.apache.lucene.morphology.*;
import org.apache.lucene.morphology.dictionary.*; import org.apache.lucene.morphology.dictionary.*;
import org.apache.lucene.morphology.english.EnglishLetterDecoderEncoder;
import org.apache.lucene.morphology.english.EnglishLuceneMorphology;
import org.apache.lucene.morphology.english.EnglishMorphology;
import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder; import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder;
import org.apache.lucene.morphology.russian.RussianLuceneMorphology; import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
import org.apache.lucene.morphology.russian.RussianMorphology; import org.apache.lucene.morphology.russian.RussianMorphology;
@ -46,7 +49,7 @@ public class TestAllWords {
@Test @Test
public void shouldEnglishMorphologyIncludeAllWordsFormsWithMorphInfo() throws IOException { public void shouldEnglishMorphologyIncludeAllWordsFormsWithMorphInfo() throws IOException {
final Morphology morphology = new EnglishMorphology(); final MorphologyImpl morphology = new EnglishMorphology();
LetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder(); LetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
String pathToGramma = prefix + "dictonary/Dicts/Morph/egramtab.tab"; String pathToGramma = prefix + "dictonary/Dicts/Morph/egramtab.tab";
String pathToDict = prefix + "dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd"; String pathToDict = prefix + "dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd";
@ -57,7 +60,7 @@ public class TestAllWords {
@Test @Test
public void shouldRussianMorphologyIncludeAllWordsFormsWithMorphInfo() throws IOException { public void shouldRussianMorphologyIncludeAllWordsFormsWithMorphInfo() throws IOException {
final Morphology morphology = new RussianMorphology(); final MorphologyImpl morphology = new RussianMorphology();
LetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder(); LetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
String pathToGramma = prefix + "dictonary/Dicts/Morph/rgramtab.tab"; String pathToGramma = prefix + "dictonary/Dicts/Morph/rgramtab.tab";
String pathToDict = prefix + "dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd"; String pathToDict = prefix + "dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd";
@ -65,7 +68,7 @@ public class TestAllWords {
testFullGramma(morphology, decoderEncoder, pathToGramma, pathToDict); testFullGramma(morphology, decoderEncoder, pathToGramma, pathToDict);
} }
private void testFullGramma(final Morphology morphology, LetterDecoderEncoder decoderEncoder, String pathToGramma, String pathToDict) throws IOException { private void testFullGramma(final MorphologyImpl morphology, LetterDecoderEncoder decoderEncoder, String pathToGramma, String pathToDict) throws IOException {
GrammarReader grammarInfo = new GrammarReader(pathToGramma); GrammarReader grammarInfo = new GrammarReader(pathToGramma);
final List<String> morphInfo = grammarInfo.getGrammarInfo(); final List<String> morphInfo = grammarInfo.getGrammarInfo();
final Map<String, Integer> inversIndex = grammarInfo.getGrammarInverseIndex(); final Map<String, Integer> inversIndex = grammarInfo.getGrammarInverseIndex();
@ -92,7 +95,6 @@ public class TestAllWords {
WordStringCleaner wordStringCleaner = new WordStringCleaner(decoderEncoder, wordCleaner); WordStringCleaner wordStringCleaner = new WordStringCleaner(decoderEncoder, wordCleaner);
RemoveFlexiaWithPrefixes removeFlexiaWithPrefixes = new RemoveFlexiaWithPrefixes(wordStringCleaner); RemoveFlexiaWithPrefixes removeFlexiaWithPrefixes = new RemoveFlexiaWithPrefixes(wordStringCleaner);
dictionaryReader.process(removeFlexiaWithPrefixes); dictionaryReader.process(removeFlexiaWithPrefixes);
long time = System.currentTimeMillis() - startTime; long time = System.currentTimeMillis() - startTime;
System.out.println("Done " + wordCount.get() + " in " + time + " ms. " + wordCount.get() / (time / 1000L) + " word per second"); System.out.println("Done " + wordCount.get() + " in " + time + " ms. " + wordCount.get() / (time / 1000L) + " word per second");
} }

View File

@ -16,6 +16,7 @@
package org.apache.lucene.morphology; package org.apache.lucene.morphology;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.morphology.english.EnglishAnalyzer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.morphology.russian.RussianAnalyzer; import org.apache.lucene.morphology.russian.RussianAnalyzer;

View File

@ -16,6 +16,7 @@
package org.apache.lucene.morphology; package org.apache.lucene.morphology;
import org.apache.lucene.morphology.russian.RussianLuceneMorphology; import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
import org.apache.lucene.morphology.english.EnglishLuceneMorphology;
import org.junit.Test; import org.junit.Test;
import java.io.BufferedReader; import java.io.BufferedReader;

View File

@ -1,29 +0,0 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology;
import org.apache.lucene.morphology.analyzer.MorphologyAnalyzer;
import java.io.IOException;
/**
 * Morphology-based analyzer for English text.
 *
 * <p>This class adds no behavior of its own: it only supplies a freshly
 * constructed {@link EnglishLuceneMorphology} to {@link MorphologyAnalyzer}.
 */
public class EnglishAnalyzer extends MorphologyAnalyzer {
/**
 * Creates the analyzer backed by a new {@link EnglishLuceneMorphology}.
 *
 * @throws IOException declared because constructing {@link EnglishLuceneMorphology}
 *                     (and the superclass) may throw it
 */
public EnglishAnalyzer() throws IOException {
super(new EnglishLuceneMorphology());
}
}

View File

@ -1,111 +0,0 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology;
import java.util.ArrayList;
/**
 * Packs short lower-case English strings into {@code int} values and back.
 *
 * <p>Every character is mapped to a base-28 digit: {@code 'a'..'z'} become
 * {@code 1..26}, the dash {@code '-'} becomes {@code 27} ({@link #DASH_CODE}) and
 * {@code 0} is used as padding. A string of up to six characters is stored
 * left-aligned in six base-28 digits, so the largest possible code
 * ({@code 28^6 - 1}) still fits into a positive {@code int}.
 */
public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder {
    public static final int ENGLISH_SMALL_LETTER_OFFSET = 96;
    // Kept public and non-final only for backward compatibility; the encoder itself
    // relies on the private constant below so that its behavior cannot be changed
    // by reassigning this field.
    static public int SUFFIX_LENGTH = 6;
    public static final int DASH_CHAR = 45;
    public static final int DASH_CODE = 27;

    /** Maximum number of characters packed into a single int. */
    private static final int WORD_PART_LENGTH = 6;
    /** Radix of the encoding: padding (0), 26 letters and the dash. */
    private static final int BASE = 28;
    /** Integer.MAX_VALUE is smaller than 28^7, so an int has at most 7 base-28 digits. */
    private static final int MAX_CODE_DIGITS = 7;

    /**
     * Encodes a string of at most six characters into a single integer.
     *
     * @param string the text to encode
     * @return the base-28 code, padded with zero digits on the right up to six digits
     * @throws SuffixToLongException  if {@code string} is longer than six characters
     * @throws WrongCharaterException if a character maps outside the digit range 0..27
     */
    public Integer encode(String string) {
        if (string.length() > WORD_PART_LENGTH) {
            throw new SuffixToLongException("Suffix length should not be greater than " + WORD_PART_LENGTH);
        }
        int result = 0;
        for (int i = 0; i < string.length(); i++) {
            char symbol = string.charAt(i);
            int code = (symbol == DASH_CHAR) ? DASH_CODE : symbol - ENGLISH_SMALL_LETTER_OFFSET;
            // NOTE(review): the accepted range is kept exactly as before, so '`' (digit 0)
            // and '{' (digit 27) still pass here although checkCharacter() rejects them.
            if (code < 0 || code > DASH_CODE) {
                throw new WrongCharaterException("Symbol " + symbol + " is not a small English letter or dash");
            }
            result = result * BASE + code;
        }
        // Left-align short strings by appending zero (padding) digits.
        for (int i = string.length(); i < WORD_PART_LENGTH; i++) {
            result *= BASE;
        }
        return result;
    }

    /**
     * Splits {@code s} into consecutive chunks of six characters and encodes each chunk.
     * An empty string yields a single element holding the code of the empty string.
     *
     * @param s the text to encode
     * @return one code per chunk, in the order the chunks appear in {@code s}
     */
    public int[] encodeToArray(String s) {
        int length = s.length();
        // At least one chunk is always produced, even for the empty string.
        int parts = (length == 0) ? 1 : (length + WORD_PART_LENGTH - 1) / WORD_PART_LENGTH;
        int[] codes = new int[parts];
        for (int part = 0; part < parts; part++) {
            int from = part * WORD_PART_LENGTH;
            int to = Math.min(from + WORD_PART_LENGTH, length);
            codes[part] = encode(s.substring(from, to));
        }
        return codes;
    }

    /**
     * Decodes every element of {@code array} and concatenates the results in order.
     *
     * @param array codes to decode
     * @return the concatenated decoded text
     */
    public String decodeArray(int[] array) {
        StringBuilder result = new StringBuilder(array.length * WORD_PART_LENGTH);
        for (int code : array) {
            result.append(decode(code));
        }
        return result.toString();
    }

    /**
     * Decodes a single integer back into text.
     *
     * <p>Zero digits met while the remaining value is still greater than 27 are treated
     * as padding and skipped; the most significant digit is always emitted.
     *
     * @param suffixN the code to decode
     * @return the decoded text
     */
    public String decode(Integer suffixN) {
        char[] buffer = new char[MAX_CODE_DIGITS];
        int position = buffer.length;
        int value = suffixN;
        // Digits come out least-significant first, so the buffer is filled from its end.
        while (value > DASH_CODE) {
            int digit = value % BASE;
            value /= BASE;
            if (digit != 0) {
                buffer[--position] = toLetter(digit);
            }
        }
        buffer[--position] = toLetter(value);
        return new String(buffer, position, buffer.length - position);
    }

    /** Maps a base-28 digit to its character: 27 is the dash, anything else is offset from 96. */
    private static char toLetter(int digit) {
        return digit == DASH_CODE ? (char) DASH_CHAR : (char) (digit + ENGLISH_SMALL_LETTER_OFFSET);
    }

    /**
     * Tells whether {@code c} is a dash or a lower-case letter {@code 'a'..'z'}.
     *
     * @param c the character to test
     * @return {@code true} if the character is accepted
     */
    public boolean checkCharacter(char c) {
        return c == DASH_CHAR || (c >= 'a' && c <= 'z');
    }

    /**
     * Tells whether every character of {@code word} passes {@link #checkCharacter(char)}.
     *
     * @param word the text to test
     * @return {@code true} if all characters are accepted (also for the empty string)
     */
    public boolean checkString(String word) {
        for (int i = 0; i < word.length(); i++) {
            if (!checkCharacter(word.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns the argument unchanged.
     *
     * @param s the text to clean
     * @return {@code s} itself
     */
    public String cleanString(String s) {
        return s;
    }
}

View File

@ -1,26 +0,0 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology;
import java.io.IOException;
/**
 * {@link LuceneMorphology} configured for English: it is fed the bundled
 * English morphology resource and an {@link EnglishLetterDecoderEncoder}.
 */
public class EnglishLuceneMorphology extends LuceneMorphology {
    /** Absolute classpath location of the English morphology data. */
    private static final String MORPH_INFO_RESOURCE = "/org/apache/lucene/morphology/english/morph.info";

    /**
     * Opens the English morphology resource and passes it, together with a new
     * English letter encoder, to the superclass.
     *
     * @throws IOException declared because the superclass constructor may throw it
     */
    public EnglishLuceneMorphology() throws IOException {
        super(
                EnglishLuceneMorphology.class.getResourceAsStream(MORPH_INFO_RESOURCE),
                new EnglishLetterDecoderEncoder());
    }
}

View File

@ -1,26 +0,0 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology;
import java.io.IOException;
/**
 * {@link MorphologyImpl} configured for English: it is fed the bundled
 * English morphology resource and an {@link EnglishLetterDecoderEncoder}.
 */
public class EnglishMorphology extends MorphologyImpl {
    /** Absolute classpath location of the English morphology data. */
    private static final String MORPH_INFO_RESOURCE = "/org/apache/lucene/morphology/english/morph.info";

    /**
     * Opens the English morphology resource and passes it, together with a new
     * English letter encoder, to the superclass. The resource is resolved through
     * {@code EnglishLuceneMorphology.class}.
     *
     * @throws IOException declared because the superclass constructor may throw it
     */
    public EnglishMorphology() throws IOException {
        super(
                EnglishLuceneMorphology.class.getResourceAsStream(MORPH_INFO_RESOURCE),
                new EnglishLetterDecoderEncoder());
    }
}

View File

@ -39,9 +39,9 @@ public class Heuristic implements Serializable {
this.normalFormMorphInfo = normalFormMorphInfo; this.normalFormMorphInfo = normalFormMorphInfo;
} }
public String transformWord(String w) { public StringBuilder transformWord(String w) {
if (w.length() - actualSuffixLength < 0) return w; if (w.length() - actualSuffixLength < 0) return new StringBuilder(w);
return w.substring(0, w.length() - actualSuffixLength) + actualNormalSuffix; return new StringBuilder(w.substring(0, w.length() - actualSuffixLength)).append(actualNormalSuffix);
} }
public byte getActualSuffixLength() { public byte getActualSuffixLength() {

View File

@ -51,7 +51,7 @@ public class MorphologyImpl implements Morphology {
int[] ints = decoderEncoder.encodeToArray(revertWord(s)); int[] ints = decoderEncoder.encodeToArray(revertWord(s));
int ruleId = findRuleId(ints); int ruleId = findRuleId(ints);
for (Heuristic h : rules[rulesId[ruleId]]) { for (Heuristic h : rules[rulesId[ruleId]]) {
result.add(h.transformWord(s)); result.add(h.transformWord(s).toString());
} }
return result; return result;
} }
@ -61,7 +61,7 @@ public class MorphologyImpl implements Morphology {
int[] ints = decoderEncoder.encodeToArray(revertWord(s)); int[] ints = decoderEncoder.encodeToArray(revertWord(s));
int ruleId = findRuleId(ints); int ruleId = findRuleId(ints);
for (Heuristic h : rules[rulesId[ruleId]]) { for (Heuristic h : rules[rulesId[ruleId]]) {
result.add(h.transformWord(s) + "|" + grammarInfo[h.getFormMorphInfo()]); result.add(h.transformWord(s).append("|").append(grammarInfo[h.getFormMorphInfo()]).toString());
} }
return result; return result;
} }
@ -192,10 +192,10 @@ public class MorphologyImpl implements Morphology {
} }
protected String revertWord(String s) { protected String revertWord(String s) {
String result = ""; StringBuilder result = new StringBuilder();
for (int i = 1; i <= s.length(); i++) { for (int i = 1; i <= s.length(); i++) {
result += s.charAt(s.length() - i); result.append(s.charAt(s.length() - i));
} }
return result; return result.toString();
} }
} }

View File

@ -21,6 +21,7 @@ import org.apache.lucene.morphology.SuffixToLongException;
import org.apache.lucene.morphology.WrongCharaterException; import org.apache.lucene.morphology.WrongCharaterException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.LinkedList;
/** /**
* This helper class allow encode suffix of russian word * This helper class allow encode suffix of russian word
@ -57,7 +58,7 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
} }
public int[] encodeToArray(String s) { public int[] encodeToArray(String s) {
ArrayList<Integer> integers = new ArrayList<Integer>(); LinkedList<Integer> integers = new LinkedList<Integer>();
while (s.length() > WORD_PART_LENGHT) { while (s.length() > WORD_PART_LENGHT) {
integers.add(encode(s.substring(0, WORD_PART_LENGHT))); integers.add(encode(s.substring(0, WORD_PART_LENGHT)));
s = s.substring(WORD_PART_LENGHT); s = s.substring(WORD_PART_LENGHT);