some performance tuning

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@122 d817d54c-26ab-11de-abc9-2f7d1455ff7a
2010-10-18 12:24:50 +00:00
parent 43136e0de1
commit d46651f2ba
11 changed files with 19 additions and 206 deletions
@@ -16,7 +16,7 @@
 package org.apache.lucene.morphology.generator;
-import org.apache.lucene.morphology.EnglishLetterDecoderEncoder;
+import org.apache.lucene.morphology.english.EnglishLetterDecoderEncoder;
 import org.apache.lucene.morphology.dictionary.*;
 import java.io.IOException;
@@ -17,6 +17,9 @@ package org.apache.lucene;
 import org.apache.lucene.morphology.*;
 import org.apache.lucene.morphology.dictionary.*;
 import org.apache.lucene.morphology.english.EnglishLetterDecoderEncoder;
 import org.apache.lucene.morphology.english.EnglishLuceneMorphology;
 import org.apache.lucene.morphology.english.EnglishMorphology;
 import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder;
 import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
 import org.apache.lucene.morphology.russian.RussianMorphology;
@@ -46,7 +49,7 @@ public class TestAllWords {
    @Test
    public void shouldEnglishMorphologyIncludeAllWordsFormsWithMorphInfo() throws IOException {
-        final Morphology morphology = new EnglishMorphology();
+        final MorphologyImpl morphology = new EnglishMorphology();
        LetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
        String pathToGramma = prefix + "dictonary/Dicts/Morph/egramtab.tab";
        String pathToDict = prefix + "dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd";
@@ -57,7 +60,7 @@ public class TestAllWords {
    @Test
    public void shouldRussianMorphologyIncludeAllWordsFormsWithMorphInfo() throws IOException {
-        final Morphology morphology = new RussianMorphology();
+        final MorphologyImpl morphology = new RussianMorphology();
        LetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
        String pathToGramma = prefix + "dictonary/Dicts/Morph/rgramtab.tab";
        String pathToDict = prefix + "dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd";
@@ -65,7 +68,7 @@ public class TestAllWords {
        testFullGramma(morphology, decoderEncoder, pathToGramma, pathToDict);
    }
-    private void testFullGramma(final Morphology morphology, LetterDecoderEncoder decoderEncoder, String pathToGramma, String pathToDict) throws IOException {
+    private void testFullGramma(final MorphologyImpl morphology, LetterDecoderEncoder decoderEncoder, String pathToGramma, String pathToDict) throws IOException {
        GrammarReader grammarInfo = new GrammarReader(pathToGramma);
        final List<String> morphInfo = grammarInfo.getGrammarInfo();
        final Map<String, Integer> inversIndex = grammarInfo.getGrammarInverseIndex();
@@ -92,7 +95,6 @@ public class TestAllWords {
        WordStringCleaner wordStringCleaner = new WordStringCleaner(decoderEncoder, wordCleaner);
        RemoveFlexiaWithPrefixes removeFlexiaWithPrefixes = new RemoveFlexiaWithPrefixes(wordStringCleaner);
        dictionaryReader.process(removeFlexiaWithPrefixes);
        long time = System.currentTimeMillis() - startTime;
        System.out.println("Done " + wordCount.get() + " in " + time + " ms. " + wordCount.get() / (time / 1000L) + " word per second");
    }
@@ -16,6 +16,7 @@
 package org.apache.lucene.morphology;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.morphology.english.EnglishAnalyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.morphology.russian.RussianAnalyzer;
@@ -16,6 +16,7 @@
 package org.apache.lucene.morphology;
 import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
 import org.apache.lucene.morphology.english.EnglishLuceneMorphology;
 import org.junit.Test;
 import java.io.BufferedReader;
@@ -1,29 +0,0 @@
 /**
 * Copyright 2009 Alexander Kuznetsov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.morphology;
 import org.apache.lucene.morphology.analyzer.MorphologyAnalyzer;
 import java.io.IOException;
 /**
 * A {@link MorphologyAnalyzer} that is wired to an {@link EnglishLuceneMorphology}.
 */
 public class EnglishAnalyzer extends MorphologyAnalyzer {
    /**
     * Creates the analyzer on top of a newly constructed {@link EnglishLuceneMorphology}.
     *
     * @throws IOException declared by this constructor; presumably raised when the
     *                     morphology data cannot be loaded (confirm against the superclass)
     */
    public EnglishAnalyzer() throws IOException {
        super(new EnglishLuceneMorphology());
    }
 }
@@ -1,111 +0,0 @@
 /**
 * Copyright 2009 Alexander Kuznetsov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.morphology;
 import java.util.ArrayList;
 /**
 * Packs strings made of small English letters ({@code a}-{@code z}) and the dash
 * into integers and back.
 *
 * Every character is mapped to a code in the range 1..27 (letters are 1..26, the dash
 * is 27) and a chunk of up to six characters is stored as a base-28 number. Code 0 is
 * the padding that fills a chunk shorter than six characters.
 */
 public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder {
    public static final int ENGLISH_SMALL_LETTER_OFFSET = 96;
    // Kept as declared for backward compatibility; the encoder itself relies on CHUNK_LENGTH.
    static public int SUFFIX_LENGTH = 6;
    public static final int DASH_CHAR = 45;
    public static final int DASH_CODE = 27;

    /** Number of characters packed into one integer. */
    private static final int CHUNK_LENGTH = 6;
    /** Radix of the packed number: padding (0), 26 letters and the dash. */
    private static final int ALPHABET_SIZE = 28;

    /**
     * Encodes a string of at most six characters into a single integer.
     *
     * @param string the text to encode; only small English letters and the dash are allowed
     * @return the base-28 representation, right-padded with zero codes up to six positions
     * @throws SuffixToLongException  if the string is longer than six characters
     * @throws WrongCharaterException if the string contains a character rejected by
     *                                {@link #checkCharacter(char)}
     */
    public Integer encode(String string) {
        if (string.length() > CHUNK_LENGTH) {
            throw new SuffixToLongException("Suffix length should not be greater than " + CHUNK_LENGTH);
        }
        int result = 0;
        for (int i = 0; i < string.length(); i++) {
            char ch = string.charAt(i);
            // Reject everything checkCharacter rejects, so that '`' cannot alias the
            // padding code and '{' cannot alias the dash code.
            if (!checkCharacter(ch)) {
                throw new WrongCharaterException("Symbol " + ch + " is not a small English letter or dash");
            }
            int code = ch == DASH_CHAR ? DASH_CODE : ch - ENGLISH_SMALL_LETTER_OFFSET;
            result = result * ALPHABET_SIZE + code;
        }
        // Pad short input so that the first character always lands in the highest position.
        for (int i = string.length(); i < CHUNK_LENGTH; i++) {
            result *= ALPHABET_SIZE;
        }
        return result;
    }

    /**
     * Encodes a string of any length as a sequence of six-character chunks.
     *
     * @param s the text to encode
     * @return one integer per chunk, in order; the empty string yields a single element
     */
    public int[] encodeToArray(String s) {
        // At least one chunk is always produced, even for the empty string.
        int chunks = Math.max(1, (s.length() + CHUNK_LENGTH - 1) / CHUNK_LENGTH);
        int[] codes = new int[chunks];
        for (int i = 0; i < chunks; i++) {
            int from = i * CHUNK_LENGTH;
            int to = Math.min(s.length(), from + CHUNK_LENGTH);
            codes[i] = encode(s.substring(from, to));
        }
        return codes;
    }

    /**
     * Decodes a sequence of chunks produced by {@link #encodeToArray(String)}.
     *
     * @param array the encoded chunks
     * @return the concatenation of the decoded chunks
     */
    public String decodeArray(int[] array) {
        StringBuilder result = new StringBuilder(array.length * CHUNK_LENGTH);
        for (int code : array) {
            result.append(decode(code));
        }
        return result.toString();
    }

    /**
     * Decodes a single chunk produced by {@link #encode(String)}.
     *
     * @param suffixN the encoded chunk
     * @return the decoded text; padding positions are dropped, so {@code 0} decodes to
     *         the empty string
     */
    public String decode(Integer suffixN) {
        StringBuilder reversed = new StringBuilder(CHUNK_LENGTH);
        int rest = suffixN;
        // Digits come out lowest position first, i.e. last character first.
        while (rest > 0) {
            int code = rest % ALPHABET_SIZE;
            rest /= ALPHABET_SIZE;
            if (code == 0) {
                continue; // padding
            }
            reversed.append(code == DASH_CODE ? (char) DASH_CHAR : (char) (code + ENGLISH_SMALL_LETTER_OFFSET));
        }
        return reversed.reverse().toString();
    }

    /**
     * Tells whether a character can be encoded.
     *
     * @param c the character to test
     * @return {@code true} for the dash and for {@code a}-{@code z}
     */
    public boolean checkCharacter(char c) {
        if (c == DASH_CHAR) {
            return true;
        }
        int code = c - ENGLISH_SMALL_LETTER_OFFSET;
        return code > 0 && code < DASH_CODE;
    }

    /**
     * Tells whether every character of a word can be encoded.
     *
     * @param word the word to test
     * @return {@code true} if all characters pass {@link #checkCharacter(char)}
     */
    public boolean checkString(String word) {
        for (int i = 0; i < word.length(); i++) {
            if (!checkCharacter(word.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns the argument unchanged.
     *
     * @param s the string to clean
     * @return {@code s} itself
     */
    public String cleanString(String s) {
        return s;
    }
 }
@@ -1,26 +0,0 @@
 /**
 * Copyright 2009 Alexander Kuznetsov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.morphology;
 import java.io.IOException;
 /**
 * A {@link LuceneMorphology} initialised from the English {@code morph.info} classpath resource.
 */
 public class EnglishLuceneMorphology extends LuceneMorphology {
    /**
     * Opens {@code /org/apache/lucene/morphology/english/morph.info} as a classpath resource
     * and hands the stream, together with a new {@link EnglishLetterDecoderEncoder}, to the
     * superclass constructor.
     *
     * @throws IOException declared by this constructor; presumably raised when the resource
     *                     cannot be read (confirm against the superclass)
     */
    public EnglishLuceneMorphology() throws IOException {
        // NOTE(review): the stream is not closed here; confirm that the superclass closes it.
        super(EnglishLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder());
    }
 }
@@ -1,26 +0,0 @@
 /**
 * Copyright 2009 Alexander Kuznetsov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.morphology;
 import java.io.IOException;
 /**
 * A {@link MorphologyImpl} initialised from the English {@code morph.info} classpath resource.
 */
 public class EnglishMorphology extends MorphologyImpl {
    /**
     * Opens {@code /org/apache/lucene/morphology/english/morph.info} as a classpath resource
     * and hands the stream, together with a new {@link EnglishLetterDecoderEncoder}, to the
     * superclass constructor.
     *
     * @throws IOException declared by this constructor; presumably raised when the resource
     *                     cannot be read (confirm against the superclass)
     */
    public EnglishMorphology() throws IOException {
        // The resource is looked up through EnglishLuceneMorphology.class rather than this class;
        // the path is absolute, so the lookup does not depend on this class's package.
        // NOTE(review): the stream is not closed here; confirm that the superclass closes it.
        super(EnglishLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder());
    }
 }
@@ -39,9 +39,9 @@ public class Heuristic implements Serializable {
        this.normalFormMorphInfo = normalFormMorphInfo;
    }
-    public String transformWord(String w) {
+    public StringBuilder transformWord(String w) {
-        if (w.length() - actualSuffixLength < 0) return w;
+        if (w.length() - actualSuffixLength < 0) return new StringBuilder(w);
-        return w.substring(0, w.length() - actualSuffixLength) + actualNormalSuffix;
+        return new StringBuilder(w.substring(0, w.length() - actualSuffixLength)).append(actualNormalSuffix);
    }
    public byte getActualSuffixLength() {
@@ -51,7 +51,7 @@ public class MorphologyImpl implements Morphology {
        int[] ints = decoderEncoder.encodeToArray(revertWord(s));
        int ruleId = findRuleId(ints);
        for (Heuristic h : rules[rulesId[ruleId]]) {
-            result.add(h.transformWord(s));
+            result.add(h.transformWord(s).toString());
        }
        return result;
    }
@@ -61,7 +61,7 @@ public class MorphologyImpl implements Morphology {
        int[] ints = decoderEncoder.encodeToArray(revertWord(s));
        int ruleId = findRuleId(ints);
        for (Heuristic h : rules[rulesId[ruleId]]) {
-            result.add(h.transformWord(s) + "|" + grammarInfo[h.getFormMorphInfo()]);
+            result.add(h.transformWord(s).append("|").append(grammarInfo[h.getFormMorphInfo()]).toString());
        }
        return result;
    }
@@ -192,10 +192,10 @@ public class MorphologyImpl implements Morphology {
    }
    protected String revertWord(String s) {
-        String result = "";
+        StringBuilder result = new StringBuilder();
        for (int i = 1; i <= s.length(); i++) {
-            result += s.charAt(s.length() - i);
+            result.append(s.charAt(s.length() - i));
        }
-        return result;
+        return result.toString();
    }
 }
@@ -21,6 +21,7 @@ import org.apache.lucene.morphology.SuffixToLongException;
 import org.apache.lucene.morphology.WrongCharaterException;
 import java.util.ArrayList;
 import java.util.LinkedList;
 /**
 * This helper class allow encode suffix of russian word
@@ -57,7 +58,7 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
    }
    public int[] encodeToArray(String s) {
-        ArrayList<Integer> integers = new ArrayList<Integer>();
+        LinkedList<Integer> integers = new LinkedList<Integer>();
        while (s.length() > WORD_PART_LENGHT) {
            integers.add(encode(s.substring(0, WORD_PART_LENGHT)));
            s = s.substring(WORD_PART_LENGHT);