diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/EnglishHeuristicBuilder.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/EnglishHeuristicBuilder.java index f8d9709..db65d15 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/EnglishHeuristicBuilder.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/EnglishHeuristicBuilder.java @@ -16,7 +16,7 @@ package org.apache.lucene.morphology.generator; -import org.apache.lucene.morphology.EnglishLetterDecoderEncoder; +import org.apache.lucene.morphology.english.EnglishLetterDecoderEncoder; import org.apache.lucene.morphology.dictionary.*; import java.io.IOException; diff --git a/dictionary-reader/src/test/java/org/apache/lucene/TestAllWords.java b/dictionary-reader/src/test/java/org/apache/lucene/TestAllWords.java index bf7bc89..242a073 100644 --- a/dictionary-reader/src/test/java/org/apache/lucene/TestAllWords.java +++ b/dictionary-reader/src/test/java/org/apache/lucene/TestAllWords.java @@ -17,6 +17,9 @@ package org.apache.lucene; import org.apache.lucene.morphology.*; import org.apache.lucene.morphology.dictionary.*; +import org.apache.lucene.morphology.english.EnglishLetterDecoderEncoder; +import org.apache.lucene.morphology.english.EnglishLuceneMorphology; +import org.apache.lucene.morphology.english.EnglishMorphology; import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder; import org.apache.lucene.morphology.russian.RussianLuceneMorphology; import org.apache.lucene.morphology.russian.RussianMorphology; @@ -46,7 +49,7 @@ public class TestAllWords { @Test public void shouldEnglishMorphologyIncludeAllWordsFormsWithMorphInfo() throws IOException { - final Morphology morphology = new EnglishMorphology(); + final MorphologyImpl morphology = new EnglishMorphology(); LetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder(); String pathToGramma = prefix + "dictonary/Dicts/Morph/egramtab.tab"; String pathToDict = prefix + "dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd"; @@ -57,7 +60,7 @@ public class TestAllWords { @Test public void shouldRussianMorphologyIncludeAllWordsFormsWithMorphInfo() throws IOException { - final Morphology morphology = new RussianMorphology(); + final MorphologyImpl morphology = new RussianMorphology(); LetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder(); String pathToGramma = prefix + "dictonary/Dicts/Morph/rgramtab.tab"; String pathToDict = prefix + "dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd"; @@ -65,7 +68,7 @@ public class TestAllWords { testFullGramma(morphology, decoderEncoder, pathToGramma, pathToDict); } - private void testFullGramma(final Morphology morphology, LetterDecoderEncoder decoderEncoder, String pathToGramma, String pathToDict) throws IOException { + private void testFullGramma(final MorphologyImpl morphology, LetterDecoderEncoder decoderEncoder, String pathToGramma, String pathToDict) throws IOException { GrammarReader grammarInfo = new GrammarReader(pathToGramma); final List morphInfo = grammarInfo.getGrammarInfo(); final Map inversIndex = grammarInfo.getGrammarInverseIndex(); @@ -92,7 +95,6 @@ public class TestAllWords { WordStringCleaner wordStringCleaner = new WordStringCleaner(decoderEncoder, wordCleaner); RemoveFlexiaWithPrefixes removeFlexiaWithPrefixes = new RemoveFlexiaWithPrefixes(wordStringCleaner); dictionaryReader.process(removeFlexiaWithPrefixes); - long time = System.currentTimeMillis() - startTime; System.out.println("Done " + wordCount.get() + " in " + time + " ms. " + wordCount.get() / (time / 1000L) + " word per second"); } diff --git a/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalyzersTest.java b/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalyzersTest.java index 430e275..3cb2794 100644 --- a/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalyzersTest.java +++ b/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalyzersTest.java @@ -16,6 +16,7 @@ package org.apache.lucene.morphology; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.morphology.english.EnglishAnalyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.morphology.russian.RussianAnalyzer; diff --git a/dictionary-reader/src/test/java/org/apache/lucene/morphology/LuceneMorphTest.java b/dictionary-reader/src/test/java/org/apache/lucene/morphology/LuceneMorphTest.java index eb91c2c..2faff21 100644 --- a/dictionary-reader/src/test/java/org/apache/lucene/morphology/LuceneMorphTest.java +++ b/dictionary-reader/src/test/java/org/apache/lucene/morphology/LuceneMorphTest.java @@ -16,6 +16,7 @@ package org.apache.lucene.morphology; import org.apache.lucene.morphology.russian.RussianLuceneMorphology; +import org.apache.lucene.morphology.english.EnglishLuceneMorphology; import org.junit.Test; import java.io.BufferedReader; diff --git a/english/src/main/java/org/apache/lucene/morphology/EnglishAnalyzer.java b/english/src/main/java/org/apache/lucene/morphology/EnglishAnalyzer.java deleted file mode 100644 index 50f40da..0000000 --- a/english/src/main/java/org/apache/lucene/morphology/EnglishAnalyzer.java +++ /dev/null @@ -1,29 +0,0 @@ -/** - * Copyright 2009 Alexander Kuznetsov - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.morphology; - -import org.apache.lucene.morphology.analyzer.MorphologyAnalyzer; - -import java.io.IOException; - - -public class EnglishAnalyzer extends MorphologyAnalyzer { - - public EnglishAnalyzer() throws IOException { - super(new EnglishLuceneMorphology()); - } - -} \ No newline at end of file diff --git a/english/src/main/java/org/apache/lucene/morphology/EnglishLetterDecoderEncoder.java b/english/src/main/java/org/apache/lucene/morphology/EnglishLetterDecoderEncoder.java deleted file mode 100644 index c808b72..0000000 --- a/english/src/main/java/org/apache/lucene/morphology/EnglishLetterDecoderEncoder.java +++ /dev/null @@ -1,111 +0,0 @@ -/** - * Copyright 2009 Alexander Kuznetsov - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.morphology; - -import java.util.ArrayList; - - -public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder { - public static final int ENGLISH_SMALL_LETTER_OFFSET = 96; - static public int SUFFIX_LENGTH = 6; - public static final int DASH_CHAR = 45; - public static final int DASH_CODE = 27; - - public Integer encode(String string) { - if (string.length() > 6) throw new SuffixToLongException("Suffix length should not be greater then " + 12); - int result = 0; - for (int i = 0; i < string.length(); i++) { - int c = 0 + string.charAt(i) - ENGLISH_SMALL_LETTER_OFFSET; - if (c == 45 - ENGLISH_SMALL_LETTER_OFFSET) { - c = DASH_CODE; - } - if (c < 0 || c > 27) - throw new WrongCharaterException("Symblo " + string.charAt(i) + " is not small cirillic letter"); - result = result * 28 + c; - } - for (int i = string.length(); i < 6; i++) { - result *= 28; - } - return result; - } - - public int[] encodeToArray(String s) { - - ArrayList integers = new ArrayList(); - while (s.length() > 6) { - integers.add(encode(s.substring(0, 6))); - s = s.substring(6); - } - integers.add(encode(s)); - int[] ints = new int[integers.size()]; - int pos = 0; - for (Integer i : integers) { - ints[pos] = i; - pos++; - } - return ints; - } - - public String decodeArray(int[] array) { - String result = ""; - for (int i : array) { - result += decode(i); - } - return result; - } - - - public String decode(Integer suffixN) { - String result = ""; - while (suffixN > 27) { - int c = suffixN % 28 + ENGLISH_SMALL_LETTER_OFFSET; - if (c == ENGLISH_SMALL_LETTER_OFFSET) { - suffixN /= 28; - continue; - } - if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR; - result = (char) c + result; - suffixN /= 28; - } - long c = suffixN + ENGLISH_SMALL_LETTER_OFFSET; - if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR; - result = (char) c + result; - return result; - } - - public boolean checkCharacter(char c) { - int code = 0 + c; - if (code == 45) return true; - code -= ENGLISH_SMALL_LETTER_OFFSET; - if (code > 0 && code < 27) return true; - return false; - } - - - public boolean checkString(String word) { - for (int i = 0; i < word.length(); i++) { - if (!checkCharacter(word.charAt(i))) { - return false; - } - } - return true; - } - - public String cleanString(String s) { - return s; - } - -} diff --git a/english/src/main/java/org/apache/lucene/morphology/EnglishLuceneMorphology.java b/english/src/main/java/org/apache/lucene/morphology/EnglishLuceneMorphology.java deleted file mode 100644 index b2c3f31..0000000 --- a/english/src/main/java/org/apache/lucene/morphology/EnglishLuceneMorphology.java +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Copyright 2009 Alexander Kuznetsov - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.morphology; - -import java.io.IOException; - - -public class EnglishLuceneMorphology extends LuceneMorphology { - - public EnglishLuceneMorphology() throws IOException { - super(EnglishLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder()); - } -} \ No newline at end of file diff --git a/english/src/main/java/org/apache/lucene/morphology/EnglishMorphology.java b/english/src/main/java/org/apache/lucene/morphology/EnglishMorphology.java deleted file mode 100644 index e3e9e29..0000000 --- a/english/src/main/java/org/apache/lucene/morphology/EnglishMorphology.java +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Copyright 2009 Alexander Kuznetsov - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.morphology; - -import java.io.IOException; - - -public class EnglishMorphology extends MorphologyImpl { - - public EnglishMorphology() throws IOException { - super(EnglishLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder()); - } -} diff --git a/morph/src/main/java/org/apache/lucene/morphology/Heuristic.java b/morph/src/main/java/org/apache/lucene/morphology/Heuristic.java index 4335f30..be8ea82 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/Heuristic.java +++ b/morph/src/main/java/org/apache/lucene/morphology/Heuristic.java @@ -39,9 +39,9 @@ public class Heuristic implements Serializable { this.normalFormMorphInfo = normalFormMorphInfo; } - public String transformWord(String w) { - if (w.length() - actualSuffixLength < 0) return w; - return w.substring(0, w.length() - actualSuffixLength) + actualNormalSuffix; + public StringBuilder transformWord(String w) { + if (w.length() - actualSuffixLength < 0) return new StringBuilder(w); + return new StringBuilder(w.substring(0, w.length() - actualSuffixLength)).append(actualNormalSuffix); } public byte getActualSuffixLength() { diff --git a/morph/src/main/java/org/apache/lucene/morphology/MorphologyImpl.java b/morph/src/main/java/org/apache/lucene/morphology/MorphologyImpl.java index ff966b6..b1e7580 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/MorphologyImpl.java +++ b/morph/src/main/java/org/apache/lucene/morphology/MorphologyImpl.java @@ -51,7 +51,7 @@ public class MorphologyImpl implements Morphology { int[] ints = decoderEncoder.encodeToArray(revertWord(s)); int ruleId = findRuleId(ints); for (Heuristic h : rules[rulesId[ruleId]]) { - result.add(h.transformWord(s)); + result.add(h.transformWord(s).toString()); } return result; } @@ -61,7 +61,7 @@ public class MorphologyImpl implements Morphology { int[] ints = decoderEncoder.encodeToArray(revertWord(s)); int ruleId = findRuleId(ints); for (Heuristic h : rules[rulesId[ruleId]]) { - result.add(h.transformWord(s) + "|" + grammarInfo[h.getFormMorphInfo()]); + result.add(h.transformWord(s).append("|").append(grammarInfo[h.getFormMorphInfo()]).toString()); } return result; } @@ -192,10 +192,10 @@ public class MorphologyImpl implements Morphology { } protected String revertWord(String s) { - String result = ""; + StringBuilder result = new StringBuilder(); for (int i = 1; i <= s.length(); i++) { - result += s.charAt(s.length() - i); + result.append(s.charAt(s.length() - i)); } - return result; + return result.toString(); } } diff --git a/russian/src/main/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoder.java b/russian/src/main/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoder.java index e73b8d7..75b6bf7 100644 --- a/russian/src/main/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoder.java +++ b/russian/src/main/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoder.java @@ -21,6 +21,7 @@ import org.apache.lucene.morphology.SuffixToLongException; import org.apache.lucene.morphology.WrongCharaterException; import java.util.ArrayList; +import java.util.LinkedList; /** * This helper class allow encode suffix of russian word @@ -57,7 +58,7 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder { } public int[] encodeToArray(String s) { - ArrayList integers = new ArrayList(); + LinkedList integers = new LinkedList(); while (s.length() > WORD_PART_LENGHT) { integers.add(encode(s.substring(0, WORD_PART_LENGHT))); s = s.substring(WORD_PART_LENGHT);