diff --git a/dictionary-reader/pom.xml b/dictionary-reader/pom.xml index 400b6b9..153e720 100644 --- a/dictionary-reader/pom.xml +++ b/dictionary-reader/pom.xml @@ -1,5 +1,6 @@ - + morphology org.apache.lucene.morphology @@ -26,4 +27,6 @@ 0.9-SNAPSHOT + + diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictionaryReader.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictionaryReader.java index b22528b..280beda 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictionaryReader.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictionaryReader.java @@ -34,16 +34,19 @@ public class DictionaryReader { private List> wordsFlexias = new ArrayList>(); private List> wordPrefixes = new ArrayList>(); private Set ignoredForm = new HashSet(); + private List filters = new ArrayList(); - public DictionaryReader(String fileName, Set ignoredForm) { + public DictionaryReader(String fileName, Set ignoredForm, List filters) { this.fileName = fileName; this.ignoredForm = ignoredForm; + this.filters = filters; } - public DictionaryReader(String fileName, String fileEncoding, Set ignoredForm) { + public DictionaryReader(String fileName, String fileEncoding, Set ignoredForm, List filters) { this.fileName = fileName; this.fileEncoding = fileEncoding; this.ignoredForm = ignoredForm; + this.filters = filters; } @@ -60,30 +63,46 @@ public class DictionaryReader { private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException { String s = reader.readLine(); int count = Integer.valueOf(s); + int actual = 0; for (int i = 0; i < count; i++) { s = reader.readLine(); if (i % 10000 == 0) System.out.println("Proccess " + i + " wordBase of " + count); - String[] wd = s.split(" "); - String wordBase = wd[0].toLowerCase(); - if (wordBase.startsWith("-")) continue; - wordBase = "#".equals(wordBase) ? "" : wordBase; - List models = wordsFlexias.get(Integer.valueOf(wd[1])); - FlexiaModel flexiaModel = models.get(0); - if (models.size() > 0 && !ignoredForm.contains(flexiaModel.getCode())) { - - WordCard card = new WordCard(flexiaModel.create(wordBase), wordBase, flexiaModel.getSuffix()); - for (FlexiaModel fm : models) { - card.addFlexia(fm); - } -// if(card.getBase().equals("face") || card.getBase().equals("fac")){ -// System.out.println(models); -// System.out.println(card); - wordProccessor.process(card); - //} + WordCard card = buildForm(s); + for (WordFilter wf : filters) { + if (card == null) break; + card = wf.transform(card); } + + if (card == null) { + continue; + } + + wordProccessor.process(card); + actual++; + } + System.out.println("Finished word processing actual words " + actual); + } + + private WordCard buildForm(String s) { + String[] wd = s.split(" "); + String wordBase = wd[0].toLowerCase(); + if (wordBase.startsWith("-")) return null; + wordBase = "#".equals(wordBase) ? "" : wordBase; + List models = wordsFlexias.get(Integer.valueOf(wd[1])); + FlexiaModel flexiaModel = models.get(0); + if (models.size() == 0 || ignoredForm.contains(flexiaModel.getCode())) { + return null; + } + + WordCard card = new WordCard(flexiaModel.create(wordBase), wordBase, flexiaModel.getSuffix()); + + for (FlexiaModel fm : models) { + card.addFlexia(fm); + } + return card; } @@ -122,7 +141,7 @@ public class DictionaryReader { String[] fl = line.split("\\*"); // we inored all forms thats if (fl.length == 3) { - System.out.println(line); + //System.out.println(line); // flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), cleanString(fl[2].toLowerCase()))); } if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), "")); diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/FlexiaModel.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/FlexiaModel.java index b4591cf..6906a61 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/FlexiaModel.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/FlexiaModel.java @@ -66,4 +66,26 @@ public class FlexiaModel { ", prefix='" + prefix + '\'' + '}'; } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + FlexiaModel that = (FlexiaModel) o; + + if (code != null ? !code.equals(that.code) : that.code != null) return false; + if (prefix != null ? !prefix.equals(that.prefix) : that.prefix != null) return false; + if (suffix != null ? !suffix.equals(that.suffix) : that.suffix != null) return false; + + return true; + } + + @Override + public int hashCode() { + int result = code != null ? code.hashCode() : 0; + result = 31 * result + (suffix != null ? suffix.hashCode() : 0); + result = 31 * result + (prefix != null ? prefix.hashCode() : 0); + return result; + } } diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/StatisticsCollector.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/StatisticsCollector.java index a803d2c..a9fef93 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/StatisticsCollector.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/StatisticsCollector.java @@ -42,12 +42,8 @@ public class StatisticsCollector implements WordProccessor { public void process(WordCard wordCard) throws IOException { cleanWordCard(wordCard); String normalStringMorph = wordCard.getWordsForms().get(0).getCode(); - String word = wordCard.getBase() + wordCard.getCanonicalSuffix(); - if (word.contains("-")) return; - if (!decoderEncoder.checkString(word)) return; for (FlexiaModel fm : wordCard.getWordsForms()) { - if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) continue; Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph); String form = revertWord(fm.create(wordCard.getBase())); Set suffixHeuristics = inverseIndex.get(form); @@ -138,8 +134,8 @@ public class StatisticsCollector implements WordProccessor { Integer length = getCommonLength(form, normalForm); Integer actualSuffixLengh = form.length() - length; String actualNormalSuffix = normalForm.substring(length); - Integer integer = grammaReader.getGrammInversIndex().get(fm.getCode().substring(0, 2)); - Integer nf = grammaReader.getGrammInversIndex().get(normalSuffixForm.substring(0, 2)); + Integer integer = grammaReader.getGrammInversIndex().get(fm.getCode()); + Integer nf = grammaReader.getGrammInversIndex().get(normalSuffixForm); return new Heuristic((byte) actualSuffixLengh.intValue(), actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue()); } diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCard.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCard.java index 0906acd..7d10229 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCard.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCard.java @@ -38,6 +38,10 @@ public class WordCard { wordsForms.add(flexiaModel); } + public void removeFlexia(FlexiaModel flexiaModel) { + wordsForms.remove(flexiaModel); + } + public String getCanonicalForm() { return canonicalForm; } diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCleaner.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCleaner.java new file mode 100644 index 0000000..1f16bed --- /dev/null +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCleaner.java @@ -0,0 +1,50 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.morphology.dictionary; + +import org.apache.lucene.morphology.LetterDecoderEncoder; + +import java.util.LinkedList; +import java.util.List; + + +public class WordCleaner implements WordFilter { + + private LetterDecoderEncoder decoderEncoder; + + public WordCleaner(LetterDecoderEncoder decoderEncoder) { + this.decoderEncoder = decoderEncoder; + } + + public WordCard transform(WordCard wordCard) { + String word = wordCard.getBase() + wordCard.getCanonicalSuffix(); + + if (word.contains("-")) return null; + if (!decoderEncoder.checkString(word)) return null; + + List flexiaModelsToRemove = new LinkedList(); + for (FlexiaModel fm : wordCard.getWordsForms()) { + if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) { + flexiaModelsToRemove.add(fm); + } + } + for (FlexiaModel fm : flexiaModelsToRemove) { + wordCard.removeFlexia(fm); + } + + return wordCard; + } +} diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordFilter.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordFilter.java new file mode 100644 index 0000000..33ea89b --- /dev/null +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordFilter.java @@ -0,0 +1,24 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.morphology.dictionary; + + +public interface WordFilter { + + public WordCard transform(WordCard wordCard); + +} diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordStringCleaner.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordStringCleaner.java new file mode 100644 index 0000000..12a1eb2 --- /dev/null +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordStringCleaner.java @@ -0,0 +1,49 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.morphology.dictionary; + +import org.apache.lucene.morphology.LetterDecoderEncoder; + +import java.util.List; + + +public class WordStringCleaner implements WordFilter { + + private LetterDecoderEncoder decoderEncoder; + + public WordStringCleaner(LetterDecoderEncoder decoderEncoder) { + this.decoderEncoder = decoderEncoder; + } + + public WordCard transform(WordCard wordCard) { + wordCard.setBase(cleanString(wordCard.getBase())); + wordCard.setCanonicalForm(cleanString(wordCard.getCanonicalForm())); + wordCard.setCanonicalSuffix(cleanString(wordCard.getCanonicalSuffix())); + List models = wordCard.getWordsForms(); + for (FlexiaModel m : models) { + m.setSuffix(cleanString(m.getSuffix())); + m.setPrefix(cleanString(m.getPrefix())); + //made correct code + m.setCode(m.getCode().substring(0, 2)); + } + return wordCard; + } + + + private String cleanString(String s) { + return decoderEncoder.cleanString(s); + } +} diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/EnglishHeuristicBuilder.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/EnglishHeuristicBuilder.java index 63c60f2..6cd4b9b 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/EnglishHeuristicBuilder.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/EnglishHeuristicBuilder.java @@ -16,22 +16,24 @@ package org.apache.lucene.morphology.generator; -import org.apache.lucene.morphology.dictionary.DictionaryReader; -import org.apache.lucene.morphology.dictionary.GrammaReader; -import org.apache.lucene.morphology.dictionary.StatisticsCollector; -import org.apache.lucene.morphology.english.EnglishLetterDecoderEncoder; +import org.apache.lucene.morphology.EnglishLetterDecoderEncoder; +import org.apache.lucene.morphology.dictionary.*; import java.io.IOException; +import java.util.Arrays; import java.util.HashSet; +import java.util.List; public class EnglishHeuristicBuilder { public static void main(String[] args) throws IOException { GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/egramtab.tab"); - DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet()); - EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder(); + List filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder)); + + DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet(), filters); + StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder); dictionaryReader.proccess(statisticsCollector); statisticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info"); diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianHeuristicBuilder.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianHeuristicBuilder.java index 2b896c9..aadbde2 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianHeuristicBuilder.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianHeuristicBuilder.java @@ -16,21 +16,23 @@ package org.apache.lucene.morphology.generator; -import org.apache.lucene.morphology.dictionary.DictionaryReader; -import org.apache.lucene.morphology.dictionary.GrammaReader; -import org.apache.lucene.morphology.dictionary.StatisticsCollector; +import org.apache.lucene.morphology.dictionary.*; import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder; import java.io.IOException; +import java.util.Arrays; import java.util.HashSet; +import java.util.List; public class RussianHeuristicBuilder { public static void main(String[] args) throws IOException { GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab"); - DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet()); - RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder(); + List filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder)); + + DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet(), filters); + StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder); dictionaryReader.proccess(statisticsCollector); statisticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info"); diff --git a/dictionary-reader/src/test/java/org/apache/lucene/TestAllWords.java b/dictionary-reader/src/test/java/org/apache/lucene/TestAllWords.java new file mode 100644 index 0000000..54c64fa --- /dev/null +++ b/dictionary-reader/src/test/java/org/apache/lucene/TestAllWords.java @@ -0,0 +1,144 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene; + +import org.apache.lucene.morphology.*; +import org.apache.lucene.morphology.dictionary.*; +import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder; +import org.apache.lucene.morphology.russian.RussianLuceneMorphology; +import org.apache.lucene.morphology.russian.RussianMorphology; +import org.junit.Before; +import org.junit.Test; + +import java.io.IOException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicLong; + +import static org.hamcrest.Matchers.hasItem; +import static org.junit.Assert.assertThat; + + +public class TestAllWords { + + String prefix = ""; + + @Before + public void setUp() { + System.out.println(System.getProperty("user.dir")); + prefix = System.getProperty("user.dir").endsWith("dictionary-reader") ? "../" : ""; + + } + + @Test + public void shouldEnglishMorphologyIncludeAllWordsFormsWithMorphInfo() throws IOException { + final Morphology morphology = new EnglishMorphology(); + LetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder(); + String pathToGramma = prefix + "dictonary/Dicts/Morph/egramtab.tab"; + String pathToDict = prefix + "dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd"; + + testFullGramma(morphology, decoderEncoder, pathToGramma, pathToDict); + + } + + @Test + public void shouldRussianMorphologyIncludeAllWordsFormsWithMorphInfo() throws IOException { + final Morphology morphology = new RussianMorphology(); + LetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder(); + String pathToGramma = prefix + "dictonary/Dicts/Morph/rgramtab.tab"; + String pathToDict = prefix + "dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd"; + + testFullGramma(morphology, decoderEncoder, pathToGramma, pathToDict); + } + + private void testFullGramma(final Morphology morphology, LetterDecoderEncoder decoderEncoder, String pathToGramma, String pathToDict) throws IOException { + GrammaReader grammaInfo = new GrammaReader(pathToGramma); + final List morphInfo = grammaInfo.getGrammaInfo(); + final Map inversIndex = grammaInfo.getGrammInversIndex(); + + List filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder)); + + + DictionaryReader dictionaryReader = new DictionaryReader(pathToDict, new HashSet(), filters); + + final AtomicLong wordCount = new AtomicLong(0); + Long startTime = System.currentTimeMillis(); + + dictionaryReader.proccess(new WordProccessor() { + public void process(WordCard wordCard) throws IOException { + String word = wordCard.getBase() + wordCard.getCanonicalSuffix(); + for (FlexiaModel fm : wordCard.getWordsForms()) { + String wordForm = wordCard.getBase() + fm.getSuffix(); + String morph = morphInfo.get(inversIndex.get(fm.getCode())); + assertThat(morphology.getMorphInfo(wordForm), hasItem(word + "|" + morph)); + assertThat(morphology.getNormalForms(wordForm), hasItem(word)); + wordCount.set(2L + wordCount.get()); + } + } + }); + + long time = System.currentTimeMillis() - startTime; + System.out.println("Done " + wordCount.get() + " in " + time + " ms. " + wordCount.get() / (time / 1000L) + " word per second"); + } + + @Test + public void shouldEnglishLuceneMorphologyIncludeAllWords() throws IOException { + final LuceneMorphology morphology = new EnglishLuceneMorphology(); + + LetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder(); + List filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder)); + String pathToDic = prefix + "dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd"; + + testAllWordForLucene(morphology, filters, pathToDic); + } + + @Test + public void shouldIncludeAllWordsRussianInLuceneMorophology() throws IOException { + final LuceneMorphology morphology = new RussianLuceneMorphology(); + + LetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder(); + List filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder)); + + String pathToDic = prefix + "dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd"; + + testAllWordForLucene(morphology, filters, pathToDic); + + } + + private void testAllWordForLucene(final LuceneMorphology morphology, List filters, String pathToDic) throws IOException { + final AtomicLong wordCount = new AtomicLong(0); + Long startTime = System.currentTimeMillis(); + + DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet(), filters); + dictionaryReader.proccess(new WordProccessor() { + public void process(WordCard wordCard) throws IOException { + String word = wordCard.getBase() + wordCard.getCanonicalSuffix(); + for (FlexiaModel fm : wordCard.getWordsForms()) { + String wordForm = wordCard.getBase() + fm.getSuffix(); + assertThat(morphology.getNormalForms(wordForm), hasItem(word)); + wordCount.set(1L + wordCount.get()); + } + } + }); + + long time = System.currentTimeMillis() - startTime; + System.out.println("Done " + wordCount.get() + " in " + time + " ms. " + wordCount.get() / (time / 1000L) + " word per second"); + } + + +} diff --git a/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalayzersTest.java b/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalayzersTest.java new file mode 100644 index 0000000..766de9c --- /dev/null +++ b/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalayzersTest.java @@ -0,0 +1,77 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.morphology; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.morphology.russian.RussianAnalyzer; +import org.junit.Test; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.Arrays; +import java.util.HashSet; + +import static org.hamcrest.Matchers.equalTo; +import static org.junit.Assert.assertThat; + + +public class AnalayzersTest { + + @Test + public void englishAnalyzerShouldGiveCorrectWords() throws IOException { + Analyzer morphlogyAnalyzer = new EnglishAnalyzer(); + String answerPath = "/english/englsih-analayzer-answer.txt"; + String testPath = "/english/englsih-analayzer-data.txt"; + + testAnalayzer(morphlogyAnalyzer, answerPath, testPath); + } + + @Test + public void shoudGiveCorretWords() throws IOException { + Analyzer morphlogyAnalyzer = new RussianAnalyzer(); + String answerPath = "/russian/russian-analayzer-answer.txt"; + String testPath = "/russian/russian-analayzer-data.txt"; + + testAnalayzer(morphlogyAnalyzer, answerPath, testPath); + } + + private void testAnalayzer(Analyzer morphlogyAnalyzer, String answerPath, String testPath) throws IOException { + InputStream stream = this.getClass().getResourceAsStream(answerPath); + BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); + String[] strings = breader.readLine().replaceAll(" +", " ").trim().split(" "); + HashSet answer = new HashSet(Arrays.asList(strings)); + stream.close(); + + stream = this.getClass().getResourceAsStream(testPath); + + InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); + + TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader); + HashSet result = new HashSet(); + while (tokenStream.incrementToken()) { + TermAttribute attribute1 = tokenStream.getAttribute(TermAttribute.class); + result.add(attribute1.term()); + } + + stream.close(); + + assertThat(result, equalTo(answer)); + } +} diff --git a/dictionary-reader/src/test/java/org/apache/lucene/morphology/LuceneMorphTest.java b/dictionary-reader/src/test/java/org/apache/lucene/morphology/LuceneMorphTest.java new file mode 100644 index 0000000..eb91c2c --- /dev/null +++ b/dictionary-reader/src/test/java/org/apache/lucene/morphology/LuceneMorphTest.java @@ -0,0 +1,62 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.morphology; + +import org.apache.lucene.morphology.russian.RussianLuceneMorphology; +import org.junit.Test; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import static org.hamcrest.CoreMatchers.equalTo; +import static org.junit.Assert.assertThat; + + +public class LuceneMorphTest { + + @Test + public void englishMorphologyShouldGetCorrectNormalForm() throws IOException { + LuceneMorphology luceneMorph = new EnglishLuceneMorphology(); + String pathToTestData = "/english/english-morphology-test.txt"; + testMorphology(luceneMorph, pathToTestData); + } + + @Test + public void russianMorphologyShouldGetCorrectNormalForm() throws IOException { + LuceneMorphology luceneMorph = new RussianLuceneMorphology(); + String pathToTestData = "/russian/russian-morphology-test.txt"; + testMorphology(luceneMorph, pathToTestData); + } + + private void testMorphology(LuceneMorphology luceneMorph, String pathToTestData) throws IOException { + InputStream stream = this.getClass().getResourceAsStream(pathToTestData); + BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); + String s = bufferedReader.readLine(); + while (s != null) { + String[] qa = s.trim().split(" "); + Set result = new HashSet(); + result.addAll(Arrays.asList(qa).subList(1, qa.length)); + Set stringList = new HashSet(luceneMorph.getNormalForms(qa[0])); + assertThat(stringList, equalTo(result)); + s = bufferedReader.readLine(); + } + } +} diff --git a/dictionary-reader/src/test/resources/english/english-morphology-test.txt b/dictionary-reader/src/test/resources/english/english-morphology-test.txt new file mode 100644 index 0000000..c65e765 --- /dev/null +++ b/dictionary-reader/src/test/resources/english/english-morphology-test.txt @@ -0,0 +1,8 @@ +purchases purchase +existing exist +was be +men man +bore bore bear +grown grow grown +came come +md md \ No newline at end of file diff --git a/dictionary-reader/src/test/resources/english/englsih-analayzer-answer.txt b/dictionary-reader/src/test/resources/english/englsih-analayzer-answer.txt new file mode 100644 index 0000000..cffa6be --- /dev/null +++ b/dictionary-reader/src/test/resources/english/englsih-analayzer-answer.txt @@ -0,0 +1 @@ +following follow the instruction exactly will be help ensure the best well good result \ No newline at end of file diff --git a/dictionary-reader/src/test/resources/english/englsih-analayzer-data.txt b/dictionary-reader/src/test/resources/english/englsih-analayzer-data.txt new file mode 100644 index 0000000..5c203f8 --- /dev/null +++ b/dictionary-reader/src/test/resources/english/englsih-analayzer-data.txt @@ -0,0 +1 @@ +Following the instructions exactly will help ensure the best results \ No newline at end of file diff --git a/dictionary-reader/src/test/resources/russian/russian-analayzer-answer.txt b/dictionary-reader/src/test/resources/russian/russian-analayzer-answer.txt new file mode 100644 index 0000000..44b1843 --- /dev/null +++ b/dictionary-reader/src/test/resources/russian/russian-analayzer-answer.txt @@ -0,0 +1 @@ +в результат крушение погибнуть командир отряд специальный назначение пря при переть гувд ростовский область полковник милиция михаил перов и предприниматель \ No newline at end of file diff --git a/dictionary-reader/src/test/resources/russian/russian-analayzer-data.txt b/dictionary-reader/src/test/resources/russian/russian-analayzer-data.txt new file mode 100644 index 0000000..c97b5e9 --- /dev/null +++ b/dictionary-reader/src/test/resources/russian/russian-analayzer-data.txt @@ -0,0 +1 @@ +В результате крушения погибли командир отряда специального назначения при ГУВД Ростовской области полковник милиции Михаил Перов и предприниматель \ No newline at end of file diff --git a/dictionary-reader/src/test/resources/russian/russian-morphology-test.txt b/dictionary-reader/src/test/resources/russian/russian-morphology-test.txt new file mode 100644 index 0000000..c775e7d --- /dev/null +++ b/dictionary-reader/src/test/resources/russian/russian-morphology-test.txt @@ -0,0 +1,19 @@ +еду еда ехать +тестов тест +вина вино вина +вино вино +ехать ехать +ананасов ананас ананасовый +сухой сухой +дураков дурак +пушка пушка пушок +пушок пушок +пушек пушка +козлов козлов козловый козел +жуков жуков жук +красив красить красивый +красивая красивый +тосклив тоскливый +лучший хороший +на на +тест тест тесто \ No newline at end of file diff --git a/english/pom.xml b/english/pom.xml index 155c499..92183b4 100644 --- a/english/pom.xml +++ b/english/pom.xml @@ -1,5 +1,6 @@ - + morphology org.apache.lucene.morphology @@ -12,6 +13,7 @@ 0.9-SNAPSHOT http://maven.apache.org + org.apache.lucene.morphology morph diff --git a/english/src/main/java/org/apache/lucene/morphology/EnglishAnalyzer.java b/english/src/main/java/org/apache/lucene/morphology/EnglishAnalyzer.java new file mode 100644 index 0000000..50f40da --- /dev/null +++ b/english/src/main/java/org/apache/lucene/morphology/EnglishAnalyzer.java @@ -0,0 +1,29 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.morphology; + +import org.apache.lucene.morphology.analyzer.MorphologyAnalyzer; + +import java.io.IOException; + + +public class EnglishAnalyzer extends MorphologyAnalyzer { + + public EnglishAnalyzer() throws IOException { + super(new EnglishLuceneMorphology()); + } + +} \ No newline at end of file diff --git a/english/src/main/java/org/apache/lucene/morphology/EnglishLetterDecoderEncoder.java b/english/src/main/java/org/apache/lucene/morphology/EnglishLetterDecoderEncoder.java new file mode 100644 index 0000000..c808b72 --- /dev/null +++ b/english/src/main/java/org/apache/lucene/morphology/EnglishLetterDecoderEncoder.java @@ -0,0 +1,111 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.morphology; + +import java.util.ArrayList; + + +public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder { + public static final int ENGLISH_SMALL_LETTER_OFFSET = 96; + static public int SUFFIX_LENGTH = 6; + public static final int DASH_CHAR = 45; + public static final int DASH_CODE = 27; + + public Integer encode(String string) { + if (string.length() > 6) throw new SuffixToLongException("Suffix length should not be greater then " + 12); + int result = 0; + for (int i = 0; i < string.length(); i++) { + int c = 0 + string.charAt(i) - ENGLISH_SMALL_LETTER_OFFSET; + if (c == 45 - ENGLISH_SMALL_LETTER_OFFSET) { + c = DASH_CODE; + } + if (c < 0 || c > 27) + throw new WrongCharaterException("Symblo " + string.charAt(i) + " is not small cirillic letter"); + result = result * 28 + c; + } + for (int i = string.length(); i < 6; i++) { + result *= 28; + } + return result; + } + + public int[] encodeToArray(String s) { + + ArrayList integers = new ArrayList(); + while (s.length() > 6) { + integers.add(encode(s.substring(0, 6))); + s = s.substring(6); + } + integers.add(encode(s)); + int[] ints = new int[integers.size()]; + int pos = 0; + for (Integer i : integers) { + ints[pos] = i; + pos++; + } + return ints; + } + + public String decodeArray(int[] array) { + String result = ""; + for (int i : array) { + result += decode(i); + } + return result; + } + + + public String decode(Integer suffixN) { + String result = ""; + while (suffixN > 27) { + int c = suffixN % 28 + ENGLISH_SMALL_LETTER_OFFSET; + if (c == ENGLISH_SMALL_LETTER_OFFSET) { + suffixN /= 28; + continue; + } + if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR; + result = (char) c + result; + suffixN /= 28; + } + long c = suffixN + ENGLISH_SMALL_LETTER_OFFSET; + if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR; + result = (char) c + result; + return result; + } + + public boolean checkCharacter(char c) { + int code = 0 + c; + if (code == 45) return true; + code -= ENGLISH_SMALL_LETTER_OFFSET; + if (code > 0 && code < 27) return true; + return false; + } + + + public boolean checkString(String word) { + for (int i = 0; i < word.length(); i++) { + if (!checkCharacter(word.charAt(i))) { + return false; + } + } + return true; + } + + public String cleanString(String s) { + return s; + } + +} diff --git a/english/src/main/java/org/apache/lucene/morphology/EnglishLuceneMorphology.java b/english/src/main/java/org/apache/lucene/morphology/EnglishLuceneMorphology.java new file mode 100644 index 0000000..b2c3f31 --- /dev/null +++ b/english/src/main/java/org/apache/lucene/morphology/EnglishLuceneMorphology.java @@ -0,0 +1,26 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.morphology; + +import java.io.IOException; + + +public class EnglishLuceneMorphology extends LuceneMorphology { + + public EnglishLuceneMorphology() throws IOException { + super(EnglishLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder()); + } +} \ No newline at end of file diff --git a/english/src/main/java/org/apache/lucene/morphology/EnglishMorphology.java b/english/src/main/java/org/apache/lucene/morphology/EnglishMorphology.java new file mode 100644 index 0000000..e3e9e29 --- /dev/null +++ b/english/src/main/java/org/apache/lucene/morphology/EnglishMorphology.java @@ -0,0 +1,26 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.morphology; + +import java.io.IOException; + + +public class EnglishMorphology extends MorphologyImpl { + + public EnglishMorphology() throws IOException { + super(EnglishLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder()); + } +} diff --git a/russian/pom.xml b/russian/pom.xml index 03789c2..a6820f3 100644 --- a/russian/pom.xml +++ b/russian/pom.xml @@ -1,5 +1,6 @@ - + morphology org.apache.lucene.morphology @@ -13,6 +14,7 @@ http://maven.apache.org + org.apache.lucene.morphology morph diff --git a/russian/src/test/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoderTest.java b/russian/src/test/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoderTest.java index de67e2a..73a2d38 100644 --- a/russian/src/test/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoderTest.java +++ b/russian/src/test/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoderTest.java @@ -17,8 +17,6 @@ package org.apache.lucene.morphology.russian; import org.apache.lucene.morphology.SuffixToLongException; import org.apache.lucene.morphology.WrongCharaterException; -import static org.hamcrest.core.IsEqual.equalTo; -import static org.junit.Assert.assertThat; import org.junit.Before; import org.junit.Test; @@ -27,6 +25,9 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import static org.hamcrest.core.IsEqual.equalTo; +import static org.junit.Assert.assertThat; + public class RussianLetterDecoderEncoderTest { private RussianLetterDecoderEncoder decoderEncoder; @@ -37,7 +38,7 @@ public class RussianLetterDecoderEncoderTest { @Test - public void testShouldPreserStringComporision() throws IOException { + public void testShouldPreserverStringComporision() throws IOException { InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-monotonic.txt"); BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); String s = bufferedReader.readLine(); @@ -52,22 +53,22 @@ public class RussianLetterDecoderEncoderTest { @Test - public void testShouldCorretDecodeEncode() throws IOException { + public void testShouldCorrectDecodeEncode() throws IOException { InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data.txt"); BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); String s = bufferedReader.readLine(); while (s != null) { String[] qa = s.trim().split(" "); if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) { - Integer ecodedSuffix = decoderEncoder.encode(qa[0]); - assertThat(decoderEncoder.decode(ecodedSuffix), equalTo(qa[1])); + Integer encodedSuffix = decoderEncoder.encode(qa[0]); + assertThat(decoderEncoder.decode(encodedSuffix), equalTo(qa[1])); } s = bufferedReader.readLine(); } } @Test - public void testShouldCorretDecodeEncodeStringToArray() throws IOException { + public void testShouldCorrectDecodeEncodeStringToArray() throws IOException { InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data-for-array.txt"); BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); String s = bufferedReader.readLine(); @@ -85,7 +86,7 @@ public class RussianLetterDecoderEncoderTest { } @Test(expected = WrongCharaterException.class) - public void shouldThrownExeptionIfSuffixContainWrongCharater() { + public void shouldThrownExceptionIfSuffixContainWrongCharater() { decoderEncoder.encode("1"); } }