diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictionaryReader.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictionaryReader.java index 9197ddf..37a4794 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictionaryReader.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictionaryReader.java @@ -21,7 +21,10 @@ import java.io.BufferedReader; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; -import java.util.*; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; /** @@ -32,14 +35,11 @@ public class DictionaryReader { private String fileName; private String fileEncoding = "windows-1251"; private List> wordsFlexias = new ArrayList>(); - private List> wordPrefixes = new ArrayList>(); private Set ignoredForm = new HashSet(); - private List filters = new ArrayList(); - public DictionaryReader(String fileName, Set ignoredForm, List filters) { + public DictionaryReader(String fileName, Set ignoredForm) { this.fileName = fileName; this.ignoredForm = ignoredForm; - this.filters = filters; } @@ -63,11 +63,6 @@ public class DictionaryReader { WordCard card = buildForm(s); - for (WordFilter wf : filters) { - if (card == null) break; - card = wf.transform(card); - } - if (card == null) { continue; } @@ -112,8 +107,7 @@ public class DictionaryReader { String s = reader.readLine(); int count = Integer.valueOf(s); for (int i = 0; i < count; i++) { - s = reader.readLine(); - wordPrefixes.add(Arrays.asList(s.toLowerCase().split(","))); + reader.readLine(); } } @@ -135,7 +129,7 @@ public class DictionaryReader { // we inored all forms thats if (fl.length == 3) { //System.out.println(line); - // flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), cleanString(fl[2].toLowerCase()))); + flexiaModelArrayList.add(new 
FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase())); } if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), "")); } diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/RemoveFlexiaWithPrefixes.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/RemoveFlexiaWithPrefixes.java new file mode 100644 index 0000000..612896f --- /dev/null +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/RemoveFlexiaWithPrefixes.java @@ -0,0 +1,44 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.morphology.dictionary; + +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; + + +public class RemoveFlexiaWithPrefixes extends WordFilter { + + public RemoveFlexiaWithPrefixes(WordProcessor wordProcessor) { + super(wordProcessor); + } + + @Override + public List transform(WordCard wordCard) { + + List flexiaModelsToRemove = new LinkedList(); + for (FlexiaModel fm : wordCard.getWordsForms()) { + if (fm.getPrefix().length() > 0) { + flexiaModelsToRemove.add(fm); + } + } + for (FlexiaModel fm : flexiaModelsToRemove) { + wordCard.removeFlexia(fm); + } + + return new LinkedList(Arrays.asList(wordCard)); + } +}
diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/RussianAdvSplitterFilter.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/RussianAdvSplitterFilter.java
new file mode 100644
index 0000000..6818190
--- /dev/null
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/RussianAdvSplitterFilter.java
@@ -0,0 +1,91 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.morphology.dictionary;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.LinkedList;
+import java.util.List;
+
+
+/**
+ * Word filter that forwards every word card unchanged and, when one of the card's prefixed
+ * flexia models carries the code read from {@code /russian-adv-main-code.txt}, additionally
+ * emits a second card built from that prefix joined with the original base.
+ */
+public class RussianAdvSplitterFilter extends WordFilter {
+    private static final String CODE_RESOURCE = "/russian-adv-main-code.txt";
+
+    private String code;
+
+    /**
+     * Reads the code to match from the first line of the classpath resource.
+     *
+     * @param wordProcessor processor handed to the {@link WordFilter} superclass
+     * @throws IOException if the resource is missing or cannot be read
+     */
+    public RussianAdvSplitterFilter(WordProcessor wordProcessor) throws IOException {
+        super(wordProcessor);
+        InputStream stream = getClass().getResourceAsStream(CODE_RESOURCE);
+        if (stream == null) {
+            throw new IOException("Classpath resource not found: " + CODE_RESOURCE);
+        }
+        // Close the reader (and the underlying stream) once the single line has been read.
+        BufferedReader reader = new BufferedReader(new InputStreamReader(stream, "windows-1251"));
+        try {
+            code = reader.readLine();
+        } finally {
+            reader.close();
+        }
+    }
+
+    /**
+     * Returns the original card, followed by the derived prefixed card when one was found.
+     *
+     * @param wordCard card whose flexia models are inspected
+     * @return list holding {@code wordCard} and, optionally, the derived card
+     */
+    @Override
+    public List<WordCard> transform(WordCard wordCard) {
+        LinkedList<WordCard> result = new LinkedList<WordCard>();
+        result.add(wordCard);
+
+        String baseWord = "";
+        String canonicalForm = "";
+        String canonicalSuffix = "";
+        List<FlexiaModel> flexiaModels = new LinkedList<FlexiaModel>();
+        for (FlexiaModel flexiaModel : wordCard.getWordsForms()) {
+            if (flexiaModel.getPrefix().length() > 0) {
+                flexiaModels.add(new FlexiaModel(flexiaModel.getCode(), flexiaModel.getSuffix(), ""));
+            }
+            if (flexiaModel.getPrefix().length() > 0 && flexiaModel.getCode().equals(code)) {
+                baseWord = flexiaModel.getPrefix() + wordCard.getBase();
+                canonicalForm = flexiaModel.getCode();
+                canonicalSuffix = flexiaModel.getSuffix();
+            }
+        }
+
+        if (baseWord.length() > 0) {
+            WordCard wc = new WordCard(canonicalForm, baseWord, canonicalSuffix);
+            wc.setWordsForms(flexiaModels);
+            result.add(wc);
+        }
+
+        return result;
+    }
+}
diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCleaner.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCleaner.java index 1f16bed..4e6ae45 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCleaner.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCleaner.java @@ -17,23 +17,26 @@ package org.apache.lucene.morphology.dictionary; import org.apache.lucene.morphology.LetterDecoderEncoder; +import java.util.Arrays; 
+import java.util.Collections; import java.util.LinkedList; import java.util.List; -public class WordCleaner implements WordFilter { +public class WordCleaner extends WordFilter { private LetterDecoderEncoder decoderEncoder; - public WordCleaner(LetterDecoderEncoder decoderEncoder) { + public WordCleaner(LetterDecoderEncoder decoderEncoder, WordProcessor wordProcessor) { + super(wordProcessor); this.decoderEncoder = decoderEncoder; } - public WordCard transform(WordCard wordCard) { + public List transform(WordCard wordCard) { String word = wordCard.getBase() + wordCard.getCanonicalSuffix(); - if (word.contains("-")) return null; - if (!decoderEncoder.checkString(word)) return null; + if (word.contains("-")) return Collections.emptyList(); + if (!decoderEncoder.checkString(word)) return Collections.emptyList(); List flexiaModelsToRemove = new LinkedList(); for (FlexiaModel fm : wordCard.getWordsForms()) { @@ -45,6 +48,6 @@ public class WordCleaner implements WordFilter { wordCard.removeFlexia(fm); } - return wordCard; + return new LinkedList(Arrays.asList(wordCard)); } } diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordFilter.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordFilter.java index 33ea89b..b18a903 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordFilter.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordFilter.java @@ -16,9 +16,22 @@ package org.apache.lucene.morphology.dictionary; +import java.io.IOException; +import java.util.List; -public interface WordFilter { - public WordCard transform(WordCard wordCard); +abstract public class WordFilter implements WordProcessor { + private WordProcessor wordProcessor; + public WordFilter(WordProcessor wordProcessor) { + this.wordProcessor = wordProcessor; + } + + abstract public List transform(WordCard wordCard); + + public void process(WordCard wordCard) throws 
IOException { + for (WordCard wc : transform(wordCard)) { + wordProcessor.process(wc); + } + } } diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordStringCleaner.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordStringCleaner.java index 12a1eb2..d6f2c2e 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordStringCleaner.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordStringCleaner.java @@ -17,18 +17,21 @@ package org.apache.lucene.morphology.dictionary; import org.apache.lucene.morphology.LetterDecoderEncoder; +import java.util.Arrays; +import java.util.LinkedList; import java.util.List; -public class WordStringCleaner implements WordFilter { +public class WordStringCleaner extends WordFilter { private LetterDecoderEncoder decoderEncoder; - public WordStringCleaner(LetterDecoderEncoder decoderEncoder) { + public WordStringCleaner(LetterDecoderEncoder decoderEncoder, WordProcessor wordProcessor) { + super(wordProcessor); this.decoderEncoder = decoderEncoder; } - public WordCard transform(WordCard wordCard) { + public List transform(WordCard wordCard) { wordCard.setBase(cleanString(wordCard.getBase())); wordCard.setCanonicalForm(cleanString(wordCard.getCanonicalForm())); wordCard.setCanonicalSuffix(cleanString(wordCard.getCanonicalSuffix())); @@ -39,7 +42,7 @@ public class WordStringCleaner implements WordFilter { //made correct code m.setCode(m.getCode().substring(0, 2)); } - return wordCard; + return new LinkedList(Arrays.asList(wordCard)); } diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/EnglishHeuristicBuilder.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/EnglishHeuristicBuilder.java index 49ceeb4..f8d9709 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/EnglishHeuristicBuilder.java +++ 
b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/EnglishHeuristicBuilder.java @@ -20,9 +20,7 @@ import org.apache.lucene.morphology.EnglishLetterDecoderEncoder; import org.apache.lucene.morphology.dictionary.*; import java.io.IOException; -import java.util.Arrays; import java.util.HashSet; -import java.util.List; public class EnglishHeuristicBuilder { @@ -30,12 +28,14 @@ public class EnglishHeuristicBuilder { GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/egramtab.tab"); EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder(); - List filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder)); - DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet(), filters); + DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet()); StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder); - dictionaryReader.process(statisticsCollector); + WordCleaner wordCleaner = new WordCleaner(decoderEncoder, statisticsCollector); + WordStringCleaner wordStringCleaner = new WordStringCleaner(decoderEncoder, wordCleaner); + RemoveFlexiaWithPrefixes removeFlexiaWithPrefixes = new RemoveFlexiaWithPrefixes(wordStringCleaner); + dictionaryReader.process(removeFlexiaWithPrefixes); statisticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info"); } diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianHeuristicBuilder.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianHeuristicBuilder.java index 2dfa8af..9c68d88 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianHeuristicBuilder.java +++ 
b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianHeuristicBuilder.java @@ -20,21 +20,22 @@ import org.apache.lucene.morphology.dictionary.*; import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder; import java.io.IOException; -import java.util.Arrays; import java.util.HashSet; -import java.util.List; public class RussianHeuristicBuilder { public static void main(String[] args) throws IOException { GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/rgramtab.tab"); RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder(); - List filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder)); - DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet(), filters); + DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet()); StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder); - dictionaryReader.process(statisticsCollector); + WordCleaner wordCleaner = new WordCleaner(decoderEncoder, statisticsCollector); + WordStringCleaner wordStringCleaner = new WordStringCleaner(decoderEncoder, wordCleaner); + RemoveFlexiaWithPrefixes removeFlexiaWithPrefixes = new RemoveFlexiaWithPrefixes(wordStringCleaner); + RussianAdvSplitterFilter russianAdvSplitterFilter = new RussianAdvSplitterFilter(removeFlexiaWithPrefixes); + dictionaryReader.process(russianAdvSplitterFilter); statisticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info"); } diff --git a/dictionary-reader/src/main/resources/russian-adv-main-code.txt b/dictionary-reader/src/main/resources/russian-adv-main-code.txt new file mode 100644 index 0000000..8b17ac1 --- /dev/null +++ b/dictionary-reader/src/main/resources/russian-adv-main-code.txt @@ -0,0 +1 @@ + \ No newline at end of file diff --git 
a/dictionary-reader/src/test/java/org/apache/lucene/TestAllWords.java b/dictionary-reader/src/test/java/org/apache/lucene/TestAllWords.java index e5a02a5..bf7bc89 100644 --- a/dictionary-reader/src/test/java/org/apache/lucene/TestAllWords.java +++ b/dictionary-reader/src/test/java/org/apache/lucene/TestAllWords.java @@ -24,7 +24,6 @@ import org.junit.Before; import org.junit.Test; import java.io.IOException; -import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -71,15 +70,12 @@ public class TestAllWords { final List morphInfo = grammarInfo.getGrammarInfo(); final Map inversIndex = grammarInfo.getGrammarInverseIndex(); - List filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder)); - - - DictionaryReader dictionaryReader = new DictionaryReader(pathToDict, new HashSet(), filters); + DictionaryReader dictionaryReader = new DictionaryReader(pathToDict, new HashSet()); final AtomicLong wordCount = new AtomicLong(0); Long startTime = System.currentTimeMillis(); - dictionaryReader.process(new WordProcessor() { + WordProcessor wordProcessor = new WordProcessor() { public void process(WordCard wordCard) throws IOException { String word = wordCard.getBase() + wordCard.getCanonicalSuffix(); for (FlexiaModel fm : wordCard.getWordsForms()) { @@ -90,7 +86,12 @@ public class TestAllWords { wordCount.set(2L + wordCount.get()); } } - }); + }; + + WordCleaner wordCleaner = new WordCleaner(decoderEncoder, wordProcessor); + WordStringCleaner wordStringCleaner = new WordStringCleaner(decoderEncoder, wordCleaner); + RemoveFlexiaWithPrefixes removeFlexiaWithPrefixes = new RemoveFlexiaWithPrefixes(wordStringCleaner); + dictionaryReader.process(removeFlexiaWithPrefixes); long time = System.currentTimeMillis() - startTime; System.out.println("Done " + wordCount.get() + " in " + time + " ms. 
" + wordCount.get() / (time / 1000L) + " word per second"); @@ -101,10 +102,9 @@ public class TestAllWords { final LuceneMorphology morphology = new EnglishLuceneMorphology(); LetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder(); - List filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder)); String pathToDic = prefix + "dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd"; - testAllWordForLucene(morphology, filters, pathToDic); + testAllWordForLucene(morphology, decoderEncoder, pathToDic); } @Test @@ -112,20 +112,19 @@ public class TestAllWords { final LuceneMorphology morphology = new RussianLuceneMorphology(); LetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder(); - List filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder)); String pathToDic = prefix + "dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd"; - testAllWordForLucene(morphology, filters, pathToDic); + testAllWordForLucene(morphology, decoderEncoder, pathToDic); } - private void testAllWordForLucene(final LuceneMorphology morphology, List filters, String pathToDic) throws IOException { + private void testAllWordForLucene(final LuceneMorphology morphology, LetterDecoderEncoder decoderEncoder, String pathToDic) throws IOException { final AtomicLong wordCount = new AtomicLong(0); Long startTime = System.currentTimeMillis(); - DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet(), filters); - dictionaryReader.process(new WordProcessor() { + DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet()); + WordProcessor wordProcessor = new WordProcessor() { public void process(WordCard wordCard) throws IOException { String word = wordCard.getBase() + wordCard.getCanonicalSuffix(); for (FlexiaModel fm : wordCard.getWordsForms()) { @@ -134,7 +133,12 @@ public class TestAllWords { wordCount.set(1L + wordCount.get()); } } - }); + }; + + WordCleaner wordCleaner = 
new WordCleaner(decoderEncoder, wordProcessor); + WordStringCleaner wordStringCleaner = new WordStringCleaner(decoderEncoder, wordCleaner); + RemoveFlexiaWithPrefixes removeFlexiaWithPrefixes = new RemoveFlexiaWithPrefixes(wordStringCleaner); + dictionaryReader.process(removeFlexiaWithPrefixes); long time = System.currentTimeMillis() - startTime; System.out.println("Done " + wordCount.get() + " in " + time + " ms. " + wordCount.get() / (time / 1000L) + " word per second"); diff --git a/dictionary-reader/src/test/resources/russian/russian-morphology-test.txt b/dictionary-reader/src/test/resources/russian/russian-morphology-test.txt index c775e7d..dc18b42 100644 --- a/dictionary-reader/src/test/resources/russian/russian-morphology-test.txt +++ b/dictionary-reader/src/test/resources/russian/russian-morphology-test.txt @@ -16,4 +16,5 @@ тосклив тоскливый лучший хороший на на -тест тест тесто \ No newline at end of file +тест тест тесто +наибольшую наибольший \ No newline at end of file