Adding support for the comparative degree of adjectives. It is now treated as a separate word form.

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@101 d817d54c-26ab-11de-abc9-2f7d1455ff7a
2010-10-10 20:55:46 +00:00
parent 3de894404c
commit ba5272acb8
11 changed files with 176 additions and 51 deletions
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictionaryReader.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictionaryReader.java
@@ -21,7 +21,10 @@ import java.io.BufferedReader;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;


 /**
@@ -32,14 +35,11 @@ public class DictionaryReader {
    private String fileName;
    private String fileEncoding = "windows-1251";
    private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
-    private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
    private Set<String> ignoredForm = new HashSet<String>();
-    private List<WordFilter> filters = new ArrayList<WordFilter>();

-    public DictionaryReader(String fileName, Set<String> ignoredForm, List<WordFilter> filters) {
+    public DictionaryReader(String fileName, Set<String> ignoredForm) {
        this.fileName = fileName;
        this.ignoredForm = ignoredForm;
-        this.filters = filters;
    }


@@ -63,11 +63,6 @@ public class DictionaryReader {

            WordCard card = buildForm(s);

-            for (WordFilter wf : filters) {
-                if (card == null) break;
-                card = wf.transform(card);
-            }
-
            if (card == null) {
                continue;
            }
@@ -112,8 +107,7 @@ public class DictionaryReader {
        String s = reader.readLine();
        int count = Integer.valueOf(s);
        for (int i = 0; i < count; i++) {
-            s = reader.readLine();
-            wordPrefixes.add(Arrays.asList(s.toLowerCase().split(",")));
+            reader.readLine();
        }
    }

@@ -135,7 +129,7 @@ public class DictionaryReader {
        // we inored all forms thats
        if (fl.length == 3) {
            //System.out.println(line);
-            // flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), cleanString(fl[2].toLowerCase())));
+            flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
        }
        if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
    }
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/RemoveFlexiaWithPrefixes.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/RemoveFlexiaWithPrefixes.java
@@ -0,0 +1,44 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.morphology.dictionary;
+
+import java.util.Arrays;
+import java.util.LinkedList;
+import java.util.List;
+
+
+public class RemoveFlexiaWithPrefixes extends WordFilter {
+
+    public RemoveFlexiaWithPrefixes(WordProcessor wordProcessor) {
+        super(wordProcessor);
+    }
+
+    @Override
+    public List<WordCard> transform(WordCard wordCard) {
+
+        List<FlexiaModel> flexiaModelsToRemove = new LinkedList<FlexiaModel>();
+        for (FlexiaModel fm : wordCard.getWordsForms()) {
+            if (fm.getPrefix().length() > 0) {
+                flexiaModelsToRemove.add(fm);
+            }
+        }
+        for (FlexiaModel fm : flexiaModelsToRemove) {
+            wordCard.removeFlexia(fm);
+        }
+
+        return new LinkedList<WordCard>(Arrays.asList(wordCard));
+    }
+}
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/RussianAdvSplitterFilter.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/RussianAdvSplitterFilter.java
@@ -0,0 +1,61 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.morphology.dictionary;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.LinkedList;
+import java.util.List;
+
+
+public class RussianAdvSplitterFilter extends WordFilter {
+    private String code;
+
+    public RussianAdvSplitterFilter(WordProcessor wordProcessor) throws IOException {
+        super(wordProcessor);
+        code = new BufferedReader(new InputStreamReader(this.getClass().getResourceAsStream("/russian-adv-main-code.txt"), "windows-1251")).readLine();
+    }
+
+    @Override
+    public List<WordCard> transform(WordCard wordCard) {
+        LinkedList<WordCard> result = new LinkedList<WordCard>();
+        result.add(wordCard);
+
+        String baseWord = "";
+        String canonicalForm = "";
+        String canonicalSuffix = "";
+        List<FlexiaModel> flexiaModels = new LinkedList<FlexiaModel>();
+        for (FlexiaModel flexiaModel : wordCard.getWordsForms()) {
+            if (flexiaModel.getPrefix().length() > 0) {
+                flexiaModels.add(new FlexiaModel(flexiaModel.getCode(), flexiaModel.getSuffix(), ""));
+            }
+            if (flexiaModel.getPrefix().length() > 0 && flexiaModel.getCode().equals(code)) {
+                baseWord = flexiaModel.getPrefix() + wordCard.getBase();
+                canonicalForm = flexiaModel.getCode();
+                canonicalSuffix = flexiaModel.getSuffix();
+            }
+        }
+
+        if (baseWord.length() > 0) {
+            WordCard wc = new WordCard(canonicalForm, baseWord, canonicalSuffix);
+            wc.setWordsForms(flexiaModels);
+            result.add(wc);
+        }
+
+        return result;
+    }
+}
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCleaner.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCleaner.java
@@ -17,23 +17,26 @@ package org.apache.lucene.morphology.dictionary;

 import org.apache.lucene.morphology.LetterDecoderEncoder;

+import java.util.Arrays;
+import java.util.Collections;
 import java.util.LinkedList;
 import java.util.List;


-public class WordCleaner implements WordFilter {
+public class WordCleaner extends WordFilter {

    private LetterDecoderEncoder decoderEncoder;

-    public WordCleaner(LetterDecoderEncoder decoderEncoder) {
+    public WordCleaner(LetterDecoderEncoder decoderEncoder, WordProcessor wordProcessor) {
+        super(wordProcessor);
        this.decoderEncoder = decoderEncoder;
    }

-    public WordCard transform(WordCard wordCard) {
+    public List<WordCard> transform(WordCard wordCard) {
        String word = wordCard.getBase() + wordCard.getCanonicalSuffix();

-        if (word.contains("-")) return null;
-        if (!decoderEncoder.checkString(word)) return null;
+        if (word.contains("-")) return Collections.emptyList();
+        if (!decoderEncoder.checkString(word)) return Collections.emptyList();

        List<FlexiaModel> flexiaModelsToRemove = new LinkedList<FlexiaModel>();
        for (FlexiaModel fm : wordCard.getWordsForms()) {
@@ -45,6 +48,6 @@ public class WordCleaner implements WordFilter {
            wordCard.removeFlexia(fm);
        }

-        return wordCard;
+        return new LinkedList<WordCard>(Arrays.asList(wordCard));
    }
 }
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordFilter.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordFilter.java
@@ -16,9 +16,22 @@

 package org.apache.lucene.morphology.dictionary;

+import java.io.IOException;
+import java.util.List;

-public interface WordFilter {

-    public WordCard transform(WordCard wordCard);
+abstract public class WordFilter implements WordProcessor {
+    private WordProcessor wordProcessor;

+    public WordFilter(WordProcessor wordProcessor) {
+        this.wordProcessor = wordProcessor;
+    }
+
+    abstract public List<WordCard> transform(WordCard wordCard);
+
+    public void process(WordCard wordCard) throws IOException {
+        for (WordCard wc : transform(wordCard)) {
+            wordProcessor.process(wc);
+        }
+    }
 }
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordStringCleaner.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordStringCleaner.java
@@ -17,18 +17,21 @@ package org.apache.lucene.morphology.dictionary;

 import org.apache.lucene.morphology.LetterDecoderEncoder;

+import java.util.Arrays;
+import java.util.LinkedList;
 import java.util.List;


-public class WordStringCleaner implements WordFilter {
+public class WordStringCleaner extends WordFilter {

    private LetterDecoderEncoder decoderEncoder;

-    public WordStringCleaner(LetterDecoderEncoder decoderEncoder) {
+    public WordStringCleaner(LetterDecoderEncoder decoderEncoder, WordProcessor wordProcessor) {
+        super(wordProcessor);
        this.decoderEncoder = decoderEncoder;
    }

-    public WordCard transform(WordCard wordCard) {
+    public List<WordCard> transform(WordCard wordCard) {
        wordCard.setBase(cleanString(wordCard.getBase()));
        wordCard.setCanonicalForm(cleanString(wordCard.getCanonicalForm()));
        wordCard.setCanonicalSuffix(cleanString(wordCard.getCanonicalSuffix()));
@@ -39,7 +42,7 @@ public class WordStringCleaner implements WordFilter {
            //made correct code
            m.setCode(m.getCode().substring(0, 2));
        }
-        return wordCard;
+        return new LinkedList<WordCard>(Arrays.asList(wordCard));
    }


--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/EnglishHeuristicBuilder.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/EnglishHeuristicBuilder.java
@@ -20,9 +20,7 @@ import org.apache.lucene.morphology.EnglishLetterDecoderEncoder;
 import org.apache.lucene.morphology.dictionary.*;

 import java.io.IOException;
-import java.util.Arrays;
 import java.util.HashSet;
-import java.util.List;


 public class EnglishHeuristicBuilder {
@@ -30,12 +28,14 @@ public class EnglishHeuristicBuilder {

        GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/egramtab.tab");
        EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
-        List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));

-        DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>(), filters);
+        DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>());

        StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder);
-        dictionaryReader.process(statisticsCollector);
+        WordCleaner wordCleaner = new WordCleaner(decoderEncoder, statisticsCollector);
+        WordStringCleaner wordStringCleaner = new WordStringCleaner(decoderEncoder, wordCleaner);
+        RemoveFlexiaWithPrefixes removeFlexiaWithPrefixes = new RemoveFlexiaWithPrefixes(wordStringCleaner);
+        dictionaryReader.process(removeFlexiaWithPrefixes);
        statisticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info");

    }
--- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianHeuristicBuilder.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianHeuristicBuilder.java
@@ -20,21 +20,22 @@ import org.apache.lucene.morphology.dictionary.*;
 import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder;

 import java.io.IOException;
-import java.util.Arrays;
 import java.util.HashSet;
-import java.util.List;


 public class RussianHeuristicBuilder {
    public static void main(String[] args) throws IOException {
        GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/rgramtab.tab");
        RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
-        List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));

-        DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>(), filters);
+        DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());

        StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder);
-        dictionaryReader.process(statisticsCollector);
+        WordCleaner wordCleaner = new WordCleaner(decoderEncoder, statisticsCollector);
+        WordStringCleaner wordStringCleaner = new WordStringCleaner(decoderEncoder, wordCleaner);
+        RemoveFlexiaWithPrefixes removeFlexiaWithPrefixes = new RemoveFlexiaWithPrefixes(wordStringCleaner);
+        RussianAdvSplitterFilter russianAdvSplitterFilter = new RussianAdvSplitterFilter(removeFlexiaWithPrefixes);
+        dictionaryReader.process(russianAdvSplitterFilter);
        statisticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info");

    }
--- a/dictionary-reader/src/main/resources/russian-adv-main-code.txt
+++ b/dictionary-reader/src/main/resources/russian-adv-main-code.txt
@@ -0,0 +1 @@
+<EFBFBD><EFBFBD>
--- a/dictionary-reader/src/test/java/org/apache/lucene/TestAllWords.java
+++ b/dictionary-reader/src/test/java/org/apache/lucene/TestAllWords.java
@@ -24,7 +24,6 @@ import org.junit.Before;
 import org.junit.Test;

 import java.io.IOException;
-import java.util.Arrays;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
@@ -71,15 +70,12 @@ public class TestAllWords {
        final List<String> morphInfo = grammarInfo.getGrammarInfo();
        final Map<String, Integer> inversIndex = grammarInfo.getGrammarInverseIndex();

-        List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
-
-
-        DictionaryReader dictionaryReader = new DictionaryReader(pathToDict, new HashSet<String>(), filters);
+        DictionaryReader dictionaryReader = new DictionaryReader(pathToDict, new HashSet<String>());

        final AtomicLong wordCount = new AtomicLong(0);
        Long startTime = System.currentTimeMillis();

-        dictionaryReader.process(new WordProcessor() {
+        WordProcessor wordProcessor = new WordProcessor() {
            public void process(WordCard wordCard) throws IOException {
                String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
                for (FlexiaModel fm : wordCard.getWordsForms()) {
@@ -90,7 +86,12 @@ public class TestAllWords {
                    wordCount.set(2L + wordCount.get());
                }
            }
-        });
+        };
+
+        WordCleaner wordCleaner = new WordCleaner(decoderEncoder, wordProcessor);
+        WordStringCleaner wordStringCleaner = new WordStringCleaner(decoderEncoder, wordCleaner);
+        RemoveFlexiaWithPrefixes removeFlexiaWithPrefixes = new RemoveFlexiaWithPrefixes(wordStringCleaner);
+        dictionaryReader.process(removeFlexiaWithPrefixes);

        long time = System.currentTimeMillis() - startTime;
        System.out.println("Done " + wordCount.get() + " in " + time + " ms. " + wordCount.get() / (time / 1000L) + " word per second");
@@ -101,10 +102,9 @@ public class TestAllWords {
        final LuceneMorphology morphology = new EnglishLuceneMorphology();

        LetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
-        List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
        String pathToDic = prefix + "dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd";

-        testAllWordForLucene(morphology, filters, pathToDic);
+        testAllWordForLucene(morphology, decoderEncoder, pathToDic);
    }

    @Test
@@ -112,20 +112,19 @@ public class TestAllWords {
        final LuceneMorphology morphology = new RussianLuceneMorphology();

        LetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
-        List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));

        String pathToDic = prefix + "dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd";

-        testAllWordForLucene(morphology, filters, pathToDic);
+        testAllWordForLucene(morphology, decoderEncoder, pathToDic);

    }

-    private void testAllWordForLucene(final LuceneMorphology morphology, List<WordFilter> filters, String pathToDic) throws IOException {
+    private void testAllWordForLucene(final LuceneMorphology morphology, LetterDecoderEncoder decoderEncoder, String pathToDic) throws IOException {
        final AtomicLong wordCount = new AtomicLong(0);
        Long startTime = System.currentTimeMillis();

-        DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet<String>(), filters);
-        dictionaryReader.process(new WordProcessor() {
+        DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet<String>());
+        WordProcessor wordProcessor = new WordProcessor() {
            public void process(WordCard wordCard) throws IOException {
                String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
                for (FlexiaModel fm : wordCard.getWordsForms()) {
@@ -134,7 +133,12 @@ public class TestAllWords {
                    wordCount.set(1L + wordCount.get());
                }
            }
-        });
+        };
+
+        WordCleaner wordCleaner = new WordCleaner(decoderEncoder, wordProcessor);
+        WordStringCleaner wordStringCleaner = new WordStringCleaner(decoderEncoder, wordCleaner);
+        RemoveFlexiaWithPrefixes removeFlexiaWithPrefixes = new RemoveFlexiaWithPrefixes(wordStringCleaner);
+        dictionaryReader.process(removeFlexiaWithPrefixes);

        long time = System.currentTimeMillis() - startTime;
        System.out.println("Done " + wordCount.get() + " in " + time + " ms. " + wordCount.get() / (time / 1000L) + " word per second");
--- a/dictionary-reader/src/test/resources/russian/russian-morphology-test.txt
+++ b/dictionary-reader/src/test/resources/russian/russian-morphology-test.txt
@@ -16,4 +16,5 @@
 тосклив тоскливый
 лучший хороший
 на на
-тест тест тесто
+тест тест тесто
+наибольшую наибольший