adding modules

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@49 d817d54c-26ab-11de-abc9-2f7d1455ff7a
2009-10-02 16:25:08 +00:00
parent 786ce92ae0
commit 710384987c
36 changed files with 221 additions and 695427 deletions
--- a/dictionary-reader/src/main/java/org/apache/lucene/morpholgy/dictionary/DictonaryReader.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morpholgy/dictionary/DictonaryReader.java
@@ -0,0 +1,125 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.morpholgy.dictionary;
+
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.*;
+
+
+/**
+ * Reads a dictionary file and hands every word, together with all of its
+ * forms, to a {@link WordProccessor}.
+ *
+ * <p>The file is consumed as a sequence of blocks. Every block starts with a
+ * line holding the number of lines that follow. The blocks are read in this
+ * order: flexia models, two blocks that are skipped, prefix sets and words.
+ */
+public class DictonaryReader {
+    private String fileName;
+    private String fileEncoding = "windows-1251";
+    private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
+    private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
+    private Set<String> ingnoredForm = new HashSet<String>();
+
+    /**
+     * Creates a reader that decodes the file as windows-1251.
+     *
+     * @param fileName     path of the dictionary file
+     * @param ingnoredForm codes of first forms whose words must be skipped;
+     *                     {@code null} is treated as an empty set
+     */
+    public DictonaryReader(String fileName, Set<String> ingnoredForm) {
+        this.fileName = fileName;
+        if (ingnoredForm != null) {
+            this.ingnoredForm = ingnoredForm;
+        }
+    }
+
+    /**
+     * Creates a reader with an explicit file encoding.
+     *
+     * @param fileName     path of the dictionary file
+     * @param fileEncoding name of the charset used to decode the file
+     * @param ingnoredForm codes of first forms whose words must be skipped;
+     *                     {@code null} is treated as an empty set
+     */
+    public DictonaryReader(String fileName, String fileEncoding, Set<String> ingnoredForm) {
+        this.fileName = fileName;
+        this.fileEncoding = fileEncoding;
+        if (ingnoredForm != null) {
+            this.ingnoredForm = ingnoredForm;
+        }
+    }
+
+
+    /**
+     * Reads the whole dictionary and calls the processor once per accepted word.
+     *
+     * @param wordProccessor receiver of the word cards
+     * @throws IOException if the file cannot be read, ends too early, or the
+     *                     processor itself fails
+     */
+    public void proccess(WordProccessor wordProccessor) throws IOException {
+        // Drop tables left over from a previous call so they do not pile up.
+        wordsFlexias.clear();
+        wordPrefixes.clear();
+
+        FileInputStream input = new FileInputStream(fileName);
+        try {
+            BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(input, fileEncoding));
+            readFlexias(bufferedReader);
+            sckipBlock(bufferedReader);
+            sckipBlock(bufferedReader);
+            readPrefix(bufferedReader);
+            readWords(bufferedReader, wordProccessor);
+        } finally {
+            // Closing the underlying stream releases the file handle even on failure.
+            input.close();
+        }
+    }
+
+
+    /**
+     * Reads the word block. Each line is split on spaces: the first token is
+     * the word base, the second the index of its flexia model list.
+     */
+    private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
+        int count = readCount(reader);
+        for (int i = 0; i < count; i++) {
+            String s = readRequiredLine(reader);
+            if (i % 10000 == 0) {
+                System.out.println("Proccess " + i + " wordBase of " + count);
+            }
+
+            String[] wd = s.split(" ");
+            String wordBase = wd[0].toLowerCase(Locale.ENGLISH);
+            if (wordBase.startsWith("-")) {
+                continue;
+            }
+            // "#" stands for an empty base.
+            wordBase = "#".equals(wordBase) ? "" : wordBase;
+            List<FlexiaModel> models = wordsFlexias.get(Integer.parseInt(wd[1]));
+            // A model list is empty when all of its entries were dropped by
+            // addFlexia; check before touching the first element.
+            if (models.isEmpty()) {
+                continue;
+            }
+            FlexiaModel flexiaModel = models.get(0);
+            if (!ingnoredForm.contains(flexiaModel.getCode())) {
+                WordCard card = new WordCard(flexiaModel.create(wordBase), wordBase, flexiaModel.getSuffix());
+                for (FlexiaModel fm : models) {
+                    card.addFlexia(fm);
+                }
+                wordProccessor.proccess(card);
+            }
+        }
+    }
+
+
+    /** Reads one block and discards its lines. */
+    private void sckipBlock(BufferedReader reader) throws IOException {
+        int count = readCount(reader);
+        for (int i = 0; i < count; i++) {
+            readRequiredLine(reader);
+        }
+    }
+
+
+    /** Reads the prefix block; every line is a comma separated list of prefixes. */
+    private void readPrefix(BufferedReader reader) throws IOException {
+        int count = readCount(reader);
+        for (int i = 0; i < count; i++) {
+            String s = readRequiredLine(reader);
+            wordPrefixes.add(Arrays.asList(s.toLowerCase(Locale.ENGLISH).split(",")));
+        }
+    }
+
+    /** Reads the flexia block; every line holds '%' separated flexia entries. */
+    private void readFlexias(BufferedReader reader) throws IOException {
+        int count = readCount(reader);
+        for (int i = 0; i < count; i++) {
+            String s = readRequiredLine(reader);
+            List<FlexiaModel> flexiaModelList = new ArrayList<FlexiaModel>();
+            wordsFlexias.add(flexiaModelList);
+            for (String line : s.split("%")) {
+                addFlexia(flexiaModelList, line);
+            }
+        }
+    }
+
+    /**
+     * Parses one flexia entry of the form {@code suffix*code} and appends it
+     * to the list. Entries with three '*'-separated parts are not added; they
+     * are only printed.
+     */
+    private void addFlexia(List<FlexiaModel> flexiaModelList, String line) {
+        String[] fl = line.split("\\*");
+        if (fl.length == 3) {
+            System.out.println(line);
+        }
+        if (fl.length == 2) {
+            flexiaModelList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(Locale.ENGLISH), ""));
+        }
+    }
+
+    /** Reads the line that opens a block and returns the number it holds. */
+    private int readCount(BufferedReader reader) throws IOException {
+        return Integer.parseInt(readRequiredLine(reader).trim());
+    }
+
+    /** Reads one line and fails with a clear message when the file ends too early. */
+    private String readRequiredLine(BufferedReader reader) throws IOException {
+        String line = reader.readLine();
+        if (line == null) {
+            throw new IOException("Unexpected end of dictionary file " + fileName);
+        }
+        return line;
+    }
+
+}
--- a/dictionary-reader/src/main/java/org/apache/lucene/morpholgy/dictionary/FlexiaModel.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morpholgy/dictionary/FlexiaModel.java
@@ -0,0 +1,65 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.morpholgy.dictionary;
+
+/**
+ * Describes how one word form is built from the immutable part of a word:
+ * a prefix and a suffix to attach, plus the code of the form.
+ */
+public class FlexiaModel {
+    private String code;
+    private String suffix;
+    private String prefix;
+
+    /**
+     * @param code   code of the form
+     * @param suffix text appended after the word base
+     * @param prefix text placed before the word base
+     */
+    public FlexiaModel(String code, String suffix, String prefix) {
+        this.code = code;
+        this.suffix = suffix;
+        this.prefix = prefix;
+    }
+
+    /** @return code of the form */
+    public String getCode() {
+        return this.code;
+    }
+
+    /** @param code new code of the form */
+    public void setCode(String code) {
+        this.code = code;
+    }
+
+    /** @return text appended after the word base */
+    public String getSuffix() {
+        return this.suffix;
+    }
+
+    /** @param suffix new text appended after the word base */
+    public void setSuffix(String suffix) {
+        this.suffix = suffix;
+    }
+
+    /** @return text placed before the word base */
+    public String getPrefix() {
+        return this.prefix;
+    }
+
+    /** @param prefix new text placed before the word base */
+    public void setPrefix(String prefix) {
+        this.prefix = prefix;
+    }
+
+    /**
+     * Builds the word form for the given base.
+     *
+     * @param s word base
+     * @return prefix, base and suffix joined in that order
+     */
+    public String create(String s) {
+        StringBuilder form = new StringBuilder();
+        form.append(this.prefix);
+        form.append(s);
+        form.append(this.suffix);
+        return form.toString();
+    }
+
+    /** @return prefix and suffix separated by a single space */
+    @Override
+    public String toString() {
+        StringBuilder text = new StringBuilder();
+        text.append(this.prefix);
+        text.append(" ");
+        text.append(this.suffix);
+        return text.toString();
+    }
+}
--- a/dictionary-reader/src/main/java/org/apache/lucene/morpholgy/dictionary/FrequentyReader.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morpholgy/dictionary/FrequentyReader.java
@@ -0,0 +1,56 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.morpholgy.dictionary;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.HashMap;
+import java.util.Map;
+
+
+/**
+ * Reads a word frequency file into a map.
+ *
+ * <p>Every non-blank line is split on single spaces; the second token is
+ * parsed as the frequency and the third token is used as the map key.
+ */
+public class FrequentyReader {
+    private String fileName;
+    private String fileEncoding = "windows-1251";
+
+    /**
+     * Creates a reader that decodes the file as windows-1251.
+     *
+     * @param fileName path of the frequency file
+     */
+    public FrequentyReader(String fileName) {
+        this.fileName = fileName;
+    }
+
+    /**
+     * Creates a reader with an explicit file encoding.
+     *
+     * @param fileName     path of the frequency file
+     * @param fileEncoding name of the charset used to decode the file
+     */
+    public FrequentyReader(String fileName, String fileEncoding) {
+        this.fileName = fileName;
+        this.fileEncoding = fileEncoding;
+    }
+
+
+    /**
+     * Reads the whole file.
+     *
+     * @return map from the third token of each line to the value of its second token
+     * @throws IOException           if the file cannot be read or a non-blank
+     *                               line has fewer than three tokens
+     * @throws NumberFormatException if the second token is not a number
+     */
+    public Map<String, Double> read() throws IOException {
+        Map<String, Double> result = new HashMap<String, Double>();
+
+        FileInputStream input = new FileInputStream(fileName);
+        try {
+            BufferedReader bufferedReader = new BufferedReader(
+                    new InputStreamReader(input, fileEncoding));
+            String s = bufferedReader.readLine();
+            while (s != null) {
+                // Blank lines (for example a trailing one) carry no data.
+                if (s.trim().length() > 0) {
+                    String[] strings = s.split(" ");
+                    if (strings.length < 3) {
+                        throw new IOException("Malformed frequency line in " + fileName + ": " + s);
+                    }
+                    result.put(strings[2], Double.valueOf(strings[1]));
+                }
+                s = bufferedReader.readLine();
+            }
+        } finally {
+            // Closing the underlying stream releases the file handle even on failure.
+            input.close();
+        }
+        return result;
+    }
+}
--- a/dictionary-reader/src/main/java/org/apache/lucene/morpholgy/dictionary/GrammaReader.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morpholgy/dictionary/GrammaReader.java
@@ -0,0 +1,76 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.morpholgy.dictionary;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+//todo split this class in two.
+/**
+ * Loads a grammar description file when it is constructed.
+ *
+ * <p>Blank lines and lines starting with {@code //} are skipped. Every other
+ * line is split at the first space into a code and its description. The
+ * descriptions are kept in file order, and the inverse index maps each code
+ * to the position of its description.
+ */
+public class GrammaReader {
+    private String fileName;
+    private String fileEncoding = "windows-1251";
+    private List<String> grammaInfo = new ArrayList<String>();
+    private Map<String, Integer> inversIndex = new HashMap<String, Integer>();
+
+    /**
+     * Loads the file, decoding it as windows-1251.
+     *
+     * @param fileName path of the grammar file
+     * @throws IOException if the file cannot be read or contains a malformed line
+     */
+    public GrammaReader(String fileName) throws IOException {
+        this.fileName = fileName;
+        setUp();
+    }
+
+    /**
+     * Loads the file with an explicit encoding.
+     *
+     * @param fileName     path of the grammar file
+     * @param fileEncoding name of the charset used to decode the file
+     * @throws IOException if the file cannot be read or contains a malformed line
+     */
+    public GrammaReader(String fileName, String fileEncoding) throws IOException {
+        this.fileName = fileName;
+        this.fileEncoding = fileEncoding;
+        setUp();
+    }
+
+    /** Reads the file and fills the description list and the inverse index. */
+    private void setUp() throws IOException {
+        FileInputStream input = new FileInputStream(fileName);
+        try {
+            BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(input, fileEncoding));
+            String line = bufferedReader.readLine();
+            while (line != null) {
+                line = line.trim();
+                if (!line.startsWith("//") && line.length() > 0) {
+                    String[] strings = line.split(" ", 2);
+                    if (strings.length < 2) {
+                        throw new IOException("Malformed grammar line in " + fileName + ": " + line);
+                    }
+                    // The next free position becomes the index of this code.
+                    Integer i = grammaInfo.size();
+                    inversIndex.put(strings[0], i);
+                    grammaInfo.add(strings[1]);
+                }
+                line = bufferedReader.readLine();
+            }
+        } finally {
+            // Closing the underlying stream releases the file handle even on failure.
+            input.close();
+        }
+    }
+
+    /** @return descriptions in file order (the internal list, not a copy) */
+    public List<String> getGrammaInfo() {
+        return grammaInfo;
+    }
+
+    /** @return descriptions in file order, copied into a new array */
+    public String[] getGrammaInfoAsArray() {
+        return grammaInfo.toArray(new String[grammaInfo.size()]);
+    }
+
+    /** @return map from code to the position of its description */
+    public Map<String, Integer> getGrammInversIndex() {
+        return inversIndex;
+    }
+
+    /** @param inversIndex replacement for the code to position map */
+    public void setInversIndex(Map<String, Integer> inversIndex) {
+        this.inversIndex = inversIndex;
+    }
+}
--- a/dictionary-reader/src/main/java/org/apache/lucene/morpholgy/dictionary/IgnoredFormReader.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morpholgy/dictionary/IgnoredFormReader.java
@@ -0,0 +1,54 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.morpholgy.dictionary;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.HashSet;
+import java.util.Set;
+
+
+/**
+ * Reads the list of ignored form codes from a file.
+ *
+ * <p>Blank lines and lines starting with {@code //} are skipped; the code is
+ * the text before the first space of every remaining line.
+ */
+public class IgnoredFormReader {
+    private String fileName;
+    private String fileEncoding = "windows-1251";
+
+    /**
+     * Creates a reader that decodes the file as windows-1251.
+     *
+     * @param fileName path of the file listing the ignored forms
+     */
+    public IgnoredFormReader(String fileName) {
+        this.fileName = fileName;
+    }
+
+    /**
+     * Creates a reader with an explicit file encoding.
+     *
+     * @param fileName     path of the file listing the ignored forms
+     * @param fileEncoding name of the charset used to decode the file
+     */
+    public IgnoredFormReader(String fileName, String fileEncoding) {
+        this.fileName = fileName;
+        this.fileEncoding = fileEncoding;
+    }
+
+    /**
+     * Reads the whole file.
+     *
+     * @return the set of codes found in the file
+     * @throws IOException if the file cannot be read
+     */
+    public Set<String> getIngnoredFroms() throws IOException {
+        Set<String> result = new HashSet<String>();
+        FileInputStream input = new FileInputStream(fileName);
+        try {
+            BufferedReader bufferedReader = new BufferedReader(
+                    new InputStreamReader(input, fileEncoding));
+            String s = bufferedReader.readLine();
+            while (s != null) {
+                // Trim first so that indented comments and blank lines do not
+                // end up in the result as "//" or "".
+                String line = s.trim();
+                if (line.length() > 0 && !line.startsWith("//")) {
+                    result.add(line.split(" ")[0]);
+                }
+                s = bufferedReader.readLine();
+            }
+        } finally {
+            // Closing the underlying stream releases the file handle even on failure.
+            input.close();
+        }
+        return result;
+    }
+}
--- a/dictionary-reader/src/main/java/org/apache/lucene/morpholgy/dictionary/StatiticsCollector.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morpholgy/dictionary/StatiticsCollector.java
@@ -0,0 +1,147 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.morpholgy.dictionary;
+
+
+import org.apache.lucene.morphology.Heuristic;
+import org.apache.lucene.morphology.LetterDecoderEncoder;
+import org.apache.lucene.morphology.Morph;
+
+import java.io.IOException;
+import java.util.*;
+
+
+/**
+ * Collects, for every word form it is given, the set of heuristics that lead
+ * from that form back to the normal form, and writes the result out as a
+ * {@link Morph}.
+ *
+ * <p>Forms are stored reversed (last character first) in a sorted map, so
+ * forms sharing an ending are neighbours in iteration order.
+ */
+public class StatiticsCollector implements WordProccessor {
+    private TreeMap<String, Set<Heuristic>> inversIndex = new TreeMap<String, Set<Heuristic>>();
+    private Map<Set<Heuristic>, Integer> ruleInverIndex = new HashMap<Set<Heuristic>, Integer>();
+    private List<Set<Heuristic>> rules = new ArrayList<Set<Heuristic>>();
+    private GrammaReader grammaReader;
+    private LetterDecoderEncoder decoderEncoder;
+
+
+    /**
+     * Creates a collector without a letter encoder. Such a collector can
+     * gather words, but {@link #saveHeuristic()} fails once any word has been
+     * collected; use the two-argument constructor to be able to save.
+     *
+     * @param grammaReader source of the grammar code index
+     */
+    public StatiticsCollector(GrammaReader grammaReader) {
+        this.grammaReader = grammaReader;
+    }
+
+    /**
+     * Creates a collector that is able to save its result.
+     *
+     * @param grammaReader   source of the grammar code index
+     * @param decoderEncoder encoder used to turn reversed forms into int arrays
+     */
+    public StatiticsCollector(GrammaReader grammaReader, LetterDecoderEncoder decoderEncoder) {
+        this.grammaReader = grammaReader;
+        this.decoderEncoder = decoderEncoder;
+    }
+
+    /**
+     * Registers one heuristic per form of the card. Cards whose normal form
+     * contains a hyphen are ignored.
+     *
+     * @param wordCard word with all its forms; must contain at least one form
+     */
+    public void proccess(WordCard wordCard) throws IOException {
+        wordCard = cleanWordCard(wordCard);
+        String normalStringMorph = wordCard.getWordsFroms().get(0).getCode();
+        String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
+        if (word.contains("-")) {
+            return;
+        }
+
+        for (FlexiaModel fm : wordCard.getWordsFroms()) {
+            Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph);
+            String form = revertWord(fm.create(wordCard.getBase()));
+            Set<Heuristic> suffixHeuristics = inversIndex.get(form);
+            if (suffixHeuristics == null) {
+                suffixHeuristics = new HashSet<Heuristic>();
+                inversIndex.put(form, suffixHeuristics);
+            }
+            suffixHeuristics.add(heuristic);
+        }
+    }
+
+    /** Currently returns the card unchanged. */
+    private WordCard cleanWordCard(WordCard wordCard) {
+        return wordCard;
+    }
+
+
+    /**
+     * Prints statistics about the collected forms, then builds a
+     * {@link Morph} from them and writes it to the file {@code sep.txt}.
+     *
+     * <p>Only forms whose heuristic set differs from that of the previous
+     * form in sorted order are kept.
+     *
+     * @throws IllegalStateException if words were collected but no letter
+     *                               encoder was supplied at construction
+     * @throws IOException           if writing fails
+     */
+    public void saveHeuristic() throws IOException {
+        if (decoderEncoder == null && !inversIndex.isEmpty()) {
+            throw new IllegalStateException(
+                    "No LetterDecoderEncoder was supplied; construct the collector with one before saving");
+        }
+
+        // First pass: count the kept forms, number the distinct rule sets and
+        // gather the distribution of kept forms by length.
+        Map<Integer, Integer> dist = new TreeMap<Integer, Integer>();
+        Set<Heuristic> prevSet = null;
+        int count = 0;
+        for (Map.Entry<String, Set<Heuristic>> entry : inversIndex.entrySet()) {
+            String key = entry.getKey();
+            Set<Heuristic> currentSet = entry.getValue();
+            if (!currentSet.equals(prevSet)) {
+                Integer d = dist.get(key.length());
+                dist.put(key.length(), 1 + (d == null ? 0 : d));
+                prevSet = currentSet;
+                count++;
+                if (!ruleInverIndex.containsKey(currentSet)) {
+                    ruleInverIndex.put(currentSet, rules.size());
+                    rules.add(currentSet);
+                }
+            }
+        }
+        System.out.println("Word with diffirent rules " + count);
+        System.out.println("All ivers words " + inversIndex.size());
+        System.out.println(dist);
+        System.out.println("diffirent rule count " + ruleInverIndex.size());
+
+        Heuristic[][] heuristics = new Heuristic[ruleInverIndex.size()][];
+        int index = 0;
+        for (Set<Heuristic> hs : rules) {
+            heuristics[index] = hs.toArray(new Heuristic[hs.size()]);
+            index++;
+        }
+
+        // Second pass: encode every kept form and remember the id of its rule set.
+        int[][] ints = new int[count][];
+        short[] rulesId = new short[count];
+        count = 0;
+        prevSet = null;
+        for (Map.Entry<String, Set<Heuristic>> entry : inversIndex.entrySet()) {
+            Set<Heuristic> currentSet = entry.getValue();
+            if (!currentSet.equals(prevSet)) {
+                ints[count] = decoderEncoder.encodeToArray(entry.getKey());
+                // NOTE: the cast truncates silently if there are more than
+                // Short.MAX_VALUE distinct rule sets.
+                rulesId[count] = (short) ruleInverIndex.get(currentSet).intValue();
+                count++;
+                prevSet = currentSet;
+            }
+        }
+        Morph morph = new Morph(ints, rulesId, heuristics, grammaReader.getGrammaInfoAsArray());
+        morph.writeToFile("sep.txt");
+    }
+
+    /** Returns the characters of the string in reverse order. */
+    private String revertWord(String s) {
+        StringBuilder result = new StringBuilder(s.length());
+        for (int i = s.length() - 1; i >= 0; i--) {
+            result.append(s.charAt(i));
+        }
+        return result.toString();
+    }
+
+
+    /**
+     * Builds the heuristic that turns one form into the normal form: how many
+     * trailing characters of the form to drop, which suffix to append instead,
+     * and the grammar indexes of the form and of the normal form. Grammar
+     * codes are looked up by their first two characters.
+     */
+    private Heuristic createEvristic(String wordBase, String canonicalSuffix, FlexiaModel fm, String normalSuffixForm) {
+        String form = fm.create(wordBase);
+        String normalForm = wordBase + canonicalSuffix;
+        int length = getCommonLength(form, normalForm);
+        int actualSuffixLengh = form.length() - length;
+        String actualNormalSuffix = normalForm.substring(length);
+        Integer integer = grammaReader.getGrammInversIndex().get(fm.getCode().substring(0, 2));
+        Integer nf = grammaReader.getGrammInversIndex().get(normalSuffixForm.substring(0, 2));
+        return new Heuristic((byte) actualSuffixLengh, actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue());
+    }
+
+    /**
+     * Returns the length of the longest common prefix of two strings.
+     *
+     * @param s1 first string
+     * @param s2 second string
+     * @return number of leading characters the strings share
+     */
+    public static Integer getCommonLength(String s1, String s2) {
+        Integer length = Math.min(s1.length(), s2.length());
+        for (int i = 0; i < length; i++) {
+            if (s1.charAt(i) != s2.charAt(i)) {
+                return i;
+            }
+        }
+        return length;
+    }
+
+    /** Delegates to the letter encoder's {@code cleanString}. */
+    private String cleanString(String s) {
+        return decoderEncoder.cleanString(s);
+    }
+
+}
--- a/dictionary-reader/src/main/java/org/apache/lucene/morpholgy/dictionary/WordCard.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morpholgy/dictionary/WordCard.java
@@ -0,0 +1,72 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.morpholgy.dictionary;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Represents a word together with all of its forms.
+ */
+public class WordCard {
+    private String canonicalFrom;
+    private String base;
+    private String canonicalSuffix;
+    private List<FlexiaModel> wordsFroms = new ArrayList<FlexiaModel>();
+
+    /**
+     * @param canonicalFrom   canonical form of the word
+     * @param base            base of the word
+     * @param canonicalSuffix suffix of the canonical form
+     */
+    public WordCard(String canonicalFrom, String base, String canonicalSuffix) {
+        this.canonicalFrom = canonicalFrom;
+        this.base = base;
+        this.canonicalSuffix = canonicalSuffix;
+    }
+
+    /** Appends one more form to this card. */
+    public void addFlexia(FlexiaModel flexiaModel) {
+        this.wordsFroms.add(flexiaModel);
+    }
+
+    /** @return canonical form of the word */
+    public String getCanonicalFrom() {
+        return this.canonicalFrom;
+    }
+
+    /** @return suffix of the canonical form */
+    public String getCanonicalSuffix() {
+        return this.canonicalSuffix;
+    }
+
+    /** @return base of the word */
+    public String getBase() {
+        return this.base;
+    }
+
+    /** @return the forms added so far (the internal list, not a copy) */
+    public List<FlexiaModel> getWordsFroms() {
+        return this.wordsFroms;
+    }
+
+    /** @param canonicalFrom new canonical form */
+    public void setCanonicalFrom(String canonicalFrom) {
+        this.canonicalFrom = canonicalFrom;
+    }
+
+    /** @param base new base of the word */
+    public void setBase(String base) {
+        this.base = base;
+    }
+
+    /** @param canonicalSuffix new suffix of the canonical form */
+    public void setCanonicalSuffix(String canonicalSuffix) {
+        this.canonicalSuffix = canonicalSuffix;
+    }
+
+    /** @param wordsFroms list that replaces the current forms */
+    public void setWordsFroms(List<FlexiaModel> wordsFroms) {
+        this.wordsFroms = wordsFroms;
+    }
+}
--- a/dictionary-reader/src/main/java/org/apache/lucene/morpholgy/dictionary/WordProccessor.java
+++ b/dictionary-reader/src/main/java/org/apache/lucene/morpholgy/dictionary/WordProccessor.java
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.morpholgy.dictionary;
+
+import java.io.IOException;
+
+/**
+ * Callback through which a caller receives the words read by
+ * {@link DictonaryReader}.
+ */
+public interface WordProccessor {
+
+    /**
+     * Handles one word with all of its forms.
+     *
+     * @param wordCard the word to handle
+     * @throws IOException if handling the word fails
+     */
+    void proccess(WordCard wordCard) throws IOException;
+}