adding modules

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@49 d817d54c-26ab-11de-abc9-2f7d1455ff7a
2009-10-02 16:25:08 +00:00
parent 786ce92ae0
commit 710384987c
36 changed files with 221 additions and 695427 deletions
--- a/morph/pom.xml
+++ b/morph/pom.xml
@ -0,0 +1,23 @@
+<?xml version="1.0"?>
+<project>
+    <!--
+      Maven descriptor for the "morph" module.
+      NOTE(review): the spelling "morpholgy" in the artifactId/groupId below is used
+      consistently in the parent reference and in this module's groupId; it has to
+      match whatever the parent POM declares, so do not correct it here in isolation.
+    -->
+    <parent>
+        <artifactId>morpholgy</artifactId>
+        <groupId>org.apache.lucene.morpholgy</groupId>
+        <version>0.7-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+    <groupId>org.apache.lucene.morpholgy</groupId>
+    <artifactId>morph</artifactId>
+    <name>morph</name>
+    <version>0.7-SNAPSHOT</version>
+    <url>http://maven.apache.org</url>
+
+    <dependencies>
+        <!-- JUnit is needed for tests only (test scope), it is not a runtime dependency. -->
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>3.8.1</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+</project>
--- a/morph/src/main/java/org/apache/lucene/morphology/Heuristic.java
+++ b/morph/src/main/java/org/apache/lucene/morphology/Heuristic.java
@ -0,0 +1,91 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.morphology;
+
+import java.io.Serializable;
+
+
+/**
+ * One suffix-rewriting rule: drop a fixed number of trailing characters from a
+ * word and append {@code actualNormalSuffix} in their place.
+ * <p>
+ * The text form produced by {@link #toString()} and parsed by
+ * {@link #Heuristic(String)} is
+ * {@code actualSuffixLengh|actualNormalSuffix|formMorphInfo|normalFormMorphInfo}.
+ */
+public class Heuristic implements Serializable {
+    /** Number of trailing characters removed from the word before the suffix is appended. */
+    byte actualSuffixLengh;
+    /** Suffix appended after the trailing characters have been removed. */
+    String actualNormalSuffix;
+    /** Numeric code attached to the word form. */
+    short formMorphInfo;
+    /** Numeric code attached to the normal form. */
+    short normalFormMorphInfo;
+
+    /**
+     * Parses the pipe-separated representation written by {@link #toString()}.
+     *
+     * @param s four fields separated by {@code |}: suffix length, normal suffix,
+     *          form code, normal-form code
+     * @throws NumberFormatException          if a numeric field cannot be parsed
+     * @throws ArrayIndexOutOfBoundsException if fewer than four fields are present
+     */
+    public Heuristic(String s) {
+        String[] fields = s.split("\\|");
+        // parseXxx yields the primitive directly instead of boxing and unboxing.
+        actualSuffixLengh = Byte.parseByte(fields[0]);
+        actualNormalSuffix = fields[1];
+        formMorphInfo = Short.parseShort(fields[2]);
+        normalFormMorphInfo = Short.parseShort(fields[3]);
+    }
+
+    /**
+     * Creates a rule from its four components.
+     *
+     * @param actualSuffixLengh   number of trailing characters to remove
+     * @param actualNormalSuffix  suffix to append
+     * @param formMorphInfo       numeric code of the word form
+     * @param normalFormMorphInfo numeric code of the normal form
+     */
+    public Heuristic(byte actualSuffixLengh, String actualNormalSuffix, short formMorphInfo, short normalFormMorphInfo) {
+        this.actualSuffixLengh = actualSuffixLengh;
+        this.actualNormalSuffix = actualNormalSuffix;
+        this.formMorphInfo = formMorphInfo;
+        this.normalFormMorphInfo = normalFormMorphInfo;
+    }
+
+    /**
+     * Applies the rule to a word.
+     *
+     * @param w the word to transform
+     * @return {@code w} with its last {@code actualSuffixLengh} characters replaced by
+     *         {@code actualNormalSuffix}; {@code w} unchanged when it is shorter than
+     *         the suffix to remove
+     */
+    public String transofrmWord(String w) {
+        int stemLength = w.length() - actualSuffixLengh;
+        if (stemLength < 0) {
+            // The rule does not fit this word; substring() would throw here.
+            return w;
+        }
+        return w.substring(0, stemLength) + actualNormalSuffix;
+    }
+
+    public byte getActualSuffixLengh() {
+        return actualSuffixLengh;
+    }
+
+    public String getActualNormalSuffix() {
+        return actualNormalSuffix;
+    }
+
+    public short getFormMorphInfo() {
+        return formMorphInfo;
+    }
+
+    public short getNormalFormMorphInfo() {
+        return normalFormMorphInfo;
+    }
+
+    /** Two rules are equal when all four components are equal. */
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+
+        Heuristic other = (Heuristic) o;
+
+        if (actualSuffixLengh != other.actualSuffixLengh) return false;
+        if (formMorphInfo != other.formMorphInfo) return false;
+        if (normalFormMorphInfo != other.normalFormMorphInfo) return false;
+        if (actualNormalSuffix == null) {
+            return other.actualNormalSuffix == null;
+        }
+        return actualNormalSuffix.equals(other.actualNormalSuffix);
+    }
+
+    @Override
+    public int hashCode() {
+        int result = (int) actualSuffixLengh;
+        result = 31 * result + (actualNormalSuffix != null ? actualNormalSuffix.hashCode() : 0);
+        result = 31 * result + (int) formMorphInfo;
+        result = 31 * result + (int) normalFormMorphInfo;
+        return result;
+    }
+
+    /** Returns the pipe-separated form accepted by {@link #Heuristic(String)}. */
+    @Override
+    public String toString() {
+        return "" + actualSuffixLengh + "|" + actualNormalSuffix + "|" + formMorphInfo + "|" + normalFormMorphInfo;
+    }
+}
--- a/morph/src/main/java/org/apache/lucene/morphology/LetterDecoderEncoder.java
+++ b/morph/src/main/java/org/apache/lucene/morphology/LetterDecoderEncoder.java
@ -0,0 +1,31 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.morphology;
+
+
+/**
+ * Conversion between strings and integer codes, plus character validation.
+ * <p>
+ * No implementation is visible next to this interface; the per-method notes
+ * below are inferred from signatures and call sites only and should be
+ * confirmed against the concrete implementation.
+ */
+public interface LetterDecoderEncoder {
+    /** Encodes a whole string as a single code (presumably the inverse of {@link #decode(Integer)} - confirm). */
+    Integer encode(String string);
+
+    /** Encodes a string as an array of codes; {@code Morph} uses the result as a search key. */
+    int[] encodeToArray(String s);
+
+    /** Turns an array of codes back into a string (presumably the inverse of {@link #encodeToArray(String)} - confirm). */
+    String decodeArray(int[] array);
+
+    /** Turns a single code back into a string. */
+    String decode(Integer suffixN);
+
+    /** Tests a single character; the name suggests "is this character supported" - confirm. */
+    boolean checkCharacter(char c);
+
+    /** Returns a cleaned variant of the given string; the exact clean-up is implementation-defined. */
+    String cleanString(String s);
+}
--- a/morph/src/main/java/org/apache/lucene/morphology/LuceneMorph.java
+++ b/morph/src/main/java/org/apache/lucene/morphology/LuceneMorph.java
@ -0,0 +1,74 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.morphology;
+
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+
+/**
+ * {@link Morph} variant that returns only the transformed words, without the
+ * grammar information the base class appends, and that drops rules producing
+ * the same transformation while the rule table is loaded.
+ * <p>
+ * The encoder is the {@code decoderEncoder} field inherited from {@link Morph};
+ * this class deliberately does not redeclare it, because a second field with the
+ * same name would hide the inherited one and stay {@code null} even after the
+ * base field has been assigned.
+ */
+public class LuceneMorph extends Morph {
+
+    /**
+     * Loads the dictionary from the given file.
+     *
+     * @param fileName path of the file in the format written by {@link Morph#writeToFile(String)}
+     * @throws IOException if the file cannot be read
+     */
+    public LuceneMorph(String fileName) throws IOException {
+        super(fileName);
+    }
+
+    /**
+     * Returns the words obtained by applying every rule selected for {@code s}.
+     *
+     * @param s the word to analyse
+     * @return one transformed word per applicable rule, in rule order
+     */
+    @Override
+    public List<String> getMorhInfo(String s) {
+        List<String> result = new ArrayList<String>();
+        int[] ints = decoderEncoder.encodeToArray(revertWord(s));
+        int ruleId = findRuleId(ints);
+        for (Heuristic h : rules[rulesId[ruleId]]) {
+            result.add(h.transofrmWord(s));
+        }
+        return result;
+    }
+
+    /**
+     * Reads the rule table and removes rules that would yield the same word.
+     * <p>
+     * This method is reached from the {@link Morph} constructor (through
+     * {@code readFromFile}), i.e. before this subclass is initialised, so it must
+     * not depend on fields declared in this class.
+     *
+     * @param bufferedReader reader positioned at the rule-count line
+     * @throws IOException if reading fails
+     */
+    @Override
+    protected void readRules(BufferedReader bufferedReader) throws IOException {
+        int amount = Integer.parseInt(bufferedReader.readLine());
+        rules = new Heuristic[amount][];
+        for (int i = 0; i < amount; i++) {
+            int ruleLength = Integer.parseInt(bufferedReader.readLine());
+            Heuristic[] heuristics = new Heuristic[ruleLength];
+            for (int j = 0; j < ruleLength; j++) {
+                heuristics[j] = new Heuristic(bufferedReader.readLine());
+            }
+            rules[i] = modeifyHeuristic(heuristics);
+        }
+    }
+
+
+    /**
+     * Keeps the first rule for every distinct (suffix length, normal suffix) pair;
+     * later rules with the same pair produce the same word and are dropped.
+     */
+    private Heuristic[] modeifyHeuristic(Heuristic[] heuristics) {
+        List<Heuristic> result = new ArrayList<Heuristic>();
+        for (Heuristic heuristic : heuristics) {
+            if (!containsSameTransformation(result, heuristic)) {
+                result.add(heuristic);
+            }
+        }
+        return result.toArray(new Heuristic[result.size()]);
+    }
+
+    /** Tells whether {@code kept} already holds a rule with the same suffix length and normal suffix. */
+    private static boolean containsSameTransformation(List<Heuristic> kept, Heuristic candidate) {
+        for (Heuristic existing : kept) {
+            if (existing.getActualSuffixLengh() == candidate.getActualSuffixLengh()
+                    && existing.getActualNormalSuffix().equals(candidate.getActualNormalSuffix())) {
+                return true;
+            }
+        }
+        return false;
+    }
+}
--- a/morph/src/main/java/org/apache/lucene/morphology/Morph.java
+++ b/morph/src/main/java/org/apache/lucene/morphology/Morph.java
@ -0,0 +1,199 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.morphology;
+
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+
+/**
+ * Suffix dictionary: a sorted table of encoded word endings ("separators"), each
+ * mapped through {@code rulesId} to a group of {@link Heuristic} rules, plus a
+ * table of grammar descriptions.
+ * <p>
+ * File layout used by {@link #writeToFile(String)} and {@link #readFromFile(String)},
+ * one value per line:
+ * <ol>
+ * <li>number of separators; then for each separator its length followed by its values;</li>
+ * <li>one rule-group id per separator;</li>
+ * <li>number of rule groups; then for each group its size followed by one rule per line;</li>
+ * <li>number of grammar descriptions followed by the descriptions.</li>
+ * </ol>
+ * NOTE(review): files are read and written with {@link FileReader}/{@link FileWriter},
+ * which use the platform default charset, so a file holding non-ASCII text is only
+ * portable between machines sharing that default.
+ */
+public class Morph {
+    protected int[][] separators;
+    protected short[] rulesId;
+    protected Heuristic[][] rules;
+    protected String[] grammaInfo;
+    /** Not assigned by any constructor of this class; must be set before {@link #getMorhInfo(String)} is called. */
+    LetterDecoderEncoder decoderEncoder;
+
+
+    /**
+     * Loads the dictionary from a file.
+     *
+     * @param fileName path of a file in the layout described on this class
+     * @throws IOException if the file cannot be read
+     */
+    public Morph(String fileName) throws IOException {
+        readFromFile(fileName);
+    }
+
+    /** Creates a dictionary from already built tables; the arrays are stored as given, not copied. */
+    public Morph(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) {
+        this.separators = separators;
+        this.rulesId = rulesId;
+        this.rules = rules;
+        this.grammaInfo = grammaInfo;
+    }
+
+    public int[][] getSeparators() {
+        return separators;
+    }
+
+    public short[] getRulesId() {
+        return rulesId;
+    }
+
+    public Heuristic[][] getRules() {
+        return rules;
+    }
+
+    public String[] getGrammaInfo() {
+        return grammaInfo;
+    }
+
+    /**
+     * Applies every rule selected for the word and pairs each result with the
+     * grammar description of the rule.
+     *
+     * @param s the word to analyse
+     * @return entries of the form {@code transformedWord|grammarInfo}, in rule order
+     */
+    public List<String> getMorhInfo(String s) {
+        List<String> result = new ArrayList<String>();
+        int[] ints = decoderEncoder.encodeToArray(revertWord(s));
+        int ruleId = findRuleId(ints);
+        for (Heuristic h : rules[rulesId[ruleId]]) {
+            result.add(h.transofrmWord(s) + "|" + grammaInfo[h.getFormMorphInfo()]);
+        }
+        return result;
+    }
+
+    /**
+     * Binary search over {@code separators} for the entry at or immediately below
+     * {@code ints} in the order defined by {@link #compareToInts(int[], int[])}.
+     *
+     * @param ints encoded key
+     * @return index into {@code separators}; {@code -1} when the key orders before
+     *         the first separator
+     */
+    protected int findRuleId(int[] ints) {
+        int low = 0;
+        int high = separators.length - 1;
+        int mid = 0;
+        while (low <= high) {
+            mid = (low + high) >>> 1;
+            int[] midVal = separators[mid];
+
+            int comResult = compareToInts(ints, midVal);
+            if (comResult > 0)
+                low = mid + 1;
+            else if (comResult < 0)
+                high = mid - 1;
+            else
+                break;
+        }
+        // Without an exact hit the last probe is either the floor entry or the one right above it.
+        if (compareToInts(ints, separators[mid]) >= 0) {
+            return mid;
+        } else {
+            return mid - 1;
+        }
+
+    }
+
+    /**
+     * Compares element by element; when one array is a prefix of the other the
+     * result is {@code i2.length - i1.length}, so the longer array orders first.
+     */
+    private int compareToInts(int[] i1, int[] i2) {
+        int minLength = Math.min(i1.length, i2.length);
+        for (int i = 0; i < minLength; i++) {
+            int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1);
+            if (i3 != 0) return i3;
+        }
+        return i2.length - i1.length;
+    }
+
+    /**
+     * Writes all tables to a file in the layout described on this class.
+     * <p>
+     * One rule-group id is written per element of {@code rulesId}, while reading
+     * expects exactly one per separator, so both arrays must have the same length
+     * for the file to be readable again.
+     *
+     * @param fileName path of the file to create or overwrite
+     * @throws IOException if writing fails
+     */
+    public void writeToFile(String fileName) throws IOException {
+        FileWriter writer = new FileWriter(fileName);
+        try {
+            writer.write(separators.length + "\n");
+            for (int[] i : separators) {
+                writer.write(i.length + "\n");
+                for (int j : i) {
+                    writer.write(j + "\n");
+                }
+            }
+            for (short i : rulesId) {
+                writer.write(i + "\n");
+            }
+            writer.write(rules.length + "\n");
+            for (Heuristic[] heuristics : rules) {
+                writer.write(heuristics.length + "\n");
+                for (Heuristic heuristic : heuristics) {
+                    writer.write(heuristic.toString() + "\n");
+                }
+            }
+            writer.write(grammaInfo.length + "\n");
+            for (String s : grammaInfo) {
+                writer.write(s + "\n");
+            }
+        } finally {
+            // Release the file handle even when a write fails half-way.
+            writer.close();
+        }
+    }
+
+    /**
+     * Replaces all tables with the content of a file in the layout described on this class.
+     *
+     * @param fileName path of the file to read
+     * @throws IOException           if reading fails
+     * @throws NumberFormatException if a numeric line is malformed or the file ends early
+     */
+    public void readFromFile(String fileName) throws IOException {
+        BufferedReader bufferedReader = new BufferedReader(new FileReader(fileName));
+        try {
+            int amount = Integer.parseInt(bufferedReader.readLine());
+
+            readSeparators(bufferedReader, amount);
+
+            readRulesId(bufferedReader, amount);
+
+            readRules(bufferedReader);
+            readGrammaInfo(bufferedReader);
+        } finally {
+            // Release the file handle even when parsing fails half-way.
+            bufferedReader.close();
+        }
+    }
+
+    /** Reads the grammar-description count followed by that many lines. */
+    private void readGrammaInfo(BufferedReader bufferedReader) throws IOException {
+        int amount = Integer.parseInt(bufferedReader.readLine());
+        grammaInfo = new String[amount];
+        for (int i = 0; i < amount; i++) {
+            grammaInfo[i] = bufferedReader.readLine();
+        }
+    }
+
+    /**
+     * Reads the rule-group count, then for every group its size and its rules.
+     * Called while the constructor {@link #Morph(String)} is running, so overriding
+     * implementations must not rely on subclass state.
+     *
+     * @param bufferedReader reader positioned at the rule-group count line
+     * @throws IOException if reading fails
+     */
+    protected void readRules(BufferedReader bufferedReader) throws IOException {
+        int amount = Integer.parseInt(bufferedReader.readLine());
+        rules = new Heuristic[amount][];
+        for (int i = 0; i < amount; i++) {
+            int ruleLength = Integer.parseInt(bufferedReader.readLine());
+            rules[i] = new Heuristic[ruleLength];
+            for (int j = 0; j < ruleLength; j++) {
+                rules[i][j] = new Heuristic(bufferedReader.readLine());
+            }
+        }
+    }
+
+    /** Reads {@code amount} rule-group ids, one per line. */
+    private void readRulesId(BufferedReader bufferedReader, int amount) throws IOException {
+        rulesId = new short[amount];
+        for (int i = 0; i < amount; i++) {
+            rulesId[i] = Short.parseShort(bufferedReader.readLine());
+        }
+    }
+
+    /** Reads {@code amount} separators, each stored as its length followed by its values. */
+    private void readSeparators(BufferedReader bufferedReader, int amount) throws IOException {
+        separators = new int[amount][];
+        for (int i = 0; i < amount; i++) {
+            int wordLength = Integer.parseInt(bufferedReader.readLine());
+            separators[i] = new int[wordLength];
+            for (int j = 0; j < wordLength; j++) {
+                separators[i][j] = Integer.parseInt(bufferedReader.readLine());
+            }
+        }
+    }
+
+    /**
+     * Returns the characters of {@code s} in reverse order (char by char, so the
+     * word ending comes first in the search key).
+     */
+    protected String revertWord(String s) {
+        // StringBuilder instead of repeated String concatenation, which is quadratic.
+        StringBuilder reversed = new StringBuilder(s.length());
+        for (int i = s.length() - 1; i >= 0; i--) {
+            reversed.append(s.charAt(i));
+        }
+        return reversed.toString();
+    }
+}
--- a/morph/src/main/java/org/apache/lucene/morphology/SuffixToLongException.java
+++ b/morph/src/main/java/org/apache/lucene/morphology/SuffixToLongException.java
@ -0,0 +1,28 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.morphology;
+
+
+/**
+ * Unchecked exception type of the morphology package. It is not thrown by any
+ * code visible alongside it; judging by the name it reports a suffix that
+ * exceeds a supported length - confirm at the throw sites.
+ */
+public class SuffixToLongException extends RuntimeException {
+
+    /** Creates the exception without a detail message. */
+    public SuffixToLongException() {
+        super();
+    }
+
+    /**
+     * Creates the exception with a detail message.
+     *
+     * @param message text later returned by {@link #getMessage()}
+     */
+    public SuffixToLongException(String message) {
+        super(message);
+    }
+}
--- a/morph/src/main/java/org/apache/lucene/morphology/WrongCharaterException.java
+++ b/morph/src/main/java/org/apache/lucene/morphology/WrongCharaterException.java
@ -0,0 +1,27 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.morphology;
+
+
+/**
+ * Unchecked exception type of the morphology package. It is not thrown by any
+ * code visible alongside it; judging by the name it reports a character that
+ * cannot be handled - confirm at the throw sites.
+ */
+public class WrongCharaterException extends RuntimeException {
+
+    /** Creates the exception without a detail message. */
+    public WrongCharaterException() {
+        super();
+    }
+
+    /**
+     * Creates the exception with a detail message.
+     *
+     * @param message text later returned by {@link #getMessage()}
+     */
+    public WrongCharaterException(String message) {
+        super(message);
+    }
+}
--- a/morph/src/main/java/org/apache/lucene/morphology/analayzer/RussianMorphlogyAnalayzer.java
+++ b/morph/src/main/java/org/apache/lucene/morphology/analayzer/RussianMorphlogyAnalayzer.java
@ -0,0 +1,42 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.morphology.analayzer;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.morphology.LuceneMorph;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * Analyzer that tokenizes with {@link StandardTokenizer}, applies
+ * {@link StandardFilter} and {@link LowerCaseFilter}, and finally passes the
+ * tokens through {@link RussianMorphlogyFilter}.
+ */
+public class RussianMorphlogyAnalayzer extends Analyzer {
+    /** Dictionary shared by every token stream this analyzer creates. */
+    private LuceneMorph luceneMorph;
+
+    /**
+     * Loads the dictionary from the file {@code sep.txt}, resolved against the
+     * current working directory.
+     *
+     * @throws IOException if the dictionary file cannot be read
+     */
+    public RussianMorphlogyAnalayzer() throws IOException {
+        this.luceneMorph = new LuceneMorph("sep.txt");
+    }
+
+    /**
+     * Builds the token stream for a field.
+     *
+     * @param fieldName name of the field being analysed (not used)
+     * @param reader    source of the text
+     * @return the filter chain described on this class
+     */
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+        TokenStream tokenizer = new StandardTokenizer(reader);
+        TokenStream lowerCased = new LowerCaseFilter(new StandardFilter(tokenizer));
+        return new RussianMorphlogyFilter(lowerCased, luceneMorph);
+    }
+}
--- a/morph/src/main/java/org/apache/lucene/morphology/analayzer/RussianMorphlogyFilter.java
+++ b/morph/src/main/java/org/apache/lucene/morphology/analayzer/RussianMorphlogyFilter.java
@ -0,0 +1,83 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.morphology.analayzer;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.morphology.LuceneMorph;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+
+public class RussianMorphlogyFilter extends TokenFilter {
+    private LuceneMorph luceneMorph;
+
+    public RussianMorphlogyFilter(TokenStream tokenStream, LuceneMorph luceneMorph) {
+        super(tokenStream);
+        this.luceneMorph = luceneMorph;
+    }
+
+
+    private List<String> stack = new ArrayList<String>();
+    private int index = 0;
+    private Token current = null;
+
+    /**
+     * Returns the next token in the stream, or null at EOS.
+     */
+    public Token next(final Token reusableToken) throws IOException {
+        assert reusableToken != null;
+        while (index < stack.size()) { // pop from stack
+            Token nextToken = createToken(stack.get(index++), current, reusableToken);
+            if (nextToken != null) {
+                return nextToken;
+            }
+        }
+
+        Token nextToken = input.next(reusableToken);
+        if (nextToken == null) return null; // EOS; iterator exhausted
+        Character testC = nextToken.term().charAt(0);
+        if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC) {
+            return nextToken;
+        }
+        stack = luceneMorph.getMorhInfo(nextToken.term());
+        index = 0;
+        current = (Token) nextToken.clone();
+        nextToken = createToken(stack.get(index++), current, reusableToken);
+        return nextToken;
+    }
+
+    /**
+     * Creates and returns a token for the given synonym of the current input
+     * token; Override for custom (stateless or stateful) behavior, if desired.
+     *
+     * @param synonym       a synonym for the current token's term
+     * @param current       the current token from the underlying child stream
+     * @param reusableToken the token to reuse
+     * @return a new token, or null to indicate that the given synonym should be
+     *         ignored
+     */
+    protected Token createToken(String synonym, Token current, final Token reusableToken) {
+        reusableToken.reinit(current, synonym);
+        reusableToken.setTermBuffer(synonym);
+        reusableToken.setPositionIncrement(0);
+        return reusableToken;
+    }
+}
--- a/morph/src/main/java/org/apache/lucene/morphology/analayzer/russian-text.txt
+++ b/morph/src/main/java/org/apache/lucene/morphology/analayzer/russian-text.txt
@ -0,0 +1,8 @@
+пушке А бутявка волит за напушкой Сяпала Калуша по напушке и увазила бутявку И волит Калушата калушаточки Бутявка Калушата присяпали и бутявку стрямкали И подудонились А Калуша волит Бутявка то некузявая Калушата бутявку вычучили Бутявка вздребезнулась сопритюкнулась и усяпала с напушки
+А Калуша волит:
+— Бутявок не трямкают. Бутявки дюбые и зюмо-зюмо некузявые. От бутявок дудонятся.
+А бутявка волит за напушкой:
+— Калушата подудонились! Калушата подудонились! Зюмо некузявые! Пуськи бятые!
+В условиях нарастающей пурги было сделано 4 успешных захода на посадку. "Все нормально, будем рекомендовать систему к внедрению".
+Рейсы из Кейптауна (ЮАР) на станцию "Новолазаревская" (Антарктида) совершаются
+примерно один раз в две недели. вина твоя вина мне
--- a/morph/src/main/java/org/apache/lucene/morphology/analayzer/suffix-heuristic-test-data.txt
+++ b/morph/src/main/java/org/apache/lucene/morphology/analayzer/suffix-heuristic-test-data.txt
@ -0,0 +1,9 @@
+шел идти
+турестических турестический
+отзывы отзыв
+победы победа
+поэтическая поэтический
+произошло произойти
+test test
+ананасов ананас
+встовашего встовать
--- a/morph/src/main/java/org/apache/lucene/morphology/analayzer/token-of-russian-text.txt
+++ b/morph/src/main/java/org/apache/lucene/morphology/analayzer/token-of-russian-text.txt
@ -0,0 +1,33 @@
+в
+условие
+нарастать
+пурга
+быть
+сделать
+4
+успешный
+заход
+на
+посадка
+весь
+нормальный
+быть
+рекомендовать
+система
+к
+внедрение
+рейс
+из
+кейптаун
+юар
+на
+станция
+новолазаревский
+антарктида
+совершаться
+примерно
+один
+раз
+в
+два
+неделя