Rollback of the wrong version of morphology; adding an interface for morphology

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@88 d817d54c-26ab-11de-abc9-2f7d1455ff7a
2009-11-17 14:03:59 +00:00
parent 16613c543b
commit 1273cf96ed
19 changed files with 263 additions and 1145 deletions
@@ -31,9 +31,9 @@ import java.util.*;
 public class DictonaryReader {
    private String fileName;
    private String fileEncoding = "windows-1251";
-    protected List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
+    private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
-    protected List<List<String>> wordPrefixes = new ArrayList<List<String>>();
+    private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
-    protected Set<String> ingnoredForm = new HashSet<String>();
+    private Set<String> ingnoredForm = new HashSet<String>();
    public DictonaryReader(String fileName, Set<String> ingnoredForm) {
        this.fileName = fileName;
@@ -57,7 +57,7 @@ public class DictonaryReader {
    }
-    protected void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
+    private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
        String s = reader.readLine();
        int count = Integer.valueOf(s);
        for (int i = 0; i < count; i++) {
@@ -81,7 +81,7 @@ public class DictonaryReader {
    }
-    protected void sckipBlock(BufferedReader reader) throws IOException {
+    private void sckipBlock(BufferedReader reader) throws IOException {
        String s = reader.readLine();
        int count = Integer.valueOf(s);
        for (int i = 0; i < count; i++) {
@@ -90,7 +90,7 @@ public class DictonaryReader {
    }
-    protected void readPrefix(BufferedReader reader) throws IOException {
+    private void readPrefix(BufferedReader reader) throws IOException {
        String s = reader.readLine();
        int count = Integer.valueOf(s);
        for (int i = 0; i < count; i++) {
@@ -99,7 +99,7 @@ public class DictonaryReader {
        }
    }
-    protected  void readFlexias(BufferedReader reader) throws IOException {
+    private void readFlexias(BufferedReader reader) throws IOException {
        String s = reader.readLine();
        int count = Integer.valueOf(s);
        for (int i = 0; i < count; i++) {
@@ -112,7 +112,7 @@ public class DictonaryReader {
        }
    }
-    protected  void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
+    private void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
        String[] fl = line.split("\\*");
        // we inored all forms thats
        if (fl.length == 3) {
@@ -60,28 +60,6 @@ public class FlexiaModel {
    @Override
    public String toString() {
-        return prefix + " " + suffix + " " + code;
+        return prefix + " " + suffix;
    }
    /**
     * Two models are equal when they are of exactly the same class and their
     * code, prefix and suffix are pairwise equal (null matches only null).
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        FlexiaModel other = (FlexiaModel) o;
        // Same comparison order as before: code, then prefix, then suffix.
        return (code == null ? other.code == null : code.equals(other.code))
                && (prefix == null ? other.prefix == null : prefix.equals(other.prefix))
                && (suffix == null ? other.suffix == null : suffix.equals(other.suffix));
    }
    /**
     * Combines the hashes of code, suffix and prefix (in that order) with the
     * multiplier 31; a null field contributes 0.
     */
    @Override
    public int hashCode() {
        int hash = 0;
        // Starting from 0 makes the first step yield exactly the hash of code.
        for (Object part : new Object[]{code, suffix, prefix}) {
            hash = 31 * hash + (part == null ? 0 : part.hashCode());
        }
        return hash;
    }
 }
@@ -1,139 +0,0 @@
 /**
 * Copyright 2009 Alexander Kuznetsov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.morphology.dictionary;
 import org.apache.lucene.morphology.PrefixRule;
 import java.util.*;
 import java.io.*;
 public class PrefixesRulesBuilder extends DictonaryReader {
    private GrammaReader grammaInfo;
    private Map<FlexiaModel,Set<FlexiaModel>> rules = new HashMap<FlexiaModel,Set<FlexiaModel>>();
    public PrefixesRulesBuilder(String fileName, String fileEncoding, Set<String> ingnoredForm) throws IOException {
        super(fileName, fileEncoding, ingnoredForm);
        grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
    }
    @Override
    public void proccess(WordProccessor wordProccessor) throws IOException {
        super.proccess(wordProccessor);
        System.out.println(rules.size());
        System.out.println(rules);
    }
    public List<PrefixRule> getPrefixRules(){
        List<PrefixRule> prefixRules = new ArrayList<PrefixRule>();
        for(FlexiaModel key:rules.keySet()){
            PrefixRule prefixRule = new PrefixRule();
            prefixRule.setPrefix(key.getPrefix());
            prefixRule.setLastLetter(key.getSuffix().charAt(0));
            HashSet<Short> map = new HashSet<Short>();
            for(FlexiaModel fm:rules.get(key)){
                int gi = grammaInfo.getGrammInversIndex().get(fm.getCode());
                map.add((short) gi);
            }
            prefixRule.setForms(map);
            prefixRules.add(prefixRule);
        }
        return prefixRules;
    }
    @Override
    protected void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
        sckipBlock(reader);
    }
    @Override
    protected void readPrefix(BufferedReader reader) throws IOException {
        sckipBlock(reader);
    }
    @Override
    protected void readFlexias(BufferedReader reader) throws IOException {
        super.readFlexias(reader);
        //todo research flesias
        for(List<FlexiaModel> fmList:wordsFlexias){
            research(fmList);
        }
    }
    private void research(List<FlexiaModel> models) {
        for(FlexiaModel fm:models){
            if(fm.getPrefix().length() > 0){
                testFlexia(models, fm);
            }
        }
    }
    private void testFlexia(List<FlexiaModel> models, FlexiaModel fm) {
        for(FlexiaModel com:models){
            if(com.getSuffix().equals(fm.getSuffix()) && com.getPrefix().length() == 0){
                Set<FlexiaModel> models1 = rules.get(convertForKey(fm));
                if(models1 == null){
                    models1 = new HashSet<FlexiaModel>();
                    rules.put(convertForKey(fm),models1);
                }
                models1.add(convert(com));
            }
        }
    }
    private FlexiaModel convert(FlexiaModel fm){
        String suf = fm.getSuffix();
        //if(suf.length() == 1) System.out.println(fm);
        return new FlexiaModel(fm.getCode(),""+ suf.charAt(suf.length()-1),fm.getPrefix());
    }
    private FlexiaModel convertForKey(FlexiaModel fm){
        String suf = fm.getSuffix();
        //if(suf.length() == 1) System.out.println(fm);
        return new FlexiaModel("pr",""+ suf.charAt(suf.length()-1),fm.getPrefix());
    }
    protected void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
        String[] fl = line.split("\\*");
        if (fl.length == 3) {
            flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
        }
        if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
    }
    public void savePrefixes(String fileName) throws IOException {
        OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8");
        List<PrefixRule> prefixRuleList = getPrefixRules();
        writer.write(prefixRuleList.size()+"\n");
        for(PrefixRule pr: prefixRuleList){
            writePrefixRule(writer, pr);
        }
        writer.close();
    }
    private void writePrefixRule(OutputStreamWriter writer, PrefixRule pr) throws IOException {
        writer.write(pr.getPrefix()+"\n");
        writer.write(pr.getLastLetter()+"\n");
        HashSet<Short> formInfo = pr.getForms();
        writer.write(formInfo.size()+"\n");
        for(Short s:formInfo){
            writer.write(s+"\n");
        }
    }
 }
@@ -19,7 +19,7 @@ package org.apache.lucene.morphology.dictionary;
 import org.apache.lucene.morphology.Heuristic;
 import org.apache.lucene.morphology.LetterDecoderEncoder;
-import org.apache.lucene.morphology.Morphology;
+import org.apache.lucene.morphology.MorphologyImpl;
 import java.io.IOException;
 import java.util.*;
@@ -119,7 +119,7 @@ public class StatiticsCollector implements WordProccessor {
                prevSet = currentSet;
            }
        }
-        Morphology morphology = new Morphology(ints, rulesId, heuristics, grammaReader.getGrammaInfoAsArray());
+        MorphologyImpl morphology = new MorphologyImpl(ints, rulesId, heuristics, grammaReader.getGrammaInfoAsArray());
        morphology.writeToFile(fileName);
    }
@@ -1,39 +0,0 @@
 /**
 * Copyright 2009 Alexander Kuznetsov 
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.morphology.generator;
 import org.apache.lucene.morphology.dictionary.*;
 import java.io.IOException;
 import java.util.HashSet;
 /**
  * Command-line entry point: reads the Russian source dictionary with a
  * {@link PrefixesRulesBuilder} and saves the resulting prefix rules into the
  * resources of the russian module.
  */
 public class RussianPrefixesBuilder {
    public static void main(String[] args) throws IOException {
        HashSet<String> noIgnoredForms = new HashSet<String>();
        PrefixesRulesBuilder builder = new PrefixesRulesBuilder("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", "windows-1251", noIgnoredForms);
        // Word cards are irrelevant here, so the processor does nothing with them.
        WordProccessor doNothing = new WordProccessor() {
            public void proccess(WordCard wordCard) throws IOException {
            }
        };
        builder.proccess(doNothing);
        builder.savePrefixes("russian/src/main/resources/org/apache/lucene/morphology/russian/prefixes.info");
    }
 }
@@ -1,372 +0,0 @@
 [ ть
  у
  ем
  ешь
  ете
  ет
  ут
  ла
  ло
  ли
  я
  ши
  ем
  емте
 по ай
  ь
 по айте
  ьте
  ущий
  ущего
  ущему
  ущего
  ущий
  ущим
  ущем
  ущая
  ущей
  ущей
  ущую
  ущей
  ущею
  ущей
  ущее
  ущего
  ущему
  ущее
  ущим
  ущем
  ущие
  ущих
  ущим
  ущих
  ущие
  ущими
  ущих
  ший
  шего
  шему
  шего
  ший
  шим
  шем
  шая
  шей
  шей
  шую
  шей
  шею
  шей
  шее
  шего
  шему
  шее
  шим
  шем
  шие
  ших
  шим
  ших
  шие
  шими
  ших]
 [ большой
  большого
  большому
  большого
  большой
  большим
  большом
  большая
  большой
  большой
  большую
  большой
  большою
  большой
  большое
  большого
  большому
  большое
  большим
  большом
  большие
  больших
  большим
  больших
  большие
  большими
  больших
  велик
  велика
  велико
  велики
  больше
 по больше
 наи больший
 наи большего
 наи большему
 наи большего
 наи больший
 наи большим
 наи большем
 наи большая
 наи большей
 наи большей
 наи большую
 наи большей
 наи большею
 наи большей
 наи большее
 наи большего
 наи большему
 наи большее
 наи большим
 наи большем
 наи большие
 наи больших
 наи большим
 наи больших
 наи большие
 наи большими
 наи больших]
 [ вероятный
  вероятного
  вероятному
  вероятного
  вероятный
  вероятным
  вероятном
  вероятная
  вероятной
  вероятной
  вероятную
  вероятной
  вероятною
  вероятной
  вероятное
  вероятного
  вероятному
  вероятное
  вероятным
  вероятном
  вероятные
  вероятных
  вероятным
  вероятных
  вероятные
  вероятными
  вероятных
  вероятен
  вероятна
  вероятно
  вероятны
  вероятнее
  вероятней
 по вероятнее
 по вероятней
  вероятнейший
 наи невероятнейший
  вероятнейшего
 наи невероятнейшего
  вероятнейшему
 наи невероятнейшему
  вероятнейшего
 наи невероятнейшего
  вероятнейший
 наи невероятнейший
  вероятнейшим
 наи невероятнейшим
  вероятнейшем
 наи невероятнейшем
  вероятнейшая
 наи невероятнейшая
  вероятнейшей
 наи невероятнейшей
  вероятнейшей
 наи невероятнейшей
  вероятнейшую
 наи невероятнейшую
  вероятнейшей
  вероятнейшею
 наи невероятнейшей
 наи невероятнейшею
  вероятнейшей
 наи невероятнейшей
  вероятнейшее
 наи невероятнейшее
  вероятнейшего
 наи невероятнейшего
  вероятнейшему
 наи невероятнейшему
  вероятнейшее
 наи невероятнейшее
  вероятнейшим
 наи невероятнейшим
  вероятнейшем
 наи невероятнейшем
  вероятнейшие
 наи невероятнейшие
  вероятнейших
 наи невероятнейших
  вероятнейшим
 наи невероятнейшим
  вероятнейших
 наи невероятнейших
  вероятнейшие
 наи невероятнейшие
  вероятнейшими
 наи невероятнейшими
  вероятнейших
 наи невероятнейших]
 [ аленький
  аленького
  аленькому
  аленького
  аленький
  аленьким
  аленьком
  аленькая
  аленькой
  аленькой
  аленькую
  аленькой
  аленькою
  аленькой
  аленькое
  аленького
  аленькому
  аленькое
  аленьким
  аленьком
  аленькие
  аленьких
  аленьким
  аленьких
  аленькие
  аленькими
  аленьких
  ал
  ала
  ало
  алы
  еньше
 по еньше
  алейший
 наи еньший
  алейшего
 наи еньшего
  алейшему
 наи еньшему
  алейшего
 наи еньшего
  алейший
 наи еньший
  алейшим
 наи еньшим
  алейшем
 наи еньшем
  алейшая
 наи еньшая
  алейшей
 наи еньшей
  алейшей
 наи еньшей
  алейшую
 наи еньшую
  алейшей
  алейшею
 наи еньшей
 наи еньшею
  алейшей
 наи еньшей
  алейшее
 наи еньшее
  алейшего
 наи еньшего
  алейшему
 наи еньшему
  алейшее
 наи еньшее
  алейшим
 наи еньшим
  алейшем
 наи еньшем
  алейшие
 наи еньшие
  алейших
 наи еньших
  алейшим
 наи еньшим
  алейших
 наи еньших
  алейшие
 наи еньшие
  алейшими
 наи еньшими
  алейших
 наи еньших]
 [ ьный
  ьного
  ьному
  ьного
  ьный
  ьным
  ьном
  ьная
  ьной
  ьной
  ьную
  ьной
  ьною
  ьной
  ьное
  ьного
  ьному
  ьное
  ьным
  ьном
  ьные
  ьных
  ьным
  ьных
  ьные
  ьными
  ьных
  ен
  ьна
  ьно
  ьны
  ьны
  ьнее
  ьней
 по ьнее
 по ьней
 наи ьнейший
 наи ьнейшего
 наи ьнейшему
 наи ьнейшего
 наи ьнейший
 наи ьнейшим
 наи ьнейшем
 наи ьнейшая
 наи ьнейшей
 наи ьнейшей
 наи ьнейшую
 наи ьнейшей
 наи ьнейшею
 наи ьнейшей
 наи ьнейшее
 наи ьнейшего
 наи ьнейшему
 наи ьнейшее
 наи ьнейшим
 наи ьнейшем
 наи ьнейшие
 наи ьнейших
 наи ьнейшим
 наи ьнейших
 наи ьнейшие
 наи ьнейшими
 наи ьнейших]
@@ -15,12 +15,12 @@
 */
 package org.apache.lucene.morphology.english;
-import org.apache.lucene.morphology.Morphology;
+import org.apache.lucene.morphology.MorphologyImpl;
 import java.io.IOException;
-public class EnglishMorphology extends Morphology {
+public class EnglishMorphology extends MorphologyImpl {
    public EnglishMorphology() throws IOException {
        super(EnglishLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder());
@@ -23,7 +23,7 @@ import java.util.ArrayList;
 import java.util.List;
-public class LuceneMorphology extends MorphologyWithPrefix {
+public class LuceneMorphology extends MorphologyImpl {
    public LuceneMorphology(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException {
        super(fileName, decoderEncoder);
@@ -33,13 +33,15 @@ public class LuceneMorphology extends MorphologyWithPrefix {
        super(inputStream, decoderEncoder);
    }
    public LuceneMorphology(InputStream morphFormInputStream, InputStream prefixesInputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
        super(morphFormInputStream, prefixesInputStream, decoderEncoder);
    }
    @Override
-    protected String createForm(String form, String grammaInfo) {
+    public List<String> getMorhInfo(String s) {
-        return form;
+        ArrayList<String> result = new ArrayList<String>();
        int[] ints = decoderEncoder.encodeToArray(revertWord(s));
        int ruleId = findRuleId(ints);
        for (Heuristic h : rules[rulesId[ruleId]]) {
            result.add(h.transofrmWord(s));
        }
        return result;
    }
    protected void readRules(BufferedReader bufferedReader) throws IOException {
@@ -15,200 +15,11 @@
 */
 package org.apache.lucene.morphology;
 import java.io.*;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
-public class Morphology {
+public interface Morphology {
    protected int[][] separators;
    protected short[] rulesId;
    protected Heuristic[][] rules;
    protected String[] grammaInfo;
    protected LetterDecoderEncoder decoderEncoder;
-
+    List<String> getMorhInfo(String s);
    public Morphology(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException {
        readFromFile(fileName);
        this.decoderEncoder = decoderEncoder;
    }
    public Morphology(InputStream inputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
        readFromInputStream(inputStream);
        this.decoderEncoder = decoderEncoder;
    }
    public Morphology(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) {
        this.separators = separators;
        this.rulesId = rulesId;
        this.rules = rules;
        this.grammaInfo = grammaInfo;
    }
    public int[][] getSeparators() {
        return separators;
    }
    public short[] getRulesId() {
        return rulesId;
    }
    public Heuristic[][] getRules() {
        return rules;
    }
    public String[] getGrammaInfo() {
        return grammaInfo;
    }
    public List<String> getMorhInfo(String s) {
        ArrayList<String> result = new ArrayList<String>();
        int[] ints = decoderEncoder.encodeToArray(revertWord(s));
        int ruleId = findRuleId(ints);
        for (Heuristic h : rules[rulesId[ruleId]]) {
            result.add(createForm(h.transofrmWord(s),grammaInfo[h.getFormMorphInfo()]));
        }
        return result;
    }
    protected String createForm(String form,String grammaInfo){
        return form+"|"+grammaInfo;
    }
    protected int findRuleId(int[] ints) {
        int low = 0;
        int high = separators.length - 1;
        int mid = 0;
        while (low <= high) {
            mid = (low + high) >>> 1;
            int[] midVal = separators[mid];
            int comResult = compareToInts(ints, midVal);
            if (comResult > 0)
                low = mid + 1;
            else if (comResult < 0)
                high = mid - 1;
            else
                break;
        }
        if (compareToInts(ints, separators[mid]) >= 0) {
            return mid;
        } else {
            return mid - 1;
        }
 }
    private int compareToInts(int[] i1, int[] i2) {
        int minLength = Math.min(i1.length, i2.length);
        for (int i = 0; i < minLength; i++) {
            int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1);
            if (i3 != 0) return i3;
        }
        return i1.length - i2.length;
    }
    public void writeToFile(String fileName) throws IOException {
        OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8");
        writer.write(separators.length + "\n");
        for (int[] i : separators) {
            writer.write(i.length + "\n");
            for (int j : i) {
                writer.write(j + "\n");
            }
        }
        for (short i : rulesId) {
            writer.write(i + "\n");
        }
        writer.write(rules.length + "\n");
        for (Heuristic[] heuristics : rules) {
            writer.write(heuristics.length + "\n");
            for (Heuristic heuristic : heuristics) {
                writer.write(heuristic.toString() + "\n");
            }
        }
        writer.write(grammaInfo.length + "\n");
        for (String s : grammaInfo) {
            writer.write(s + "\n");
        }
        writer.close();
    }
    public void readFromFile(String fileName) throws IOException {
        FileInputStream inputStream = new FileInputStream(fileName);
        readFromInputStream(inputStream);
    }
    private void readFromInputStream(InputStream inputStream) throws IOException {
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
        String s = bufferedReader.readLine();
        Integer amount = Integer.valueOf(s);
        readSeparators(bufferedReader, amount);
        readRulesId(bufferedReader, amount);
        readRules(bufferedReader);
        readGrammaInfo(bufferedReader);
        bufferedReader.close();
    }
    private void readGrammaInfo(BufferedReader bufferedReader) throws IOException {
        String s;
        Integer amount;
        s = bufferedReader.readLine();
        amount = Integer.valueOf(s);
        grammaInfo = new String[amount];
        for (int i = 0; i < amount; i++) {
            grammaInfo[i] = bufferedReader.readLine();
        }
    }
    protected void readRules(BufferedReader bufferedReader) throws IOException {
        String s;
        Integer amount;
        s = bufferedReader.readLine();
        amount = Integer.valueOf(s);
        rules = new Heuristic[amount][];
        for (int i = 0; i < amount; i++) {
            String s1 = bufferedReader.readLine();
            Integer ruleLenght = Integer.valueOf(s1);
            rules[i] = new Heuristic[ruleLenght];
            for (int j = 0; j < ruleLenght; j++) {
                rules[i][j] = new Heuristic(bufferedReader.readLine());
            }
        }
    }
    private void readRulesId(BufferedReader bufferedReader, Integer amount) throws IOException {
        rulesId = new short[amount];
        for (int i = 0; i < amount; i++) {
            String s1 = bufferedReader.readLine();
            rulesId[i] = Short.valueOf(s1);
        }
    }
    private void readSeparators(BufferedReader bufferedReader, Integer amount) throws IOException {
        HashSet intetger = new HashSet<Integer>();
        separators = new int[amount][];
        for (int i = 0; i < amount; i++) {
            String s1 = bufferedReader.readLine();
            Integer wordLenght = Integer.valueOf(s1);
            separators[i] = new int[wordLenght];
            for (int j = 0; j < wordLenght; j++) {
                separators[i][j] = Integer.valueOf(bufferedReader.readLine());
            }
            intetger.add(separators[i][0]);
        }
    }
    protected String revertWord(String s) {
        String result = "";
        for (int i = 1; i <= s.length(); i++) {
            result += s.charAt(s.length() - i);
        }
        return result;
    }
 }
@@ -0,0 +1,210 @@
 /**
 * Copyright 2009 Alexander Kuznetsov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.morphology;
 import java.io.*;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
 /**
  * Array-backed implementation of {@link Morphology}.
  * <p>
  * A word is reversed, encoded into an int array and located among the sorted
  * {@code separators}. The separator's entry in {@code rulesId} selects the
  * heuristics in {@code rules} that are applied to the word.
  * <p>
  * Serialized form (UTF-8 text, one value per line): separator count; each
  * separator as its length followed by its ints; one rule id per separator;
  * rule count; each rule as its length followed by its heuristics; grammar info
  * count followed by the grammar info lines.
  */
 public class MorphologyImpl implements Morphology {
    protected int[][] separators;
    protected short[] rulesId;
    protected Heuristic[][] rules;
    protected String[] grammaInfo;
    protected LetterDecoderEncoder decoderEncoder;
    /**
     * Loads the morphology from a file.
     * Note: loading calls the overridable {@link #readRules(BufferedReader)}
     * from within the constructor.
     *
     * @throws IOException if the file cannot be read
     */
    public MorphologyImpl(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException {
        readFromFile(fileName);
        this.decoderEncoder = decoderEncoder;
    }
    /**
     * Loads the morphology from a stream; the stream is closed afterwards.
     *
     * @throws IOException if the stream cannot be read
     */
    public MorphologyImpl(InputStream inputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
        readFromInputStream(inputStream);
        this.decoderEncoder = decoderEncoder;
    }
    /** Creates a morphology from prepared tables; {@code decoderEncoder} stays unset. */
    public MorphologyImpl(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) {
        this.separators = separators;
        this.rulesId = rulesId;
        this.rules = rules;
        this.grammaInfo = grammaInfo;
    }
    public int[][] getSeparators() {
        return separators;
    }
    public short[] getRulesId() {
        return rulesId;
    }
    public Heuristic[][] getRules() {
        return rules;
    }
    public String[] getGrammaInfo() {
        return grammaInfo;
    }
    /**
     * Applies every heuristic selected for {@code s}.
     *
     * @return one entry per heuristic, formatted as
     *         {@code transformedWord + "|" + grammaInfo}
     */
    public List<String> getMorhInfo(String s) {
        List<String> result = new ArrayList<String>();
        int[] ints = decoderEncoder.encodeToArray(revertWord(s));
        int ruleId = findRuleId(ints);
        for (Heuristic h : rules[rulesId[ruleId]]) {
            result.add(h.transofrmWord(s) + "|" + grammaInfo[h.getFormMorphInfo()]);
        }
        return result;
    }
    /**
     * Binary search over {@code separators}.
     *
     * @return the index of the last probed separator if {@code ints} compares
     *         greater than or equal to it, otherwise that index minus one
     *         (which is -1 when the probe was index 0)
     */
    protected int findRuleId(int[] ints) {
        int low = 0;
        int high = separators.length - 1;
        int mid = 0;
        while (low <= high) {
            mid = (low + high) >>> 1;
            int comResult = compareToInts(ints, separators[mid]);
            if (comResult > 0) {
                low = mid + 1;
            } else if (comResult < 0) {
                high = mid - 1;
            } else {
                break;
            }
        }
        if (compareToInts(ints, separators[mid]) >= 0) {
            return mid;
        } else {
            return mid - 1;
        }
    }
    /** Lexicographic comparison; when one array is a prefix of the other, the shorter one is smaller. */
    private int compareToInts(int[] i1, int[] i2) {
        int minLength = Math.min(i1.length, i2.length);
        for (int i = 0; i < minLength; i++) {
            int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1);
            if (i3 != 0) {
                return i3;
            }
        }
        return i1.length - i2.length;
    }
    /**
     * Writes the tables to {@code fileName} in the serialized form described on
     * the class.
     *
     * @throws IOException if the file cannot be opened or written
     */
    public void writeToFile(String fileName) throws IOException {
        OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8");
        try {
            writer.write(separators.length + "\n");
            for (int[] i : separators) {
                writer.write(i.length + "\n");
                for (int j : i) {
                    writer.write(j + "\n");
                }
            }
            for (short i : rulesId) {
                writer.write(i + "\n");
            }
            writer.write(rules.length + "\n");
            for (Heuristic[] heuristics : rules) {
                writer.write(heuristics.length + "\n");
                for (Heuristic heuristic : heuristics) {
                    writer.write(heuristic.toString() + "\n");
                }
            }
            writer.write(grammaInfo.length + "\n");
            for (String s : grammaInfo) {
                writer.write(s + "\n");
            }
        } finally {
            // Close even when a write fails, so the file handle is not leaked.
            writer.close();
        }
    }
    /**
     * Replaces the tables with the content of {@code fileName}.
     *
     * @throws IOException if the file cannot be opened or read
     */
    public void readFromFile(String fileName) throws IOException {
        FileInputStream inputStream = new FileInputStream(fileName);
        try {
            readFromInputStream(inputStream);
        } finally {
            // Closing an already closed stream is harmless; this covers failures before the reader exists.
            inputStream.close();
        }
    }
    /** Reads all four sections from the UTF-8 stream and closes it, also on failure. */
    private void readFromInputStream(InputStream inputStream) throws IOException {
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
        try {
            int amount = Integer.parseInt(bufferedReader.readLine());
            readSeparators(bufferedReader, amount);
            // There is exactly one rule id per separator, hence the shared count.
            readRulesId(bufferedReader, amount);
            readRules(bufferedReader);
            readGrammaInfo(bufferedReader);
        } finally {
            bufferedReader.close();
        }
    }
    private void readGrammaInfo(BufferedReader bufferedReader) throws IOException {
        int amount = Integer.parseInt(bufferedReader.readLine());
        grammaInfo = new String[amount];
        for (int i = 0; i < amount; i++) {
            grammaInfo[i] = bufferedReader.readLine();
        }
    }
    /**
     * Reads the rule section: a count, then for each rule its length followed
     * by one heuristic per line.
     */
    protected void readRules(BufferedReader bufferedReader) throws IOException {
        int amount = Integer.parseInt(bufferedReader.readLine());
        rules = new Heuristic[amount][];
        for (int i = 0; i < amount; i++) {
            int ruleLenght = Integer.parseInt(bufferedReader.readLine());
            rules[i] = new Heuristic[ruleLenght];
            for (int j = 0; j < ruleLenght; j++) {
                rules[i][j] = new Heuristic(bufferedReader.readLine());
            }
        }
    }
    private void readRulesId(BufferedReader bufferedReader, int amount) throws IOException {
        rulesId = new short[amount];
        for (int i = 0; i < amount; i++) {
            rulesId[i] = Short.parseShort(bufferedReader.readLine());
        }
    }
    /**
     * Reads {@code amount} separators, each as a length followed by its ints.
     * The set of first letters the original collected here was never read and
     * made zero-length separators fail, so it is gone.
     */
    private void readSeparators(BufferedReader bufferedReader, int amount) throws IOException {
        separators = new int[amount][];
        for (int i = 0; i < amount; i++) {
            int wordLenght = Integer.parseInt(bufferedReader.readLine());
            separators[i] = new int[wordLenght];
            for (int j = 0; j < wordLenght; j++) {
                separators[i][j] = Integer.parseInt(bufferedReader.readLine());
            }
        }
    }
    /**
     * Returns {@code s} with its chars in reverse order. The chars are reversed
     * one by one; StringBuilder.reverse() is avoided on purpose because it
     * treats surrogate pairs differently.
     */
    protected String revertWord(String s) {
        StringBuilder reversed = new StringBuilder(s.length());
        for (int i = s.length() - 1; i >= 0; i--) {
            reversed.append(s.charAt(i));
        }
        return reversed.toString();
    }
 }
@@ -1,96 +0,0 @@
 /**
 * Copyright 2009 Alexander Kuznetsov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.morphology;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.BufferedReader;
 import java.io.InputStreamReader;
 import java.util.*;
 /**
  * Morphology that can additionally strip a known prefix before the lookup.
  * <p>
  * Prefix rules are indexed by {@link PrefixRule#getHashString()} and looked
  * up by the first and the last character of the analysed word. Without loaded
  * prefix rules the class behaves exactly like its superclass.
  */
 public class MorphologyWithPrefix extends Morphology {
    private Map<String, PrefixRule> prefixRuleMap = new HashMap<String, PrefixRule>();
    public MorphologyWithPrefix(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException {
        super(fileName, decoderEncoder);
    }
    public MorphologyWithPrefix(InputStream morphFormInputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
        super(morphFormInputStream, decoderEncoder);
    }
    /**
     * Loads the morphology and then the prefix rules; the prefix stream is
     * closed afterwards.
     *
     * @throws IOException if either stream cannot be read
     */
    public MorphologyWithPrefix(InputStream morphFormInputStream, InputStream prefixesInputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
        super(morphFormInputStream, decoderEncoder);
        readPrefixes(prefixesInputStream);
    }
    /**
     * Reads the rule count followed by that many rules from the UTF-8 stream
     * and closes the stream, also when parsing fails.
     */
    private void readPrefixes(InputStream inputStream) throws IOException {
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
        try {
            int prefixAmount = Integer.parseInt(bufferedReader.readLine());
            for (int i = 0; i < prefixAmount; i++) {
                PrefixRule prefixRule = readPrefix(bufferedReader);
                // A later rule with the same hash string replaces an earlier one.
                prefixRuleMap.put(prefixRule.getHashString(), prefixRule);
            }
        } finally {
            bufferedReader.close();
        }
    }
    /** Reads one rule: prefix line, last-letter line, form count, then one form per line. */
    private PrefixRule readPrefix(BufferedReader bufferedReader) throws IOException {
        PrefixRule prefixRule = new PrefixRule();
        String s = bufferedReader.readLine();
        prefixRule.setPrefix(s);
        s = bufferedReader.readLine();
        prefixRule.setLastLetter(s.charAt(0));
        HashSet<Short> morph = new HashSet<Short>();
        int formAmount = Integer.valueOf(bufferedReader.readLine());
        for (int i = 0; i < formAmount; i++) {
            morph.add(Short.valueOf(bufferedReader.readLine()));
        }
        prefixRule.setForms(morph);
        return prefixRule;
    }
    public MorphologyWithPrefix(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) {
        super(separators, rulesId, rules, grammaInfo);
    }
    /**
     * Analyses {@code s} with its prefix stripped when a prefix rule applies.
     * Falls back to the superclass result when no prefix rules are loaded, the
     * word has fewer than four characters, no rule matches its first and last
     * character, the word does not start with the rule's prefix, or no
     * heuristic of the stripped word is listed in the rule's forms.
     */
    @Override
    public List<String> getMorhInfo(String s) {
        if (prefixRuleMap.size() == 0 || s.length() < 4) {
            return super.getMorhInfo(s);
        }
        String ruleIndex = "" + s.charAt(0) + s.charAt(s.length() - 1);
        PrefixRule prefixRule = prefixRuleMap.get(ruleIndex);
        if (prefixRule == null) {
            return super.getMorhInfo(s);
        }
        if (!s.startsWith(prefixRule.getPrefix())) {
            return super.getMorhInfo(s);
        }
        String sWithoutPrefix = s.substring(prefixRule.getPrefix().length());
        int[] ints = decoderEncoder.encodeToArray(revertWord(sWithoutPrefix));
        int ruleId = findRuleId(ints);
        ArrayList<String> result = new ArrayList<String>();
        for (Heuristic h : rules[rulesId[ruleId]]) {
            // NOTE(review): forms holds Short values; if getFormMorphInfo() returns an int,
            // it is boxed to Integer and this lookup can never match - confirm its return type.
            if (prefixRule.getForms().contains(h.getFormMorphInfo())) {
                result.add(createForm(h.transofrmWord(sWithoutPrefix), "pr"));
            }
        }
        return result.size() > 0 ? result : super.getMorhInfo(s);
    }
 }
@@ -1,76 +0,0 @@
 /**
 * Copyright 2009 Alexander Kuznetsov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.morphology;
 import java.io.Serializable;
 import java.util.HashSet;
 public class PrefixRule implements Serializable {
    private Character lastLetter;
    private String prefix;
    private HashSet<Short> forms;
    public Character getLastLetter() {
        return lastLetter;
    }
    public void setLastLetter(Character lastLetter) {
        this.lastLetter = lastLetter;
    }
    public String getPrefix() {
        return prefix;
    }
    public void setPrefix(String prefix) {
        this.prefix = prefix;
    }
    public HashSet<Short> getForms() {
        return forms;
    }
    public void setForms(HashSet<Short> forms) {
        this.forms = forms;
    }
    public String getHashString() {
        return "" + prefix.charAt(0) + lastLetter;
    }
    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        PrefixRule that = (PrefixRule) o;
        if (forms != null ? !forms.equals(that.forms) : that.forms != null) return false;
        if (lastLetter != null ? !lastLetter.equals(that.lastLetter) : that.lastLetter != null) return false;
        if (prefix != null ? !prefix.equals(that.prefix) : that.prefix != null) return false;
        return true;
    }
    @Override
    public int hashCode() {
        int result = lastLetter != null ? lastLetter.hashCode() : 0;
        result = 31 * result + (prefix != null ? prefix.hashCode() : 0);
        result = 31 * result + (forms != null ? forms.hashCode() : 0);
        return result;
    }
 }
@@ -22,6 +22,6 @@ import java.io.IOException;
 public class RussianLuceneMorphology extends LuceneMorphology {
    public RussianLuceneMorphology() throws IOException {
-        super(RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"),RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/prefixes.info"), new RussianLetterDecoderEncoder());
+        super(RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder());
    }
 }
@@ -15,11 +15,11 @@
 */
 package org.apache.lucene.morphology.russian;
-import org.apache.lucene.morphology.Morphology;
+import org.apache.lucene.morphology.MorphologyImpl;
 import java.io.IOException;
-public class RussianMorphology extends Morphology {
+public class RussianMorphology extends MorphologyImpl {
    public RussianMorphology() throws IOException {
        super(RussianMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder());
@@ -1,60 +0,0 @@
 /**
 * Copyright 2009 Alexander Kuznetsov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.morphology.russian;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
 import java.io.IOException;
 import java.io.FileInputStream;
 import java.io.InputStreamReader;
 import java.util.HashSet;
 /**
 * Created by IntelliJ IDEA.
 * User: akuznetsov
 * Date: 31.10.2009
 * Time: 14:01:11
 * To change this template use File | Settings | File Templates.
 */
 /**
  * Manual timing harness: runs the analyzer over a book file twice and prints
  * how long the second pass took in milliseconds.
  */
 public class TestSpeed {
    /** Location of the book used as input for both passes. */
    private static final String BOOK_PATH = "C:/tmp/_Aleksandr_Suhov_Tanets_na_raskalennyih_uglyah1.fb2";

    /**
     * Processes the book once without timing, then once more with timing, and
     * prints the elapsed milliseconds of the second pass.
     *
     * @param args ignored
     * @throws IOException if the book cannot be read or tokenized
     */
    public static void main(String[] args) throws IOException {
        RussianAnalayzer russianAnalayzer = new RussianAnalayzer();
        // The first pass is deliberately left out of the measurement
        // (presumably a warm-up run).
        bookProccess(russianAnalayzer, BOOK_PATH);
        long start = System.currentTimeMillis();
        bookProccess(russianAnalayzer, BOOK_PATH);
        System.out.println("Done in " + (System.currentTimeMillis() - start));
    }

    /**
     * Pulls every token of the given book through the analyzer and discards it.
     * The file is always closed, even when tokenizing fails.
     *
     * @param russianAnalayzer analyzer that produces the token stream
     * @param bookName         path of the UTF-8 encoded book file
     * @throws IOException if the file cannot be opened, read or closed
     */
    private static void bookProccess(RussianAnalayzer russianAnalayzer, String bookName) throws IOException {
        FileInputStream inputStream = new FileInputStream(bookName);
        try {
            TokenStream tokenStream = russianAnalayzer.tokenStream(null, new InputStreamReader(inputStream, "UTF-8"));
            final Token reusableToken = new Token();
            while (tokenStream.next(reusableToken) != null) {
                // Tokens are only drained; their content is irrelevant for timing.
            }
        } finally {
            inputStream.close();
        }
    }
 }
@@ -1,96 +0,0 @@
 11
 наи
 е
 8
 258
 255
 289
 252
 292
 262
 296
 286
 наи
 и
 2
 263
 297
 наи
 ю
 4
 250
 249
 283
 284
 по
 й
 5
 250
 251
 248
 247
 269
 по
 е
 3
 255
 252
 269
 наи
 й
 12
 239
 273
 250
 251
 248
 277
 247
 282
 281
 243
 285
 284
 наи
 о
 6
 274
 253
 276
 287
 242
 240
 наи
 м
 10
 256
 290
 257
 291
 279
 278
 294
 260
 244
 245
 наи
 х
 6
 259
 293
 261
 295
 264
 298
 наи
 я
 2
 246
 280
 наи
 у
 4
 275
 254
 288
 241
@@ -33,7 +33,7 @@ public class RussianLuceneMorphTest {
    @Before
    public void setUp() throws IOException {
-        luceneMorph = new RussianLuceneMorphology();
+        luceneMorph = new LuceneMorphology(this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder());
    }
    @Test
@@ -1,4 +1,3 @@
 наилучший хороший
 еду еда ехать
 тестов тест
 вина вино вина
@@ -18,7 +17,3 @@
 лучший хороший
 на на
 тест тест тесто
 спам спам
 спама спам
 наигранный наигранный
 наивный наивный
-наи
-е
-наи
-и
-наи
-ю
-по
-й
-по
-е
-наи
-й
-наи
-о
-наи
-м
-наи
-х
-наи
-я
-наи
-у