diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictonaryReader.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictonaryReader.java index ff72d7c..6503fdf 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictonaryReader.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictonaryReader.java @@ -31,9 +31,9 @@ import java.util.*; public class DictonaryReader { private String fileName; private String fileEncoding = "windows-1251"; - private List> wordsFlexias = new ArrayList>(); - private List> wordPrefixes = new ArrayList>(); - private Set ingnoredForm = new HashSet(); + protected List> wordsFlexias = new ArrayList>(); + protected List> wordPrefixes = new ArrayList>(); + protected Set ingnoredForm = new HashSet(); public DictonaryReader(String fileName, Set ingnoredForm) { this.fileName = fileName; @@ -57,7 +57,7 @@ public class DictonaryReader { } - private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException { + protected void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException { String s = reader.readLine(); int count = Integer.valueOf(s); for (int i = 0; i < count; i++) { @@ -81,7 +81,7 @@ public class DictonaryReader { } - private void sckipBlock(BufferedReader reader) throws IOException { + protected void sckipBlock(BufferedReader reader) throws IOException { String s = reader.readLine(); int count = Integer.valueOf(s); for (int i = 0; i < count; i++) { @@ -90,7 +90,7 @@ public class DictonaryReader { } - private void readPrefix(BufferedReader reader) throws IOException { + protected void readPrefix(BufferedReader reader) throws IOException { String s = reader.readLine(); int count = Integer.valueOf(s); for (int i = 0; i < count; i++) { @@ -99,7 +99,7 @@ public class DictonaryReader { } } - private void readFlexias(BufferedReader reader) throws IOException { + protected void readFlexias(BufferedReader reader) throws IOException { String s = reader.readLine(); int count = Integer.valueOf(s); for (int i = 0; i < count; i++) { @@ -112,7 +112,7 @@ public class DictonaryReader { } } - private void addFlexia(ArrayList flexiaModelArrayList, String line) { + protected void addFlexia(ArrayList flexiaModelArrayList, String line) { String[] fl = line.split("\\*"); // we inored all forms thats if (fl.length == 3) { diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/FlexiaModel.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/FlexiaModel.java index 9b51950..a210889 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/FlexiaModel.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/FlexiaModel.java @@ -1,5 +1,5 @@ /** - * Copyright 2009 Alexander Kuznetsov + * Copyright 2009 Alexander Kuznetsov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -60,6 +60,28 @@ public class FlexiaModel { @Override public String toString() { - return prefix + " " + suffix; + return prefix + " " + suffix + " " + code; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + FlexiaModel that = (FlexiaModel) o; + + if (code != null ? !code.equals(that.code) : that.code != null) return false; + if (prefix != null ? !prefix.equals(that.prefix) : that.prefix != null) return false; + if (suffix != null ? !suffix.equals(that.suffix) : that.suffix != null) return false; + + return true; + } + + @Override + public int hashCode() { + int result = code != null ? code.hashCode() : 0; + result = 31 * result + (suffix != null ? suffix.hashCode() : 0); + result = 31 * result + (prefix != null ? prefix.hashCode() : 0); + return result; } } diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/PrefixesHypotises.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/PrefixesHypotises.java new file mode 100644 index 0000000..c0d8d42 --- /dev/null +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/PrefixesHypotises.java @@ -0,0 +1,82 @@ +package org.apache.lucene.morphology.dictionary; + +import java.util.*; +import java.io.IOException; +import java.io.BufferedReader; + + +public class PrefixesHypotises extends DictonaryReader { + private Map> rules = new HashMap>(); + + public PrefixesHypotises(String fileName, Set ingnoredForm) { + super(fileName, ingnoredForm); + } + + public PrefixesHypotises(String fileName, String fileEncoding, Set ingnoredForm) { + super(fileName, fileEncoding, ingnoredForm); + } + + @Override + public void proccess(WordProccessor wordProccessor) throws IOException { + super.proccess(wordProccessor); + System.out.println(rules.size()); + System.out.println(rules); + } + + @Override + protected void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException { + sckipBlock(reader); + } + + + + @Override + protected void readPrefix(BufferedReader reader) throws IOException { + sckipBlock(reader); + } + + @Override + protected void readFlexias(BufferedReader reader) throws IOException { + super.readFlexias(reader); + //todo research flesias + for(List fmList:wordsFlexias){ + research(fmList); + } + } + + private void research(List models) { + for(FlexiaModel fm:models){ + if(fm.getPrefix().length() > 0){ + testFlexia(models, fm); + } + } + } + + private void testFlexia(List models, FlexiaModel fm) { + for(FlexiaModel com:models){ + if(com.getSuffix().equals(fm.getSuffix()) && com.getPrefix().length() == 0){ + Set models1 = rules.get(convert(fm)); + if(models1 == null){ + models1 = new HashSet(); + rules.put(convert(fm),models1); + } + models1.add(convert(com)); + } + } + } + + private FlexiaModel convert(FlexiaModel fm){ + String suf = fm.getSuffix(); + if(suf.length() == 1) System.out.println(fm); + return new FlexiaModel(fm.getCode(),""+ suf.charAt(suf.length()-1)+ (suf.length() > 1 ? suf.charAt(suf.length()-2) : ""),fm.getPrefix()); + } + + protected void addFlexia(ArrayList flexiaModelArrayList, String line) { + String[] fl = line.split("\\*"); + if (fl.length == 3) { + flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase())); + } + if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), "")); + } + +} diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianPrefixesBuilder.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianPrefixesBuilder.java new file mode 100644 index 0000000..dd5fc48 --- /dev/null +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianPrefixesBuilder.java @@ -0,0 +1,41 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.morphology.generator; + +import org.apache.lucene.morphology.dictionary.*; +import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder; + +import java.io.IOException; +import java.util.HashSet; + + +public class RussianPrefixesBuilder { + public static void main(String[] args) throws IOException { + GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab"); + PrefixesHypotises dictonaryReader = new PrefixesHypotises("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet()); + + //RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder(); + //StatiticsCollector statiticsCollector = new StatiticsCollector(grammaInfo, decoderEncoder); + dictonaryReader.proccess(new WordProccessor(){ + public void proccess(WordCard wordCard) throws IOException { + //To change body of implemented methods use File | Settings | File Templates. + } + }); + //statiticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info"); + + } +} \ No newline at end of file diff --git a/russian/src/test/resources/org/apache/lucene/morphology/russian/russian-morphology-test.txt b/russian/src/test/resources/org/apache/lucene/morphology/russian/russian-morphology-test.txt index c775e7d..fdf34da 100644 --- a/russian/src/test/resources/org/apache/lucene/morphology/russian/russian-morphology-test.txt +++ b/russian/src/test/resources/org/apache/lucene/morphology/russian/russian-morphology-test.txt @@ -16,4 +16,6 @@ тосклив тоскливый лучший хороший на на -тест тест тесто \ No newline at end of file +тест тест тесто +спам спам +спама спам \ No newline at end of file