diff --git a/1.txt b/1.txt deleted file mode 100644 index e69de29..0000000 diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictonaryReader.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictonaryReader.java index 6503fdf..ff72d7c 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictonaryReader.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictonaryReader.java @@ -31,9 +31,9 @@ import java.util.*; public class DictonaryReader { private String fileName; private String fileEncoding = "windows-1251"; - protected List> wordsFlexias = new ArrayList>(); - protected List> wordPrefixes = new ArrayList>(); - protected Set ingnoredForm = new HashSet(); + private List> wordsFlexias = new ArrayList>(); + private List> wordPrefixes = new ArrayList>(); + private Set ingnoredForm = new HashSet(); public DictonaryReader(String fileName, Set ingnoredForm) { this.fileName = fileName; @@ -57,7 +57,7 @@ public class DictonaryReader { } - protected void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException { + private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException { String s = reader.readLine(); int count = Integer.valueOf(s); for (int i = 0; i < count; i++) { @@ -81,7 +81,7 @@ public class DictonaryReader { } - protected void sckipBlock(BufferedReader reader) throws IOException { + private void sckipBlock(BufferedReader reader) throws IOException { String s = reader.readLine(); int count = Integer.valueOf(s); for (int i = 0; i < count; i++) { @@ -90,7 +90,7 @@ public class DictonaryReader { } - protected void readPrefix(BufferedReader reader) throws IOException { + private void readPrefix(BufferedReader reader) throws IOException { String s = reader.readLine(); int count = Integer.valueOf(s); for (int i = 0; i < count; i++) { @@ -99,7 +99,7 @@ public class DictonaryReader { } } - protected void readFlexias(BufferedReader reader) throws IOException { + private void readFlexias(BufferedReader reader) throws IOException { String s = reader.readLine(); int count = Integer.valueOf(s); for (int i = 0; i < count; i++) { @@ -112,7 +112,7 @@ public class DictonaryReader { } } - protected void addFlexia(ArrayList flexiaModelArrayList, String line) { + private void addFlexia(ArrayList flexiaModelArrayList, String line) { String[] fl = line.split("\\*"); // we inored all forms thats if (fl.length == 3) { diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/FlexiaModel.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/FlexiaModel.java index a210889..9b51950 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/FlexiaModel.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/FlexiaModel.java @@ -1,5 +1,5 @@ /** - * Copyright 2009 Alexander Kuznetsov + * Copyright 2009 Alexander Kuznetsov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -60,28 +60,6 @@ public class FlexiaModel { @Override public String toString() { - return prefix + " " + suffix + " " + code; - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - FlexiaModel that = (FlexiaModel) o; - - if (code != null ? !code.equals(that.code) : that.code != null) return false; - if (prefix != null ? !prefix.equals(that.prefix) : that.prefix != null) return false; - if (suffix != null ? !suffix.equals(that.suffix) : that.suffix != null) return false; - - return true; - } - - @Override - public int hashCode() { - int result = code != null ? code.hashCode() : 0; - result = 31 * result + (suffix != null ? suffix.hashCode() : 0); - result = 31 * result + (prefix != null ? prefix.hashCode() : 0); - return result; + return prefix + " " + suffix; } } diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/PrefixesRulesBuilder.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/PrefixesRulesBuilder.java deleted file mode 100644 index 6c360e3..0000000 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/PrefixesRulesBuilder.java +++ /dev/null @@ -1,139 +0,0 @@ -/** - * Copyright 2009 Alexander Kuznetsov - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.morphology.dictionary; - -import org.apache.lucene.morphology.PrefixRule; - -import java.util.*; -import java.io.*; - - -public class PrefixesRulesBuilder extends DictonaryReader { - private GrammaReader grammaInfo; - - private Map> rules = new HashMap>(); - - public PrefixesRulesBuilder(String fileName, String fileEncoding, Set ingnoredForm) throws IOException { - super(fileName, fileEncoding, ingnoredForm); - grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab"); - } - - @Override - public void proccess(WordProccessor wordProccessor) throws IOException { - super.proccess(wordProccessor); - System.out.println(rules.size()); - System.out.println(rules); - } - - public List getPrefixRules(){ - List prefixRules = new ArrayList(); - for(FlexiaModel key:rules.keySet()){ - PrefixRule prefixRule = new PrefixRule(); - prefixRule.setPrefix(key.getPrefix()); - prefixRule.setLastLetter(key.getSuffix().charAt(0)); - HashSet map = new HashSet(); - for(FlexiaModel fm:rules.get(key)){ - int gi = grammaInfo.getGrammInversIndex().get(fm.getCode()); - map.add((short) gi); - } - prefixRule.setForms(map); - prefixRules.add(prefixRule); - } - return prefixRules; - } - - @Override - protected void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException { - sckipBlock(reader); - } - - - - @Override - protected void readPrefix(BufferedReader reader) throws IOException { - sckipBlock(reader); - } - - @Override - protected void readFlexias(BufferedReader reader) throws IOException { - super.readFlexias(reader); - //todo research flesias - for(List fmList:wordsFlexias){ - research(fmList); - } - } - - private void research(List models) { - for(FlexiaModel fm:models){ - if(fm.getPrefix().length() > 0){ - testFlexia(models, fm); - } - } - } - - private void testFlexia(List models, FlexiaModel fm) { - for(FlexiaModel com:models){ - if(com.getSuffix().equals(fm.getSuffix()) && com.getPrefix().length() == 0){ - Set models1 = rules.get(convertForKey(fm)); - if(models1 == null){ - models1 = new HashSet(); - rules.put(convertForKey(fm),models1); - } - models1.add(convert(com)); - } - } - } - - private FlexiaModel convert(FlexiaModel fm){ - String suf = fm.getSuffix(); - //if(suf.length() == 1) System.out.println(fm); - return new FlexiaModel(fm.getCode(),""+ suf.charAt(suf.length()-1),fm.getPrefix()); - } - - private FlexiaModel convertForKey(FlexiaModel fm){ - String suf = fm.getSuffix(); - //if(suf.length() == 1) System.out.println(fm); - return new FlexiaModel("pr",""+ suf.charAt(suf.length()-1),fm.getPrefix()); - } - - protected void addFlexia(ArrayList flexiaModelArrayList, String line) { - String[] fl = line.split("\\*"); - if (fl.length == 3) { - flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase())); - } - if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), "")); - } - - public void savePrefixes(String fileName) throws IOException { - OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8"); - List prefixRuleList = getPrefixRules(); - writer.write(prefixRuleList.size()+"\n"); - for(PrefixRule pr: prefixRuleList){ - writePrefixRule(writer, pr); - } - writer.close(); - } - - private void writePrefixRule(OutputStreamWriter writer, PrefixRule pr) throws IOException { - writer.write(pr.getPrefix()+"\n"); - writer.write(pr.getLastLetter()+"\n"); - HashSet formInfo = pr.getForms(); - writer.write(formInfo.size()+"\n"); - for(Short s:formInfo){ - writer.write(s+"\n"); - } - } -} diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/StatiticsCollector.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/StatiticsCollector.java index 2e7952d..32525cd 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/StatiticsCollector.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/StatiticsCollector.java @@ -19,7 +19,7 @@ package org.apache.lucene.morphology.dictionary; import org.apache.lucene.morphology.Heuristic; import org.apache.lucene.morphology.LetterDecoderEncoder; -import org.apache.lucene.morphology.Morphology; +import org.apache.lucene.morphology.MorphologyImpl; import java.io.IOException; import java.util.*; @@ -119,7 +119,7 @@ public class StatiticsCollector implements WordProccessor { prevSet = currentSet; } } - Morphology morphology = new Morphology(ints, rulesId, heuristics, grammaReader.getGrammaInfoAsArray()); + MorphologyImpl morphology = new MorphologyImpl(ints, rulesId, heuristics, grammaReader.getGrammaInfoAsArray()); morphology.writeToFile(fileName); } diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianPrefixesBuilder.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianPrefixesBuilder.java deleted file mode 100644 index 8c099ad..0000000 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianPrefixesBuilder.java +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2009 Alexander Kuznetsov - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.morphology.generator; - -import org.apache.lucene.morphology.dictionary.*; - -import java.io.IOException; -import java.util.HashSet; - - -public class RussianPrefixesBuilder { - public static void main(String[] args) throws IOException { - - PrefixesRulesBuilder dictonaryReader = new PrefixesRulesBuilder("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", "windows-1251",new HashSet()); - - - dictonaryReader.proccess(new WordProccessor() { - public void proccess(WordCard wordCard) throws IOException { - - } - }); - - dictonaryReader.savePrefixes("russian/src/main/resources/org/apache/lucene/morphology/russian/prefixes.info"); - } -} \ No newline at end of file diff --git a/dictionary-reader/src/test/java/org/apache/lucene/text.txt b/dictionary-reader/src/test/java/org/apache/lucene/text.txt deleted file mode 100644 index b06e800..0000000 --- a/dictionary-reader/src/test/java/org/apache/lucene/text.txt +++ /dev/null @@ -1,372 +0,0 @@ -[ ть - у - ем - ешь - ете - ет - ут - - ла - ло - ли - я - ши - ем - емте - по ай - ь - по айте - ьте - ущий - ущего - ущему - ущего - ущий - ущим - ущем - ущая - ущей - ущей - ущую - ущей - ущею - ущей - ущее - ущего - ущему - ущее - ущим - ущем - ущие - ущих - ущим - ущих - ущие - ущими - ущих - ший - шего - шему - шего - ший - шим - шем - шая - шей - шей - шую - шей - шею - шей - шее - шего - шему - шее - шим - шем - шие - ших - шим - ших - шие - шими - ших] -[ большой - большого - большому - большого - большой - большим - большом - большая - большой - большой - большую - большой - большою - большой - большое - большого - большому - большое - большим - большом - большие - больших - большим - больших - большие - большими - больших - велик - велика - велико - велики - больше - по больше - наи больший - наи большего - наи большему - наи большего - наи больший - наи большим - наи большем - наи большая - наи большей - наи большей - наи большую - наи большей - наи большею - наи большей - наи большее - наи большего - наи большему - наи большее - наи большим - наи большем - наи большие - наи больших - наи большим - наи больших - наи большие - наи большими - наи больших] -[ вероятный - вероятного - вероятному - вероятного - вероятный - вероятным - вероятном - вероятная - вероятной - вероятной - вероятную - вероятной - вероятною - вероятной - вероятное - вероятного - вероятному - вероятное - вероятным - вероятном - вероятные - вероятных - вероятным - вероятных - вероятные - вероятными - вероятных - вероятен - вероятна - вероятно - вероятны - вероятнее - вероятней - по вероятнее - по вероятней - вероятнейший - наи невероятнейший - вероятнейшего - наи невероятнейшего - вероятнейшему - наи невероятнейшему - вероятнейшего - наи невероятнейшего - вероятнейший - наи невероятнейший - вероятнейшим - наи невероятнейшим - вероятнейшем - наи невероятнейшем - вероятнейшая - наи невероятнейшая - вероятнейшей - наи невероятнейшей - вероятнейшей - наи невероятнейшей - вероятнейшую - наи невероятнейшую - вероятнейшей - вероятнейшею - наи невероятнейшей - наи невероятнейшею - вероятнейшей - наи невероятнейшей - вероятнейшее - наи невероятнейшее - вероятнейшего - наи невероятнейшего - вероятнейшему - наи невероятнейшему - вероятнейшее - наи невероятнейшее - вероятнейшим - наи невероятнейшим - вероятнейшем - наи невероятнейшем - вероятнейшие - наи невероятнейшие - вероятнейших - наи невероятнейших - вероятнейшим - наи невероятнейшим - вероятнейших - наи невероятнейших - вероятнейшие - наи невероятнейшие - вероятнейшими - наи невероятнейшими - вероятнейших - наи невероятнейших] -[ аленький - аленького - аленькому - аленького - аленький - аленьким - аленьком - аленькая - аленькой - аленькой - аленькую - аленькой - аленькою - аленькой - аленькое - аленького - аленькому - аленькое - аленьким - аленьком - аленькие - аленьких - аленьким - аленьких - аленькие - аленькими - аленьких - ал - ала - ало - алы - еньше - по еньше - алейший - наи еньший - алейшего - наи еньшего - алейшему - наи еньшему - алейшего - наи еньшего - алейший - наи еньший - алейшим - наи еньшим - алейшем - наи еньшем - алейшая - наи еньшая - алейшей - наи еньшей - алейшей - наи еньшей - алейшую - наи еньшую - алейшей - алейшею - наи еньшей - наи еньшею - алейшей - наи еньшей - алейшее - наи еньшее - алейшего - наи еньшего - алейшему - наи еньшему - алейшее - наи еньшее - алейшим - наи еньшим - алейшем - наи еньшем - алейшие - наи еньшие - алейших - наи еньших - алейшим - наи еньшим - алейших - наи еньших - алейшие - наи еньшие - алейшими - наи еньшими - алейших - наи еньших] -[ ьный - ьного - ьному - ьного - ьный - ьным - ьном - ьная - ьной - ьной - ьную - ьной - ьною - ьной - ьное - ьного - ьному - ьное - ьным - ьном - ьные - ьных - ьным - ьных - ьные - ьными - ьных - ен - ьна - ьно - ьны - ьны - ьнее - ьней - по ьнее - по ьней - наи ьнейший - наи ьнейшего - наи ьнейшему - наи ьнейшего - наи ьнейший - наи ьнейшим - наи ьнейшем - наи ьнейшая - наи ьнейшей - наи ьнейшей - наи ьнейшую - наи ьнейшей - наи ьнейшею - наи ьнейшей - наи ьнейшее - наи ьнейшего - наи ьнейшему - наи ьнейшее - наи ьнейшим - наи ьнейшем - наи ьнейшие - наи ьнейших - наи ьнейшим - наи ьнейших - наи ьнейшие - наи ьнейшими - наи ьнейших] \ No newline at end of file diff --git a/english/src/main/java/org/apache/lucene/morphology/english/EnglishMorphology.java b/english/src/main/java/org/apache/lucene/morphology/english/EnglishMorphology.java index 6815667..dac5f57 100644 --- a/english/src/main/java/org/apache/lucene/morphology/english/EnglishMorphology.java +++ b/english/src/main/java/org/apache/lucene/morphology/english/EnglishMorphology.java @@ -15,12 +15,12 @@ */ package org.apache.lucene.morphology.english; -import org.apache.lucene.morphology.Morphology; +import org.apache.lucene.morphology.MorphologyImpl; import java.io.IOException; -public class EnglishMorphology extends Morphology { +public class EnglishMorphology extends MorphologyImpl { public EnglishMorphology() throws IOException { super(EnglishLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder()); diff --git a/morph/src/main/java/org/apache/lucene/morphology/LuceneMorphology.java b/morph/src/main/java/org/apache/lucene/morphology/LuceneMorphology.java index a4cb1d6..128ae55 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/LuceneMorphology.java +++ b/morph/src/main/java/org/apache/lucene/morphology/LuceneMorphology.java @@ -23,7 +23,7 @@ import java.util.ArrayList; import java.util.List; -public class LuceneMorphology extends MorphologyWithPrefix { +public class LuceneMorphology extends MorphologyImpl { public LuceneMorphology(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException { super(fileName, decoderEncoder); @@ -33,13 +33,15 @@ public class LuceneMorphology extends MorphologyWithPrefix { super(inputStream, decoderEncoder); } - public LuceneMorphology(InputStream morphFormInputStream, InputStream prefixesInputStream, LetterDecoderEncoder decoderEncoder) throws IOException { - super(morphFormInputStream, prefixesInputStream, decoderEncoder); - } - @Override - protected String createForm(String form, String grammaInfo) { - return form; + public List getMorhInfo(String s) { + ArrayList result = new ArrayList(); + int[] ints = decoderEncoder.encodeToArray(revertWord(s)); + int ruleId = findRuleId(ints); + for (Heuristic h : rules[rulesId[ruleId]]) { + result.add(h.transofrmWord(s)); + } + return result; } protected void readRules(BufferedReader bufferedReader) throws IOException { diff --git a/morph/src/main/java/org/apache/lucene/morphology/Morphology.java b/morph/src/main/java/org/apache/lucene/morphology/Morphology.java index bf8de9d..2d78265 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/Morphology.java +++ b/morph/src/main/java/org/apache/lucene/morphology/Morphology.java @@ -1,214 +1,25 @@ -/** - * Copyright 2009 Alexander Kuznetsov - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.morphology; - - -import java.io.*; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; - - -public class Morphology { - protected int[][] separators; - protected short[] rulesId; - protected Heuristic[][] rules; - protected String[] grammaInfo; - protected LetterDecoderEncoder decoderEncoder; - - - public Morphology(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException { - readFromFile(fileName); - this.decoderEncoder = decoderEncoder; - } - - public Morphology(InputStream inputStream, LetterDecoderEncoder decoderEncoder) throws IOException { - readFromInputStream(inputStream); - this.decoderEncoder = decoderEncoder; - } - - public Morphology(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) { - this.separators = separators; - this.rulesId = rulesId; - this.rules = rules; - this.grammaInfo = grammaInfo; - } - - public int[][] getSeparators() { - return separators; - } - - public short[] getRulesId() { - return rulesId; - } - - public Heuristic[][] getRules() { - return rules; - } - - public String[] getGrammaInfo() { - return grammaInfo; - } - - public List getMorhInfo(String s) { - ArrayList result = new ArrayList(); - int[] ints = decoderEncoder.encodeToArray(revertWord(s)); - int ruleId = findRuleId(ints); - for (Heuristic h : rules[rulesId[ruleId]]) { - result.add(createForm(h.transofrmWord(s),grammaInfo[h.getFormMorphInfo()])); - } - return result; - } - - protected String createForm(String form,String grammaInfo){ - return form+"|"+grammaInfo; - } - - protected int findRuleId(int[] ints) { - int low = 0; - int high = separators.length - 1; - int mid = 0; - while (low <= high) { - mid = (low + high) >>> 1; - int[] midVal = separators[mid]; - - int comResult = compareToInts(ints, midVal); - if (comResult > 0) - low = mid + 1; - else if (comResult < 0) - high = mid - 1; - else - break; - } - if (compareToInts(ints, separators[mid]) >= 0) { - return mid; - } else { - return mid - 1; - } - - } - - private int compareToInts(int[] i1, int[] i2) { - int minLength = Math.min(i1.length, i2.length); - for (int i = 0; i < minLength; i++) { - int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1); - if (i3 != 0) return i3; - } - return i1.length - i2.length; - } - - public void writeToFile(String fileName) throws IOException { - OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8"); - writer.write(separators.length + "\n"); - for (int[] i : separators) { - writer.write(i.length + "\n"); - for (int j : i) { - writer.write(j + "\n"); - } - } - for (short i : rulesId) { - writer.write(i + "\n"); - } - writer.write(rules.length + "\n"); - for (Heuristic[] heuristics : rules) { - writer.write(heuristics.length + "\n"); - for (Heuristic heuristic : heuristics) { - writer.write(heuristic.toString() + "\n"); - } - } - writer.write(grammaInfo.length + "\n"); - for (String s : grammaInfo) { - writer.write(s + "\n"); - } - writer.close(); - } - - public void readFromFile(String fileName) throws IOException { - FileInputStream inputStream = new FileInputStream(fileName); - readFromInputStream(inputStream); - } - - private void readFromInputStream(InputStream inputStream) throws IOException { - BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8")); - String s = bufferedReader.readLine(); - Integer amount = Integer.valueOf(s); - - readSeparators(bufferedReader, amount); - - readRulesId(bufferedReader, amount); - - readRules(bufferedReader); - readGrammaInfo(bufferedReader); - bufferedReader.close(); - } - - private void readGrammaInfo(BufferedReader bufferedReader) throws IOException { - String s; - Integer amount; - s = bufferedReader.readLine(); - amount = Integer.valueOf(s); - grammaInfo = new String[amount]; - for (int i = 0; i < amount; i++) { - grammaInfo[i] = bufferedReader.readLine(); - } - } - - protected void readRules(BufferedReader bufferedReader) throws IOException { - String s; - Integer amount; - s = bufferedReader.readLine(); - amount = Integer.valueOf(s); - rules = new Heuristic[amount][]; - for (int i = 0; i < amount; i++) { - String s1 = bufferedReader.readLine(); - Integer ruleLenght = Integer.valueOf(s1); - rules[i] = new Heuristic[ruleLenght]; - for (int j = 0; j < ruleLenght; j++) { - rules[i][j] = new Heuristic(bufferedReader.readLine()); - } - } - } - - private void readRulesId(BufferedReader bufferedReader, Integer amount) throws IOException { - rulesId = new short[amount]; - for (int i = 0; i < amount; i++) { - String s1 = bufferedReader.readLine(); - rulesId[i] = Short.valueOf(s1); - } - } - - private void readSeparators(BufferedReader bufferedReader, Integer amount) throws IOException { - HashSet intetger = new HashSet(); - separators = new int[amount][]; - for (int i = 0; i < amount; i++) { - String s1 = bufferedReader.readLine(); - Integer wordLenght = Integer.valueOf(s1); - separators[i] = new int[wordLenght]; - for (int j = 0; j < wordLenght; j++) { - separators[i][j] = Integer.valueOf(bufferedReader.readLine()); - } - intetger.add(separators[i][0]); - } - } - - protected String revertWord(String s) { - String result = ""; - for (int i = 1; i <= s.length(); i++) { - result += s.charAt(s.length() - i); - } - return result; - } -} +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.morphology; + +import java.util.List; + + +public interface Morphology { + + List getMorhInfo(String s); + +} diff --git a/morph/src/main/java/org/apache/lucene/morphology/MorphologyImpl.java b/morph/src/main/java/org/apache/lucene/morphology/MorphologyImpl.java new file mode 100644 index 0000000..ed1b1ec --- /dev/null +++ b/morph/src/main/java/org/apache/lucene/morphology/MorphologyImpl.java @@ -0,0 +1,210 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.morphology; + + +import java.io.*; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; + + +public class MorphologyImpl implements Morphology { + protected int[][] separators; + protected short[] rulesId; + protected Heuristic[][] rules; + protected String[] grammaInfo; + protected LetterDecoderEncoder decoderEncoder; + + + public MorphologyImpl(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException { + readFromFile(fileName); + this.decoderEncoder = decoderEncoder; + } + + public MorphologyImpl(InputStream inputStream, LetterDecoderEncoder decoderEncoder) throws IOException { + readFromInputStream(inputStream); + this.decoderEncoder = decoderEncoder; + } + + public MorphologyImpl(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) { + this.separators = separators; + this.rulesId = rulesId; + this.rules = rules; + this.grammaInfo = grammaInfo; + } + + public int[][] getSeparators() { + return separators; + } + + public short[] getRulesId() { + return rulesId; + } + + public Heuristic[][] getRules() { + return rules; + } + + public String[] getGrammaInfo() { + return grammaInfo; + } + + public List getMorhInfo(String s) { + ArrayList result = new ArrayList(); + int[] ints = decoderEncoder.encodeToArray(revertWord(s)); + int ruleId = findRuleId(ints); + for (Heuristic h : rules[rulesId[ruleId]]) { + result.add(h.transofrmWord(s) + "|" + grammaInfo[h.getFormMorphInfo()]); + } + return result; + } + + protected int findRuleId(int[] ints) { + int low = 0; + int high = separators.length - 1; + int mid = 0; + while (low <= high) { + mid = (low + high) >>> 1; + int[] midVal = separators[mid]; + + int comResult = compareToInts(ints, midVal); + if (comResult > 0) + low = mid + 1; + else if (comResult < 0) + high = mid - 1; + else + break; + } + if (compareToInts(ints, separators[mid]) >= 0) { + return mid; + } else { + return mid - 1; + } + + } + + private int compareToInts(int[] i1, int[] i2) { + int minLength = Math.min(i1.length, i2.length); + for (int i = 0; i < minLength; i++) { + int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1); + if (i3 != 0) return i3; + } + return i1.length - i2.length; + } + + public void writeToFile(String fileName) throws IOException { + OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8"); + writer.write(separators.length + "\n"); + for (int[] i : separators) { + writer.write(i.length + "\n"); + for (int j : i) { + writer.write(j + "\n"); + } + } + for (short i : rulesId) { + writer.write(i + "\n"); + } + writer.write(rules.length + "\n"); + for (Heuristic[] heuristics : rules) { + writer.write(heuristics.length + "\n"); + for (Heuristic heuristic : heuristics) { + writer.write(heuristic.toString() + "\n"); + } + } + writer.write(grammaInfo.length + "\n"); + for (String s : grammaInfo) { + writer.write(s + "\n"); + } + writer.close(); + } + + public void readFromFile(String fileName) throws IOException { + FileInputStream inputStream = new FileInputStream(fileName); + readFromInputStream(inputStream); + } + + private void readFromInputStream(InputStream inputStream) throws IOException { + BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8")); + String s = bufferedReader.readLine(); + Integer amount = Integer.valueOf(s); + + readSeparators(bufferedReader, amount); + + readRulesId(bufferedReader, amount); + + readRules(bufferedReader); + readGrammaInfo(bufferedReader); + bufferedReader.close(); + } + + private void readGrammaInfo(BufferedReader bufferedReader) throws IOException { + String s; + Integer amount; + s = bufferedReader.readLine(); + amount = Integer.valueOf(s); + grammaInfo = new String[amount]; + for (int i = 0; i < amount; i++) { + grammaInfo[i] = bufferedReader.readLine(); + } + } + + protected void readRules(BufferedReader bufferedReader) throws IOException { + String s; + Integer amount; + s = bufferedReader.readLine(); + amount = Integer.valueOf(s); + rules = new Heuristic[amount][]; + for (int i = 0; i < amount; i++) { + String s1 = bufferedReader.readLine(); + Integer ruleLenght = Integer.valueOf(s1); + rules[i] = new Heuristic[ruleLenght]; + for (int j = 0; j < ruleLenght; j++) { + rules[i][j] = new Heuristic(bufferedReader.readLine()); + } + } + } + + private void readRulesId(BufferedReader bufferedReader, Integer amount) throws IOException { + rulesId = new short[amount]; + for (int i = 0; i < amount; i++) { + String s1 = bufferedReader.readLine(); + rulesId[i] = Short.valueOf(s1); + } + } + + private void readSeparators(BufferedReader bufferedReader, Integer amount) throws IOException { + HashSet intetger = new HashSet(); + separators = new int[amount][]; + for (int i = 0; i < amount; i++) { + String s1 = bufferedReader.readLine(); + Integer wordLenght = Integer.valueOf(s1); + separators[i] = new int[wordLenght]; + for (int j = 0; j < wordLenght; j++) { + separators[i][j] = Integer.valueOf(bufferedReader.readLine()); + } + intetger.add(separators[i][0]); + } + } + + protected String revertWord(String s) { + String result = ""; + for (int i = 1; i <= s.length(); i++) { + result += s.charAt(s.length() - i); + } + return result; + } +} diff --git a/morph/src/main/java/org/apache/lucene/morphology/MorphologyWithPrefix.java b/morph/src/main/java/org/apache/lucene/morphology/MorphologyWithPrefix.java deleted file mode 100644 index 7cc01c8..0000000 --- a/morph/src/main/java/org/apache/lucene/morphology/MorphologyWithPrefix.java +++ /dev/null @@ -1,96 +0,0 @@ -/** - * Copyright 2009 Alexander Kuznetsov - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.morphology; - -import java.io.IOException; -import java.io.InputStream; -import java.io.BufferedReader; -import java.io.InputStreamReader; -import java.util.*; - - -public class MorphologyWithPrefix extends Morphology { - private Map prefixRuleMap = new HashMap(); - - public MorphologyWithPrefix(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException { - super(fileName, decoderEncoder); - } - - public MorphologyWithPrefix(InputStream morphFormInputStream, LetterDecoderEncoder decoderEncoder) throws IOException { - super(morphFormInputStream, decoderEncoder); - } - - public MorphologyWithPrefix(InputStream morphFormInputStream,InputStream prefixesInputStream, LetterDecoderEncoder decoderEncoder) throws IOException { - super(morphFormInputStream, decoderEncoder); - readPrefixes(prefixesInputStream); - } - - private void readPrefixes(InputStream inputStream) throws IOException { - BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8")); - Integer prefixAmount = Integer.parseInt(bufferedReader.readLine()); - for(int i = 0; i < prefixAmount;i++){ - PrefixRule prefixRule = readPrefix(bufferedReader); - prefixRuleMap.put(prefixRule.getHashString(),prefixRule); - } - bufferedReader.close(); - } - - private PrefixRule readPrefix(BufferedReader bufferedReader) throws IOException { - PrefixRule prefixRule = new PrefixRule(); - String s = bufferedReader.readLine(); - prefixRule.setPrefix(s); - s = bufferedReader.readLine(); - prefixRule.setLastLetter(s.charAt(0)); - HashSet morph = new HashSet(); - int formAmount = Integer.valueOf(bufferedReader.readLine()); - for(int i = 0; i < formAmount; i++){ - morph.add(Short.valueOf(bufferedReader.readLine())); - } - prefixRule.setForms(morph); - return prefixRule; - } - - public MorphologyWithPrefix(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) { - super(separators, rulesId, rules, grammaInfo); - } - - @Override - public List getMorhInfo(String s) { - if (prefixRuleMap.size() == 0 || s.length() < 4) { - return super.getMorhInfo(s); - } - String ruleIndex = "" + s.charAt(0) + s.charAt(s.length() - 1); - PrefixRule prefixRule = prefixRuleMap.get(ruleIndex); - if (prefixRule == null) { - return super.getMorhInfo(s); - } - if (!s.startsWith(prefixRule.getPrefix())) { - return super.getMorhInfo(s); - } - String sWithoutPrefix = s.substring(prefixRule.getPrefix().length()); - - int[] ints = decoderEncoder.encodeToArray(revertWord(sWithoutPrefix)); - int ruleId = findRuleId(ints); - ArrayList result = new ArrayList(); - for (Heuristic h : rules[rulesId[ruleId]]) { - //String morphInfo = grammaInfo[]; - if(prefixRule.getForms().contains(h.getFormMorphInfo())){ - result.add(createForm(h.transofrmWord(sWithoutPrefix),"pr")); - } - } - return result.size() > 0 ? result : super.getMorhInfo(s); - } -} diff --git a/morph/src/main/java/org/apache/lucene/morphology/PrefixRule.java b/morph/src/main/java/org/apache/lucene/morphology/PrefixRule.java deleted file mode 100644 index 8536869..0000000 --- a/morph/src/main/java/org/apache/lucene/morphology/PrefixRule.java +++ /dev/null @@ -1,76 +0,0 @@ -/** - * Copyright 2009 Alexander Kuznetsov - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.morphology; - -import java.io.Serializable; -import java.util.HashSet; - - -public class PrefixRule implements Serializable { - private Character lastLetter; - private String prefix; - private HashSet forms; - - public Character getLastLetter() { - return lastLetter; - } - - public void setLastLetter(Character lastLetter) { - this.lastLetter = lastLetter; - } - - public String getPrefix() { - return prefix; - } - - public void setPrefix(String prefix) { - this.prefix = prefix; - } - - public HashSet getForms() { - return forms; - } - - public void setForms(HashSet forms) { - this.forms = forms; - } - - public String getHashString() { - return "" + prefix.charAt(0) + lastLetter; - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - PrefixRule that = (PrefixRule) o; - - if (forms != null ? !forms.equals(that.forms) : that.forms != null) return false; - if (lastLetter != null ? !lastLetter.equals(that.lastLetter) : that.lastLetter != null) return false; - if (prefix != null ? !prefix.equals(that.prefix) : that.prefix != null) return false; - - return true; - } - - @Override - public int hashCode() { - int result = lastLetter != null ? lastLetter.hashCode() : 0; - result = 31 * result + (prefix != null ? prefix.hashCode() : 0); - result = 31 * result + (forms != null ? forms.hashCode() : 0); - return result; - } -} diff --git a/russian/src/main/java/org/apache/lucene/morphology/russian/RussianLuceneMorphology.java b/russian/src/main/java/org/apache/lucene/morphology/russian/RussianLuceneMorphology.java index 9d50878..9acb266 100644 --- a/russian/src/main/java/org/apache/lucene/morphology/russian/RussianLuceneMorphology.java +++ b/russian/src/main/java/org/apache/lucene/morphology/russian/RussianLuceneMorphology.java @@ -22,6 +22,6 @@ import java.io.IOException; public class RussianLuceneMorphology extends LuceneMorphology { public RussianLuceneMorphology() throws IOException { - super(RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"),RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/prefixes.info"), new RussianLetterDecoderEncoder()); + super(RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder()); } } \ No newline at end of file diff --git a/russian/src/main/java/org/apache/lucene/morphology/russian/RussianMorphology.java b/russian/src/main/java/org/apache/lucene/morphology/russian/RussianMorphology.java index 3d47fcd..6655521 100644 --- a/russian/src/main/java/org/apache/lucene/morphology/russian/RussianMorphology.java +++ b/russian/src/main/java/org/apache/lucene/morphology/russian/RussianMorphology.java @@ -15,11 +15,11 @@ */ package org.apache.lucene.morphology.russian; -import org.apache.lucene.morphology.Morphology; +import org.apache.lucene.morphology.MorphologyImpl; import java.io.IOException; -public class RussianMorphology extends Morphology { +public class RussianMorphology extends MorphologyImpl { public RussianMorphology() throws IOException { super(RussianMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder()); diff --git a/russian/src/main/java/org/apache/lucene/morphology/russian/TestSpeed.java b/russian/src/main/java/org/apache/lucene/morphology/russian/TestSpeed.java deleted file mode 100644 index e0b33f8..0000000 --- a/russian/src/main/java/org/apache/lucene/morphology/russian/TestSpeed.java +++ /dev/null @@ -1,60 +0,0 @@ -/** - * Copyright 2009 Alexander Kuznetsov - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.morphology.russian; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Token; - -import java.io.IOException; -import java.io.FileInputStream; -import java.io.InputStreamReader; -import java.util.HashSet; - -/** - * Created by IntelliJ IDEA. - * User: akuznetsov - * Date: 31.10.2009 - * Time: 14:01:11 - * To change this template use File | Settings | File Templates. - */ -public class TestSpeed { - - public static void main(String[] args) throws IOException { - RussianAnalayzer russianAnalayzer = new RussianAnalayzer(); - bookProccess(russianAnalayzer, "C:/tmp/_Aleksandr_Suhov_Tanets_na_raskalennyih_uglyah1.fb2"); - Long stat = System.currentTimeMillis(); - bookProccess(russianAnalayzer, "C:/tmp/_Aleksandr_Suhov_Tanets_na_raskalennyih_uglyah1.fb2"); - System.out.println("Done in " + (System.currentTimeMillis() - stat)); - } - - private static void bookProccess(RussianAnalayzer russianAnalayzer, String bookName) throws IOException { - FileInputStream inputStream = new FileInputStream(bookName); - TokenStream tokenStream = russianAnalayzer.tokenStream(null,new InputStreamReader(inputStream,"UTF-8")); - final Token reusableToken = new Token(); - long count = 0; - Token nextToken; - for (; ;) { - nextToken = tokenStream.next(reusableToken); - // System.out.println(" " + nextToken.term()); - count++; - if (nextToken == null) { - break; - } - - } - //System.out.println("Words " + count); - } -} diff --git a/russian/src/main/resources/org/apache/lucene/morphology/russian/prefixes.info b/russian/src/main/resources/org/apache/lucene/morphology/russian/prefixes.info deleted file mode 100644 index 3a9f127..0000000 --- a/russian/src/main/resources/org/apache/lucene/morphology/russian/prefixes.info +++ /dev/null @@ -1,96 +0,0 @@ -11 -наи -е -8 -258 -255 -289 -252 -292 -262 -296 -286 -наи -и -2 -263 -297 -наи -ю -4 -250 -249 -283 -284 -по -й -5 -250 -251 -248 -247 -269 -по -е -3 -255 -252 -269 -наи -й -12 -239 -273 -250 -251 -248 -277 -247 -282 -281 -243 -285 -284 -наи -о -6 -274 -253 -276 -287 -242 -240 -наи -м -10 -256 -290 -257 -291 -279 -278 -294 -260 -244 -245 -наи -х -6 -259 -293 -261 -295 -264 -298 -наи -я -2 -246 -280 -наи -у -4 -275 -254 -288 -241 diff --git a/russian/src/test/java/org/apache/lucene/morphology/russian/RussianLuceneMorphTest.java b/russian/src/test/java/org/apache/lucene/morphology/russian/RussianLuceneMorphTest.java index 8df5b00..9337521 100644 --- a/russian/src/test/java/org/apache/lucene/morphology/russian/RussianLuceneMorphTest.java +++ b/russian/src/test/java/org/apache/lucene/morphology/russian/RussianLuceneMorphTest.java @@ -33,7 +33,7 @@ public class RussianLuceneMorphTest { @Before public void setUp() throws IOException { - luceneMorph = new RussianLuceneMorphology(); + luceneMorph = new LuceneMorphology(this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder()); } @Test diff --git a/russian/src/test/resources/org/apache/lucene/morphology/russian/russian-morphology-test.txt b/russian/src/test/resources/org/apache/lucene/morphology/russian/russian-morphology-test.txt index 0cc1e23..c775e7d 100644 --- a/russian/src/test/resources/org/apache/lucene/morphology/russian/russian-morphology-test.txt +++ b/russian/src/test/resources/org/apache/lucene/morphology/russian/russian-morphology-test.txt @@ -1,4 +1,3 @@ -наилучший хороший еду еда ехать тестов тест вина вино вина @@ -17,8 +16,4 @@ тосклив тоскливый лучший хороший на на -тест тест тесто -спам спам -спама спам -наигранный наигранный -наивный наивный +тест тест тесто \ No newline at end of file