Roll back the wrong version of morphology; add an interface for morphology
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@88 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
		| @@ -31,9 +31,9 @@ import java.util.*; | ||||
| public class DictonaryReader { | ||||
|     private String fileName; | ||||
|     private String fileEncoding = "windows-1251"; | ||||
|     protected List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>(); | ||||
|     protected List<List<String>> wordPrefixes = new ArrayList<List<String>>(); | ||||
|     protected Set<String> ingnoredForm = new HashSet<String>(); | ||||
|     private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>(); | ||||
|     private List<List<String>> wordPrefixes = new ArrayList<List<String>>(); | ||||
|     private Set<String> ingnoredForm = new HashSet<String>(); | ||||
|  | ||||
|     public DictonaryReader(String fileName, Set<String> ingnoredForm) { | ||||
|         this.fileName = fileName; | ||||
| @@ -57,7 +57,7 @@ public class DictonaryReader { | ||||
|     } | ||||
|  | ||||
|  | ||||
|     protected void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException { | ||||
|     private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException { | ||||
|         String s = reader.readLine(); | ||||
|         int count = Integer.valueOf(s); | ||||
|         for (int i = 0; i < count; i++) { | ||||
| @@ -81,7 +81,7 @@ public class DictonaryReader { | ||||
|     } | ||||
|  | ||||
|  | ||||
|     protected void sckipBlock(BufferedReader reader) throws IOException { | ||||
|     private void sckipBlock(BufferedReader reader) throws IOException { | ||||
|         String s = reader.readLine(); | ||||
|         int count = Integer.valueOf(s); | ||||
|         for (int i = 0; i < count; i++) { | ||||
| @@ -90,7 +90,7 @@ public class DictonaryReader { | ||||
|     } | ||||
|  | ||||
|  | ||||
|     protected void readPrefix(BufferedReader reader) throws IOException { | ||||
|     private void readPrefix(BufferedReader reader) throws IOException { | ||||
|         String s = reader.readLine(); | ||||
|         int count = Integer.valueOf(s); | ||||
|         for (int i = 0; i < count; i++) { | ||||
| @@ -99,7 +99,7 @@ public class DictonaryReader { | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     protected  void readFlexias(BufferedReader reader) throws IOException { | ||||
|     private void readFlexias(BufferedReader reader) throws IOException { | ||||
|         String s = reader.readLine(); | ||||
|         int count = Integer.valueOf(s); | ||||
|         for (int i = 0; i < count; i++) { | ||||
| @@ -112,7 +112,7 @@ public class DictonaryReader { | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     protected  void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) { | ||||
|     private void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) { | ||||
|         String[] fl = line.split("\\*"); | ||||
|         // we inored all forms thats | ||||
|         if (fl.length == 3) { | ||||
|   | ||||
| @@ -60,28 +60,6 @@ public class FlexiaModel { | ||||
|  | ||||
|     @Override | ||||
|     public String toString() { | ||||
|         return prefix + " " + suffix + " " + code; | ||||
|     } | ||||
|  | ||||
|     @Override | ||||
|     public boolean equals(Object o) { | ||||
|         if (this == o) return true; | ||||
|         if (o == null || getClass() != o.getClass()) return false; | ||||
|  | ||||
|         FlexiaModel that = (FlexiaModel) o; | ||||
|  | ||||
|         if (code != null ? !code.equals(that.code) : that.code != null) return false; | ||||
|         if (prefix != null ? !prefix.equals(that.prefix) : that.prefix != null) return false; | ||||
|         if (suffix != null ? !suffix.equals(that.suffix) : that.suffix != null) return false; | ||||
|  | ||||
|         return true; | ||||
|     } | ||||
|  | ||||
|     @Override | ||||
|     public int hashCode() { | ||||
|         int result = code != null ? code.hashCode() : 0; | ||||
|         result = 31 * result + (suffix != null ? suffix.hashCode() : 0); | ||||
|         result = 31 * result + (prefix != null ? prefix.hashCode() : 0); | ||||
|         return result; | ||||
|         return prefix + " " + suffix; | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -1,139 +0,0 @@ | ||||
| /** | ||||
|  * Copyright 2009 Alexander Kuznetsov | ||||
|  * | ||||
|  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
|  * you may not use this file except in compliance with the License. | ||||
|  * You may obtain a copy of the License at | ||||
|  * | ||||
|  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an "AS IS" BASIS, | ||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|  * See the License for the specific language governing permissions and | ||||
|  * limitations under the License. | ||||
|  */ | ||||
| package org.apache.lucene.morphology.dictionary; | ||||
|  | ||||
| import org.apache.lucene.morphology.PrefixRule; | ||||
|  | ||||
| import java.util.*; | ||||
| import java.io.*; | ||||
|  | ||||
|  | ||||
| public class PrefixesRulesBuilder extends DictonaryReader { | ||||
|     private GrammaReader grammaInfo; | ||||
|  | ||||
|     private Map<FlexiaModel,Set<FlexiaModel>> rules = new HashMap<FlexiaModel,Set<FlexiaModel>>(); | ||||
|  | ||||
|     public PrefixesRulesBuilder(String fileName, String fileEncoding, Set<String> ingnoredForm) throws IOException { | ||||
|         super(fileName, fileEncoding, ingnoredForm); | ||||
|         grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab"); | ||||
|     } | ||||
|  | ||||
|     @Override | ||||
|     public void proccess(WordProccessor wordProccessor) throws IOException { | ||||
|         super.proccess(wordProccessor); | ||||
|         System.out.println(rules.size()); | ||||
|         System.out.println(rules); | ||||
|     } | ||||
|  | ||||
|     public List<PrefixRule> getPrefixRules(){ | ||||
|         List<PrefixRule> prefixRules = new ArrayList<PrefixRule>(); | ||||
|         for(FlexiaModel key:rules.keySet()){ | ||||
|             PrefixRule prefixRule = new PrefixRule(); | ||||
|             prefixRule.setPrefix(key.getPrefix()); | ||||
|             prefixRule.setLastLetter(key.getSuffix().charAt(0)); | ||||
|             HashSet<Short> map = new HashSet<Short>(); | ||||
|             for(FlexiaModel fm:rules.get(key)){ | ||||
|                 int gi = grammaInfo.getGrammInversIndex().get(fm.getCode()); | ||||
|                 map.add((short) gi); | ||||
|             } | ||||
|             prefixRule.setForms(map); | ||||
|             prefixRules.add(prefixRule); | ||||
|         } | ||||
|         return prefixRules; | ||||
|     } | ||||
|  | ||||
|     @Override | ||||
|     protected void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException { | ||||
|         sckipBlock(reader); | ||||
|     } | ||||
|  | ||||
|  | ||||
|  | ||||
|     @Override | ||||
|     protected void readPrefix(BufferedReader reader) throws IOException { | ||||
|         sckipBlock(reader); | ||||
|     } | ||||
|  | ||||
|     @Override | ||||
|     protected void readFlexias(BufferedReader reader) throws IOException { | ||||
|         super.readFlexias(reader); | ||||
|         //todo research flesias | ||||
|         for(List<FlexiaModel> fmList:wordsFlexias){ | ||||
|             research(fmList); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     private void research(List<FlexiaModel> models) { | ||||
|         for(FlexiaModel fm:models){ | ||||
|             if(fm.getPrefix().length() > 0){ | ||||
|                 testFlexia(models, fm); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     private void testFlexia(List<FlexiaModel> models, FlexiaModel fm) { | ||||
|         for(FlexiaModel com:models){ | ||||
|             if(com.getSuffix().equals(fm.getSuffix()) && com.getPrefix().length() == 0){ | ||||
|                 Set<FlexiaModel> models1 = rules.get(convertForKey(fm)); | ||||
|                 if(models1 == null){ | ||||
|                     models1 = new HashSet<FlexiaModel>(); | ||||
|                     rules.put(convertForKey(fm),models1); | ||||
|                 } | ||||
|                 models1.add(convert(com)); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     private FlexiaModel convert(FlexiaModel fm){ | ||||
|         String suf = fm.getSuffix(); | ||||
|         //if(suf.length() == 1) System.out.println(fm); | ||||
|         return new FlexiaModel(fm.getCode(),""+ suf.charAt(suf.length()-1),fm.getPrefix()); | ||||
|     } | ||||
|  | ||||
|     private FlexiaModel convertForKey(FlexiaModel fm){ | ||||
|         String suf = fm.getSuffix(); | ||||
|         //if(suf.length() == 1) System.out.println(fm); | ||||
|         return new FlexiaModel("pr",""+ suf.charAt(suf.length()-1),fm.getPrefix()); | ||||
|     } | ||||
|  | ||||
|     protected void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) { | ||||
|         String[] fl = line.split("\\*"); | ||||
|         if (fl.length == 3) { | ||||
|             flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase())); | ||||
|         } | ||||
|         if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), "")); | ||||
|     } | ||||
|  | ||||
|     public void savePrefixes(String fileName) throws IOException { | ||||
|         OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8"); | ||||
|         List<PrefixRule> prefixRuleList = getPrefixRules(); | ||||
|         writer.write(prefixRuleList.size()+"\n"); | ||||
|         for(PrefixRule pr: prefixRuleList){ | ||||
|             writePrefixRule(writer, pr); | ||||
|         } | ||||
|         writer.close(); | ||||
|     } | ||||
|  | ||||
|     private void writePrefixRule(OutputStreamWriter writer, PrefixRule pr) throws IOException { | ||||
|         writer.write(pr.getPrefix()+"\n"); | ||||
|         writer.write(pr.getLastLetter()+"\n"); | ||||
|         HashSet<Short> formInfo = pr.getForms(); | ||||
|         writer.write(formInfo.size()+"\n"); | ||||
|         for(Short s:formInfo){ | ||||
|             writer.write(s+"\n"); | ||||
|         } | ||||
|     } | ||||
| } | ||||
| @@ -19,7 +19,7 @@ package org.apache.lucene.morphology.dictionary; | ||||
|  | ||||
| import org.apache.lucene.morphology.Heuristic; | ||||
| import org.apache.lucene.morphology.LetterDecoderEncoder; | ||||
| import org.apache.lucene.morphology.Morphology; | ||||
| import org.apache.lucene.morphology.MorphologyImpl; | ||||
|  | ||||
| import java.io.IOException; | ||||
| import java.util.*; | ||||
| @@ -119,7 +119,7 @@ public class StatiticsCollector implements WordProccessor { | ||||
|                 prevSet = currentSet; | ||||
|             } | ||||
|         } | ||||
|         Morphology morphology = new Morphology(ints, rulesId, heuristics, grammaReader.getGrammaInfoAsArray()); | ||||
|         MorphologyImpl morphology = new MorphologyImpl(ints, rulesId, heuristics, grammaReader.getGrammaInfoAsArray()); | ||||
|         morphology.writeToFile(fileName); | ||||
|     } | ||||
|  | ||||
|   | ||||
| @@ -1,39 +0,0 @@ | ||||
| /** | ||||
|  * Copyright 2009 Alexander Kuznetsov  | ||||
|  * | ||||
|  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
|  * you may not use this file except in compliance with the License. | ||||
|  * You may obtain a copy of the License at | ||||
|  * | ||||
|  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an "AS IS" BASIS, | ||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|  * See the License for the specific language governing permissions and | ||||
|  * limitations under the License. | ||||
|  */ | ||||
|  | ||||
| package org.apache.lucene.morphology.generator; | ||||
|  | ||||
| import org.apache.lucene.morphology.dictionary.*; | ||||
|  | ||||
| import java.io.IOException; | ||||
| import java.util.HashSet; | ||||
|  | ||||
|  | ||||
| public class RussianPrefixesBuilder { | ||||
|     public static void main(String[] args) throws IOException { | ||||
|  | ||||
|         PrefixesRulesBuilder dictonaryReader = new PrefixesRulesBuilder("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", "windows-1251",new HashSet<String>()); | ||||
|  | ||||
|  | ||||
|         dictonaryReader.proccess(new WordProccessor() { | ||||
|             public void proccess(WordCard wordCard) throws IOException { | ||||
|  | ||||
|             } | ||||
|         }); | ||||
|  | ||||
|         dictonaryReader.savePrefixes("russian/src/main/resources/org/apache/lucene/morphology/russian/prefixes.info"); | ||||
|     } | ||||
| } | ||||
| @@ -1,372 +0,0 @@ | ||||
| [ ть | ||||
|   у | ||||
|   ем | ||||
|   ешь | ||||
|   ете | ||||
|   ет | ||||
|   ут | ||||
|  | ||||
|   ла | ||||
|   ло | ||||
|   ли | ||||
|   я | ||||
|   ши | ||||
|   ем | ||||
|   емте | ||||
|  по ай | ||||
|   ь | ||||
|  по айте | ||||
|   ьте | ||||
|   ущий | ||||
|   ущего | ||||
|   ущему | ||||
|   ущего | ||||
|   ущий | ||||
|   ущим | ||||
|   ущем | ||||
|   ущая | ||||
|   ущей | ||||
|   ущей | ||||
|   ущую | ||||
|   ущей | ||||
|   ущею | ||||
|   ущей | ||||
|   ущее | ||||
|   ущего | ||||
|   ущему | ||||
|   ущее | ||||
|   ущим | ||||
|   ущем | ||||
|   ущие | ||||
|   ущих | ||||
|   ущим | ||||
|   ущих | ||||
|   ущие | ||||
|   ущими | ||||
|   ущих | ||||
|   ший | ||||
|   шего | ||||
|   шему | ||||
|   шего | ||||
|   ший | ||||
|   шим | ||||
|   шем | ||||
|   шая | ||||
|   шей | ||||
|   шей | ||||
|   шую | ||||
|   шей | ||||
|   шею | ||||
|   шей | ||||
|   шее | ||||
|   шего | ||||
|   шему | ||||
|   шее | ||||
|   шим | ||||
|   шем | ||||
|   шие | ||||
|   ших | ||||
|   шим | ||||
|   ших | ||||
|   шие | ||||
|   шими | ||||
|   ших] | ||||
| [ большой | ||||
|   большого | ||||
|   большому | ||||
|   большого | ||||
|   большой | ||||
|   большим | ||||
|   большом | ||||
|   большая | ||||
|   большой | ||||
|   большой | ||||
|   большую | ||||
|   большой | ||||
|   большою | ||||
|   большой | ||||
|   большое | ||||
|   большого | ||||
|   большому | ||||
|   большое | ||||
|   большим | ||||
|   большом | ||||
|   большие | ||||
|   больших | ||||
|   большим | ||||
|   больших | ||||
|   большие | ||||
|   большими | ||||
|   больших | ||||
|   велик | ||||
|   велика | ||||
|   велико | ||||
|   велики | ||||
|   больше | ||||
|  по больше | ||||
|  наи больший | ||||
|  наи большего | ||||
|  наи большему | ||||
|  наи большего | ||||
|  наи больший | ||||
|  наи большим | ||||
|  наи большем | ||||
|  наи большая | ||||
|  наи большей | ||||
|  наи большей | ||||
|  наи большую | ||||
|  наи большей | ||||
|  наи большею | ||||
|  наи большей | ||||
|  наи большее | ||||
|  наи большего | ||||
|  наи большему | ||||
|  наи большее | ||||
|  наи большим | ||||
|  наи большем | ||||
|  наи большие | ||||
|  наи больших | ||||
|  наи большим | ||||
|  наи больших | ||||
|  наи большие | ||||
|  наи большими | ||||
|  наи больших] | ||||
| [ вероятный | ||||
|   вероятного | ||||
|   вероятному | ||||
|   вероятного | ||||
|   вероятный | ||||
|   вероятным | ||||
|   вероятном | ||||
|   вероятная | ||||
|   вероятной | ||||
|   вероятной | ||||
|   вероятную | ||||
|   вероятной | ||||
|   вероятною | ||||
|   вероятной | ||||
|   вероятное | ||||
|   вероятного | ||||
|   вероятному | ||||
|   вероятное | ||||
|   вероятным | ||||
|   вероятном | ||||
|   вероятные | ||||
|   вероятных | ||||
|   вероятным | ||||
|   вероятных | ||||
|   вероятные | ||||
|   вероятными | ||||
|   вероятных | ||||
|   вероятен | ||||
|   вероятна | ||||
|   вероятно | ||||
|   вероятны | ||||
|   вероятнее | ||||
|   вероятней | ||||
|  по вероятнее | ||||
|  по вероятней | ||||
|   вероятнейший | ||||
|  наи невероятнейший | ||||
|   вероятнейшего | ||||
|  наи невероятнейшего | ||||
|   вероятнейшему | ||||
|  наи невероятнейшему | ||||
|   вероятнейшего | ||||
|  наи невероятнейшего | ||||
|   вероятнейший | ||||
|  наи невероятнейший | ||||
|   вероятнейшим | ||||
|  наи невероятнейшим | ||||
|   вероятнейшем | ||||
|  наи невероятнейшем | ||||
|   вероятнейшая | ||||
|  наи невероятнейшая | ||||
|   вероятнейшей | ||||
|  наи невероятнейшей | ||||
|   вероятнейшей | ||||
|  наи невероятнейшей | ||||
|   вероятнейшую | ||||
|  наи невероятнейшую | ||||
|   вероятнейшей | ||||
|   вероятнейшею | ||||
|  наи невероятнейшей | ||||
|  наи невероятнейшею | ||||
|   вероятнейшей | ||||
|  наи невероятнейшей | ||||
|   вероятнейшее | ||||
|  наи невероятнейшее | ||||
|   вероятнейшего | ||||
|  наи невероятнейшего | ||||
|   вероятнейшему | ||||
|  наи невероятнейшему | ||||
|   вероятнейшее | ||||
|  наи невероятнейшее | ||||
|   вероятнейшим | ||||
|  наи невероятнейшим | ||||
|   вероятнейшем | ||||
|  наи невероятнейшем | ||||
|   вероятнейшие | ||||
|  наи невероятнейшие | ||||
|   вероятнейших | ||||
|  наи невероятнейших | ||||
|   вероятнейшим | ||||
|  наи невероятнейшим | ||||
|   вероятнейших | ||||
|  наи невероятнейших | ||||
|   вероятнейшие | ||||
|  наи невероятнейшие | ||||
|   вероятнейшими | ||||
|  наи невероятнейшими | ||||
|   вероятнейших | ||||
|  наи невероятнейших] | ||||
| [ аленький | ||||
|   аленького | ||||
|   аленькому | ||||
|   аленького | ||||
|   аленький | ||||
|   аленьким | ||||
|   аленьком | ||||
|   аленькая | ||||
|   аленькой | ||||
|   аленькой | ||||
|   аленькую | ||||
|   аленькой | ||||
|   аленькою | ||||
|   аленькой | ||||
|   аленькое | ||||
|   аленького | ||||
|   аленькому | ||||
|   аленькое | ||||
|   аленьким | ||||
|   аленьком | ||||
|   аленькие | ||||
|   аленьких | ||||
|   аленьким | ||||
|   аленьких | ||||
|   аленькие | ||||
|   аленькими | ||||
|   аленьких | ||||
|   ал | ||||
|   ала | ||||
|   ало | ||||
|   алы | ||||
|   еньше | ||||
|  по еньше | ||||
|   алейший | ||||
|  наи еньший | ||||
|   алейшего | ||||
|  наи еньшего | ||||
|   алейшему | ||||
|  наи еньшему | ||||
|   алейшего | ||||
|  наи еньшего | ||||
|   алейший | ||||
|  наи еньший | ||||
|   алейшим | ||||
|  наи еньшим | ||||
|   алейшем | ||||
|  наи еньшем | ||||
|   алейшая | ||||
|  наи еньшая | ||||
|   алейшей | ||||
|  наи еньшей | ||||
|   алейшей | ||||
|  наи еньшей | ||||
|   алейшую | ||||
|  наи еньшую | ||||
|   алейшей | ||||
|   алейшею | ||||
|  наи еньшей | ||||
|  наи еньшею | ||||
|   алейшей | ||||
|  наи еньшей | ||||
|   алейшее | ||||
|  наи еньшее | ||||
|   алейшего | ||||
|  наи еньшего | ||||
|   алейшему | ||||
|  наи еньшему | ||||
|   алейшее | ||||
|  наи еньшее | ||||
|   алейшим | ||||
|  наи еньшим | ||||
|   алейшем | ||||
|  наи еньшем | ||||
|   алейшие | ||||
|  наи еньшие | ||||
|   алейших | ||||
|  наи еньших | ||||
|   алейшим | ||||
|  наи еньшим | ||||
|   алейших | ||||
|  наи еньших | ||||
|   алейшие | ||||
|  наи еньшие | ||||
|   алейшими | ||||
|  наи еньшими | ||||
|   алейших | ||||
|  наи еньших] | ||||
| [ ьный | ||||
|   ьного | ||||
|   ьному | ||||
|   ьного | ||||
|   ьный | ||||
|   ьным | ||||
|   ьном | ||||
|   ьная | ||||
|   ьной | ||||
|   ьной | ||||
|   ьную | ||||
|   ьной | ||||
|   ьною | ||||
|   ьной | ||||
|   ьное | ||||
|   ьного | ||||
|   ьному | ||||
|   ьное | ||||
|   ьным | ||||
|   ьном | ||||
|   ьные | ||||
|   ьных | ||||
|   ьным | ||||
|   ьных | ||||
|   ьные | ||||
|   ьными | ||||
|   ьных | ||||
|   ен | ||||
|   ьна | ||||
|   ьно | ||||
|   ьны | ||||
|   ьны | ||||
|   ьнее | ||||
|   ьней | ||||
|  по ьнее | ||||
|  по ьней | ||||
|  наи ьнейший | ||||
|  наи ьнейшего | ||||
|  наи ьнейшему | ||||
|  наи ьнейшего | ||||
|  наи ьнейший | ||||
|  наи ьнейшим | ||||
|  наи ьнейшем | ||||
|  наи ьнейшая | ||||
|  наи ьнейшей | ||||
|  наи ьнейшей | ||||
|  наи ьнейшую | ||||
|  наи ьнейшей | ||||
|  наи ьнейшею | ||||
|  наи ьнейшей | ||||
|  наи ьнейшее | ||||
|  наи ьнейшего | ||||
|  наи ьнейшему | ||||
|  наи ьнейшее | ||||
|  наи ьнейшим | ||||
|  наи ьнейшем | ||||
|  наи ьнейшие | ||||
|  наи ьнейших | ||||
|  наи ьнейшим | ||||
|  наи ьнейших | ||||
|  наи ьнейшие | ||||
|  наи ьнейшими | ||||
|  наи ьнейших] | ||||
| @@ -15,12 +15,12 @@ | ||||
|  */ | ||||
| package org.apache.lucene.morphology.english; | ||||
|  | ||||
| import org.apache.lucene.morphology.Morphology; | ||||
| import org.apache.lucene.morphology.MorphologyImpl; | ||||
|  | ||||
| import java.io.IOException; | ||||
|  | ||||
|  | ||||
| public class EnglishMorphology extends Morphology { | ||||
| public class EnglishMorphology extends MorphologyImpl { | ||||
|  | ||||
|     public EnglishMorphology() throws IOException { | ||||
|         super(EnglishLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder()); | ||||
|   | ||||
| @@ -23,7 +23,7 @@ import java.util.ArrayList; | ||||
| import java.util.List; | ||||
|  | ||||
|  | ||||
| public class LuceneMorphology extends MorphologyWithPrefix { | ||||
| public class LuceneMorphology extends MorphologyImpl { | ||||
|  | ||||
|     public LuceneMorphology(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException { | ||||
|         super(fileName, decoderEncoder); | ||||
| @@ -33,13 +33,15 @@ public class LuceneMorphology extends MorphologyWithPrefix { | ||||
|         super(inputStream, decoderEncoder); | ||||
|     } | ||||
|  | ||||
|     public LuceneMorphology(InputStream morphFormInputStream, InputStream prefixesInputStream, LetterDecoderEncoder decoderEncoder) throws IOException { | ||||
|         super(morphFormInputStream, prefixesInputStream, decoderEncoder); | ||||
|     } | ||||
|  | ||||
|     @Override | ||||
|     protected String createForm(String form, String grammaInfo) { | ||||
|         return form; | ||||
|     public List<String> getMorhInfo(String s) { | ||||
|         ArrayList<String> result = new ArrayList<String>(); | ||||
|         int[] ints = decoderEncoder.encodeToArray(revertWord(s)); | ||||
|         int ruleId = findRuleId(ints); | ||||
|         for (Heuristic h : rules[rulesId[ruleId]]) { | ||||
|             result.add(h.transofrmWord(s)); | ||||
|         } | ||||
|         return result; | ||||
|     } | ||||
|  | ||||
|     protected void readRules(BufferedReader bufferedReader) throws IOException { | ||||
|   | ||||
| @@ -15,200 +15,11 @@ | ||||
|  */ | ||||
| package org.apache.lucene.morphology; | ||||
|  | ||||
|  | ||||
| import java.io.*; | ||||
| import java.util.ArrayList; | ||||
| import java.util.HashSet; | ||||
| import java.util.List; | ||||
|  | ||||
|  | ||||
| public class Morphology { | ||||
|     protected int[][] separators; | ||||
|     protected short[] rulesId; | ||||
|     protected Heuristic[][] rules; | ||||
|     protected String[] grammaInfo; | ||||
|     protected LetterDecoderEncoder decoderEncoder; | ||||
| public interface Morphology { | ||||
|  | ||||
|  | ||||
|     public Morphology(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException { | ||||
|         readFromFile(fileName); | ||||
|         this.decoderEncoder = decoderEncoder; | ||||
|     } | ||||
|  | ||||
|     public Morphology(InputStream inputStream, LetterDecoderEncoder decoderEncoder) throws IOException { | ||||
|         readFromInputStream(inputStream); | ||||
|         this.decoderEncoder = decoderEncoder; | ||||
|     } | ||||
|  | ||||
|     public Morphology(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) { | ||||
|         this.separators = separators; | ||||
|         this.rulesId = rulesId; | ||||
|         this.rules = rules; | ||||
|         this.grammaInfo = grammaInfo; | ||||
|     } | ||||
|  | ||||
|     public int[][] getSeparators() { | ||||
|         return separators; | ||||
|     } | ||||
|  | ||||
|     public short[] getRulesId() { | ||||
|         return rulesId; | ||||
|     } | ||||
|  | ||||
|     public Heuristic[][] getRules() { | ||||
|         return rules; | ||||
|     } | ||||
|  | ||||
|     public String[] getGrammaInfo() { | ||||
|         return grammaInfo; | ||||
|     } | ||||
|  | ||||
|     public List<String> getMorhInfo(String s) { | ||||
|         ArrayList<String> result = new ArrayList<String>(); | ||||
|         int[] ints = decoderEncoder.encodeToArray(revertWord(s)); | ||||
|         int ruleId = findRuleId(ints); | ||||
|         for (Heuristic h : rules[rulesId[ruleId]]) { | ||||
|             result.add(createForm(h.transofrmWord(s),grammaInfo[h.getFormMorphInfo()])); | ||||
|         } | ||||
|         return result; | ||||
|     } | ||||
|  | ||||
|     protected String createForm(String form,String grammaInfo){ | ||||
|         return form+"|"+grammaInfo; | ||||
|     } | ||||
|  | ||||
|     protected int findRuleId(int[] ints) { | ||||
|         int low = 0; | ||||
|         int high = separators.length - 1; | ||||
|         int mid = 0; | ||||
|         while (low <= high) { | ||||
|             mid = (low + high) >>> 1; | ||||
|             int[] midVal = separators[mid]; | ||||
|  | ||||
|             int comResult = compareToInts(ints, midVal); | ||||
|             if (comResult > 0) | ||||
|                 low = mid + 1; | ||||
|             else if (comResult < 0) | ||||
|                 high = mid - 1; | ||||
|             else | ||||
|                 break; | ||||
|         } | ||||
|         if (compareToInts(ints, separators[mid]) >= 0) { | ||||
|             return mid; | ||||
|         } else { | ||||
|             return mid - 1; | ||||
|         } | ||||
|     List<String> getMorhInfo(String s); | ||||
|      | ||||
| } | ||||
|  | ||||
|     private int compareToInts(int[] i1, int[] i2) { | ||||
|         int minLength = Math.min(i1.length, i2.length); | ||||
|         for (int i = 0; i < minLength; i++) { | ||||
|             int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1); | ||||
|             if (i3 != 0) return i3; | ||||
|         } | ||||
|         return i1.length - i2.length; | ||||
|     } | ||||
|  | ||||
|     public void writeToFile(String fileName) throws IOException { | ||||
|         OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8"); | ||||
|         writer.write(separators.length + "\n"); | ||||
|         for (int[] i : separators) { | ||||
|             writer.write(i.length + "\n"); | ||||
|             for (int j : i) { | ||||
|                 writer.write(j + "\n"); | ||||
|             } | ||||
|         } | ||||
|         for (short i : rulesId) { | ||||
|             writer.write(i + "\n"); | ||||
|         } | ||||
|         writer.write(rules.length + "\n"); | ||||
|         for (Heuristic[] heuristics : rules) { | ||||
|             writer.write(heuristics.length + "\n"); | ||||
|             for (Heuristic heuristic : heuristics) { | ||||
|                 writer.write(heuristic.toString() + "\n"); | ||||
|             } | ||||
|         } | ||||
|         writer.write(grammaInfo.length + "\n"); | ||||
|         for (String s : grammaInfo) { | ||||
|             writer.write(s + "\n"); | ||||
|         } | ||||
|         writer.close(); | ||||
|     } | ||||
|  | ||||
|     public void readFromFile(String fileName) throws IOException { | ||||
|         FileInputStream inputStream = new FileInputStream(fileName); | ||||
|         readFromInputStream(inputStream); | ||||
|     } | ||||
|  | ||||
|     private void readFromInputStream(InputStream inputStream) throws IOException { | ||||
|         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8")); | ||||
|         String s = bufferedReader.readLine(); | ||||
|         Integer amount = Integer.valueOf(s); | ||||
|  | ||||
|         readSeparators(bufferedReader, amount); | ||||
|  | ||||
|         readRulesId(bufferedReader, amount); | ||||
|  | ||||
|         readRules(bufferedReader); | ||||
|         readGrammaInfo(bufferedReader); | ||||
|         bufferedReader.close(); | ||||
|     } | ||||
|  | ||||
|     private void readGrammaInfo(BufferedReader bufferedReader) throws IOException { | ||||
|         String s; | ||||
|         Integer amount; | ||||
|         s = bufferedReader.readLine(); | ||||
|         amount = Integer.valueOf(s); | ||||
|         grammaInfo = new String[amount]; | ||||
|         for (int i = 0; i < amount; i++) { | ||||
|             grammaInfo[i] = bufferedReader.readLine(); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     protected void readRules(BufferedReader bufferedReader) throws IOException { | ||||
|         String s; | ||||
|         Integer amount; | ||||
|         s = bufferedReader.readLine(); | ||||
|         amount = Integer.valueOf(s); | ||||
|         rules = new Heuristic[amount][]; | ||||
|         for (int i = 0; i < amount; i++) { | ||||
|             String s1 = bufferedReader.readLine(); | ||||
|             Integer ruleLenght = Integer.valueOf(s1); | ||||
|             rules[i] = new Heuristic[ruleLenght]; | ||||
|             for (int j = 0; j < ruleLenght; j++) { | ||||
|                 rules[i][j] = new Heuristic(bufferedReader.readLine()); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     private void readRulesId(BufferedReader bufferedReader, Integer amount) throws IOException { | ||||
|         rulesId = new short[amount]; | ||||
|         for (int i = 0; i < amount; i++) { | ||||
|             String s1 = bufferedReader.readLine(); | ||||
|             rulesId[i] = Short.valueOf(s1); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     private void readSeparators(BufferedReader bufferedReader, Integer amount) throws IOException { | ||||
|         HashSet intetger = new HashSet<Integer>(); | ||||
|         separators = new int[amount][]; | ||||
|         for (int i = 0; i < amount; i++) { | ||||
|             String s1 = bufferedReader.readLine(); | ||||
|             Integer wordLenght = Integer.valueOf(s1); | ||||
|             separators[i] = new int[wordLenght]; | ||||
|             for (int j = 0; j < wordLenght; j++) { | ||||
|                 separators[i][j] = Integer.valueOf(bufferedReader.readLine()); | ||||
|             } | ||||
|             intetger.add(separators[i][0]); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     protected String revertWord(String s) { | ||||
|         String result = ""; | ||||
|         for (int i = 1; i <= s.length(); i++) { | ||||
|             result += s.charAt(s.length() - i); | ||||
|         } | ||||
|         return result; | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -0,0 +1,210 @@ | ||||
| /** | ||||
|  * Copyright 2009 Alexander Kuznetsov | ||||
|  * | ||||
|  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
|  * you may not use this file except in compliance with the License. | ||||
|  * You may obtain a copy of the License at | ||||
|  * | ||||
|  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an "AS IS" BASIS, | ||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|  * See the License for the specific language governing permissions and | ||||
|  * limitations under the License. | ||||
|  */ | ||||
| package org.apache.lucene.morphology; | ||||
|  | ||||
|  | ||||
| import java.io.*; | ||||
| import java.util.ArrayList; | ||||
| import java.util.HashSet; | ||||
| import java.util.List; | ||||
|  | ||||
|  | ||||
| /** | ||||
|  * {@link Morphology} implementation backed by in-memory lookup tables. | ||||
|  * <p> | ||||
|  * The tables can be supplied directly, or loaded from (and written back to) a line-oriented | ||||
|  * UTF-8 text format holding, in order: separators, rule ids, rules and grammar info. | ||||
|  */ | ||||
| public class MorphologyImpl implements Morphology { | ||||
|     /** Encoded word boundaries searched by {@link #findRuleId(int[])}; the binary search expects them sorted ascending. */ | ||||
|     protected int[][] separators; | ||||
|     /** For each separator index, the row of {@link #rules} to apply. */ | ||||
|     protected short[] rulesId; | ||||
|     /** Groups of heuristics; one group is applied per looked-up word. */ | ||||
|     protected Heuristic[][] rules; | ||||
|     /** Grammar descriptions, indexed by {@code Heuristic.getFormMorphInfo()}. */ | ||||
|     protected String[] grammaInfo; | ||||
|     /** Converts a word into the int array that is compared against {@link #separators}. */ | ||||
|     protected LetterDecoderEncoder decoderEncoder; | ||||
|  | ||||
|  | ||||
|     /** | ||||
|      * Loads the tables from the named file. | ||||
|      * | ||||
|      * @throws IOException if the file cannot be read | ||||
|      */ | ||||
|     public MorphologyImpl(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException { | ||||
|         readFromFile(fileName); | ||||
|         this.decoderEncoder = decoderEncoder; | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Loads the tables from the given stream; the stream is closed afterwards. | ||||
|      * | ||||
|      * @throws IOException if the stream cannot be read | ||||
|      */ | ||||
|     public MorphologyImpl(InputStream inputStream, LetterDecoderEncoder decoderEncoder) throws IOException { | ||||
|         readFromInputStream(inputStream); | ||||
|         this.decoderEncoder = decoderEncoder; | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Wraps already built tables. | ||||
|      * NOTE: {@code decoderEncoder} is left unset here, so {@link #getMorhInfo(String)} | ||||
|      * cannot be used on such an instance unless a subclass assigns the field. | ||||
|      */ | ||||
|     public MorphologyImpl(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) { | ||||
|         this.separators = separators; | ||||
|         this.rulesId = rulesId; | ||||
|         this.rules = rules; | ||||
|         this.grammaInfo = grammaInfo; | ||||
|     } | ||||
|  | ||||
|     public int[][] getSeparators() { | ||||
|         return separators; | ||||
|     } | ||||
|  | ||||
|     public short[] getRulesId() { | ||||
|         return rulesId; | ||||
|     } | ||||
|  | ||||
|     public Heuristic[][] getRules() { | ||||
|         return rules; | ||||
|     } | ||||
|  | ||||
|     public String[] getGrammaInfo() { | ||||
|         return grammaInfo; | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Looks up the rule group for {@code s} and returns one entry per heuristic in it, | ||||
|      * each formatted as {@code transformedWord|grammarInfo}. | ||||
|      * | ||||
|      * @param s word to analyse | ||||
|      * @return the formatted results, in heuristic order | ||||
|      */ | ||||
|     public List<String> getMorhInfo(String s) { | ||||
|         List<String> result = new ArrayList<String>(); | ||||
|         // The word is reversed before encoding; separators are compared against that form. | ||||
|         int[] ints = decoderEncoder.encodeToArray(revertWord(s)); | ||||
|         int ruleId = findRuleId(ints); | ||||
|         for (Heuristic h : rules[rulesId[ruleId]]) { | ||||
|             result.add(h.transofrmWord(s) + "|" + grammaInfo[h.getFormMorphInfo()]); | ||||
|         } | ||||
|         return result; | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Binary-searches {@link #separators} for the last entry that is not greater than {@code ints}. | ||||
|      * | ||||
|      * @param ints encoded word | ||||
|      * @return index of that separator, or {@code -1} when {@code ints} sorts before every separator | ||||
|      */ | ||||
|     protected int findRuleId(int[] ints) { | ||||
|         int low = 0; | ||||
|         int high = separators.length - 1; | ||||
|         int mid = 0; | ||||
|         while (low <= high) { | ||||
|             mid = (low + high) >>> 1; | ||||
|             int comResult = compareToInts(ints, separators[mid]); | ||||
|             if (comResult > 0) { | ||||
|                 low = mid + 1; | ||||
|             } else if (comResult < 0) { | ||||
|                 high = mid - 1; | ||||
|             } else { | ||||
|                 break; | ||||
|             } | ||||
|         } | ||||
|         // mid is the last probed position: it is the answer unless the probe was too large. | ||||
|         if (compareToInts(ints, separators[mid]) >= 0) { | ||||
|             return mid; | ||||
|         } else { | ||||
|             return mid - 1; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Compares two int arrays element by element; when one is a prefix of the other, | ||||
|      * the shorter array sorts first. | ||||
|      */ | ||||
|     private int compareToInts(int[] i1, int[] i2) { | ||||
|         int minLength = Math.min(i1.length, i2.length); | ||||
|         for (int i = 0; i < minLength; i++) { | ||||
|             if (i1[i] != i2[i]) { | ||||
|                 return i1[i] < i2[i] ? -1 : 1; | ||||
|             } | ||||
|         } | ||||
|         return i1.length - i2.length; | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Writes all tables to {@code fileName} as UTF-8 text, one value per line, | ||||
|      * in the order expected by the stream-reading constructor. | ||||
|      * | ||||
|      * @throws IOException if the file cannot be written | ||||
|      */ | ||||
|     public void writeToFile(String fileName) throws IOException { | ||||
|         OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8"); | ||||
|         try { | ||||
|             writer.write(separators.length + "\n"); | ||||
|             for (int[] i : separators) { | ||||
|                 writer.write(i.length + "\n"); | ||||
|                 for (int j : i) { | ||||
|                     writer.write(j + "\n"); | ||||
|                 } | ||||
|             } | ||||
|             // No count line here: the reader reuses the separator count for the rule ids. | ||||
|             for (short i : rulesId) { | ||||
|                 writer.write(i + "\n"); | ||||
|             } | ||||
|             writer.write(rules.length + "\n"); | ||||
|             for (Heuristic[] heuristics : rules) { | ||||
|                 writer.write(heuristics.length + "\n"); | ||||
|                 for (Heuristic heuristic : heuristics) { | ||||
|                     writer.write(heuristic.toString() + "\n"); | ||||
|                 } | ||||
|             } | ||||
|             writer.write(grammaInfo.length + "\n"); | ||||
|             for (String s : grammaInfo) { | ||||
|                 writer.write(s + "\n"); | ||||
|             } | ||||
|         } finally { | ||||
|             // Close even when a write fails so the file handle is not leaked. | ||||
|             writer.close(); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Replaces the tables of this instance with the content of {@code fileName}. | ||||
|      * | ||||
|      * @throws IOException if the file cannot be opened or read | ||||
|      */ | ||||
|     public void readFromFile(String fileName) throws IOException { | ||||
|         FileInputStream inputStream = new FileInputStream(fileName); | ||||
|         try { | ||||
|             readFromInputStream(inputStream); | ||||
|         } finally { | ||||
|             inputStream.close(); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /** Reads all four tables from a UTF-8 stream and closes the stream, also on failure. */ | ||||
|     private void readFromInputStream(InputStream inputStream) throws IOException { | ||||
|         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8")); | ||||
|         try { | ||||
|             // The first line is the number of separators, which is also the number of rule ids. | ||||
|             Integer amount = Integer.valueOf(bufferedReader.readLine()); | ||||
|  | ||||
|             readSeparators(bufferedReader, amount); | ||||
|             readRulesId(bufferedReader, amount); | ||||
|             readRules(bufferedReader); | ||||
|             readGrammaInfo(bufferedReader); | ||||
|         } finally { | ||||
|             bufferedReader.close(); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /** Reads a count line followed by that many grammar info lines. */ | ||||
|     private void readGrammaInfo(BufferedReader bufferedReader) throws IOException { | ||||
|         int amount = Integer.parseInt(bufferedReader.readLine()); | ||||
|         grammaInfo = new String[amount]; | ||||
|         for (int i = 0; i < amount; i++) { | ||||
|             grammaInfo[i] = bufferedReader.readLine(); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Reads a count line followed by that many rule groups; each group is a length line | ||||
|      * followed by one serialized {@code Heuristic} per line. | ||||
|      */ | ||||
|     protected void readRules(BufferedReader bufferedReader) throws IOException { | ||||
|         int amount = Integer.parseInt(bufferedReader.readLine()); | ||||
|         rules = new Heuristic[amount][]; | ||||
|         for (int i = 0; i < amount; i++) { | ||||
|             int ruleLength = Integer.parseInt(bufferedReader.readLine()); | ||||
|             rules[i] = new Heuristic[ruleLength]; | ||||
|             for (int j = 0; j < ruleLength; j++) { | ||||
|                 rules[i][j] = new Heuristic(bufferedReader.readLine()); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /** Reads {@code amount} rule ids, one per line. */ | ||||
|     private void readRulesId(BufferedReader bufferedReader, Integer amount) throws IOException { | ||||
|         rulesId = new short[amount]; | ||||
|         for (int i = 0; i < amount; i++) { | ||||
|             rulesId[i] = Short.parseShort(bufferedReader.readLine()); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /** Reads {@code amount} separator arrays, each stored as a length line followed by its values. */ | ||||
|     private void readSeparators(BufferedReader bufferedReader, Integer amount) throws IOException { | ||||
|         separators = new int[amount][]; | ||||
|         for (int i = 0; i < amount; i++) { | ||||
|             int wordLength = Integer.parseInt(bufferedReader.readLine()); | ||||
|             separators[i] = new int[wordLength]; | ||||
|             for (int j = 0; j < wordLength; j++) { | ||||
|                 separators[i][j] = Integer.parseInt(bufferedReader.readLine()); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Returns the characters of {@code s} in reverse order (reversed per {@code char}). | ||||
|      */ | ||||
|     protected String revertWord(String s) { | ||||
|         // A pre-sized builder avoids the quadratic cost of repeated String concatenation. | ||||
|         StringBuilder result = new StringBuilder(s.length()); | ||||
|         for (int i = s.length() - 1; i >= 0; i--) { | ||||
|             result.append(s.charAt(i)); | ||||
|         } | ||||
|         return result.toString(); | ||||
|     } | ||||
| } | ||||
| @@ -1,96 +0,0 @@ | ||||
| /** | ||||
|  * Copyright 2009 Alexander Kuznetsov | ||||
|  * | ||||
|  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
|  * you may not use this file except in compliance with the License. | ||||
|  * You may obtain a copy of the License at | ||||
|  * | ||||
|  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an "AS IS" BASIS, | ||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|  * See the License for the specific language governing permissions and | ||||
|  * limitations under the License. | ||||
|  */ | ||||
| package org.apache.lucene.morphology; | ||||
|  | ||||
| import java.io.IOException; | ||||
| import java.io.InputStream; | ||||
| import java.io.BufferedReader; | ||||
| import java.io.InputStreamReader; | ||||
| import java.util.*; | ||||
|  | ||||
|  | ||||
| /** | ||||
|  * Morphology that can additionally analyse a word by stripping a known prefix first. | ||||
|  * Prefix rules are keyed by the first letter of the prefix plus the last letter of the word. | ||||
|  */ | ||||
| public class MorphologyWithPrefix extends Morphology { | ||||
|     private final Map<String, PrefixRule> prefixRuleMap = new HashMap<String, PrefixRule>(); | ||||
|  | ||||
|     public MorphologyWithPrefix(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException { | ||||
|         super(fileName, decoderEncoder); | ||||
|     } | ||||
|  | ||||
|     public MorphologyWithPrefix(InputStream morphFormInputStream, LetterDecoderEncoder decoderEncoder) throws IOException { | ||||
|         super(morphFormInputStream, decoderEncoder); | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Loads the morphology tables and, from a second stream, the prefix rules. | ||||
|      * | ||||
|      * @throws IOException if either stream cannot be read | ||||
|      */ | ||||
|     public MorphologyWithPrefix(InputStream morphFormInputStream, InputStream prefixesInputStream, LetterDecoderEncoder decoderEncoder) throws IOException { | ||||
|         super(morphFormInputStream, decoderEncoder); | ||||
|         readPrefixes(prefixesInputStream); | ||||
|     } | ||||
|  | ||||
|     public MorphologyWithPrefix(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) { | ||||
|         super(separators, rulesId, rules, grammaInfo); | ||||
|     } | ||||
|  | ||||
|     /** Reads a count line followed by that many prefix rules; the stream is closed afterwards, also on failure. */ | ||||
|     private void readPrefixes(InputStream inputStream) throws IOException { | ||||
|         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8")); | ||||
|         try { | ||||
|             int prefixAmount = Integer.parseInt(bufferedReader.readLine()); | ||||
|             for (int i = 0; i < prefixAmount; i++) { | ||||
|                 PrefixRule prefixRule = readPrefix(bufferedReader); | ||||
|                 prefixRuleMap.put(prefixRule.getHashString(), prefixRule); | ||||
|             } | ||||
|         } finally { | ||||
|             bufferedReader.close(); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Reads one rule: a prefix line, a last-letter line, a form count line, | ||||
|      * then one form id per line. | ||||
|      */ | ||||
|     private PrefixRule readPrefix(BufferedReader bufferedReader) throws IOException { | ||||
|         PrefixRule prefixRule = new PrefixRule(); | ||||
|         prefixRule.setPrefix(bufferedReader.readLine()); | ||||
|         prefixRule.setLastLetter(bufferedReader.readLine().charAt(0)); | ||||
|         HashSet<Short> morph = new HashSet<Short>(); | ||||
|         int formAmount = Integer.parseInt(bufferedReader.readLine()); | ||||
|         for (int i = 0; i < formAmount; i++) { | ||||
|             morph.add(Short.valueOf(bufferedReader.readLine())); | ||||
|         } | ||||
|         prefixRule.setForms(morph); | ||||
|         return prefixRule; | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Analyses {@code s} with its prefix removed when a matching prefix rule exists and at | ||||
|      * least one heuristic form is allowed by that rule; otherwise defers to the superclass. | ||||
|      */ | ||||
|     @Override | ||||
|     public List<String> getMorhInfo(String s) { | ||||
|         if (prefixRuleMap.isEmpty() || s.length() < 4) { | ||||
|             return super.getMorhInfo(s); | ||||
|         } | ||||
|         // Same key shape as PrefixRule.getHashString(): first letter + last letter. | ||||
|         String ruleIndex = "" + s.charAt(0) + s.charAt(s.length() - 1); | ||||
|         PrefixRule prefixRule = prefixRuleMap.get(ruleIndex); | ||||
|         if (prefixRule == null || !s.startsWith(prefixRule.getPrefix())) { | ||||
|             return super.getMorhInfo(s); | ||||
|         } | ||||
|         String sWithoutPrefix = s.substring(prefixRule.getPrefix().length()); | ||||
|  | ||||
|         int[] ints = decoderEncoder.encodeToArray(revertWord(sWithoutPrefix)); | ||||
|         int ruleId = findRuleId(ints); | ||||
|         List<String> result = new ArrayList<String>(); | ||||
|         for (Heuristic h : rules[rulesId[ruleId]]) { | ||||
|             if (prefixRule.getForms().contains(h.getFormMorphInfo())) { | ||||
|                 result.add(createForm(h.transofrmWord(sWithoutPrefix), "pr")); | ||||
|             } | ||||
|         } | ||||
|         return result.size() > 0 ? result : super.getMorhInfo(s); | ||||
|     } | ||||
| } | ||||
| @@ -1,76 +0,0 @@ | ||||
| /** | ||||
|  * Copyright 2009 Alexander Kuznetsov | ||||
|  * | ||||
|  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
|  * you may not use this file except in compliance with the License. | ||||
|  * You may obtain a copy of the License at | ||||
|  * | ||||
|  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an "AS IS" BASIS, | ||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|  * See the License for the specific language governing permissions and | ||||
|  * limitations under the License. | ||||
|  */ | ||||
| package org.apache.lucene.morphology; | ||||
|  | ||||
| import java.io.Serializable; | ||||
| import java.util.HashSet; | ||||
|  | ||||
|  | ||||
| /** | ||||
|  * Serializable value holder for one prefix rule: the prefix text, a last letter | ||||
|  * and a set of form ids. Equality and hash code cover all three fields. | ||||
|  */ | ||||
| public class PrefixRule implements Serializable { | ||||
|     private Character lastLetter; | ||||
|     private String prefix; | ||||
|     private HashSet<Short> forms; | ||||
|  | ||||
|     public Character getLastLetter() { | ||||
|         return this.lastLetter; | ||||
|     } | ||||
|  | ||||
|     public void setLastLetter(Character lastLetter) { | ||||
|         this.lastLetter = lastLetter; | ||||
|     } | ||||
|  | ||||
|     public String getPrefix() { | ||||
|         return this.prefix; | ||||
|     } | ||||
|  | ||||
|     public void setPrefix(String prefix) { | ||||
|         this.prefix = prefix; | ||||
|     } | ||||
|  | ||||
|     public HashSet<Short> getForms() { | ||||
|         return this.forms; | ||||
|     } | ||||
|  | ||||
|     public void setForms(HashSet<Short> forms) { | ||||
|         this.forms = forms; | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Builds the lookup key: the first character of the prefix followed by the last letter. | ||||
|      */ | ||||
|     public String getHashString() { | ||||
|         return String.valueOf(prefix.charAt(0)) + lastLetter; | ||||
|     } | ||||
|  | ||||
|     @Override | ||||
|     public boolean equals(Object o) { | ||||
|         if (o == this) { | ||||
|             return true; | ||||
|         } | ||||
|         if (o == null || o.getClass() != getClass()) { | ||||
|             return false; | ||||
|         } | ||||
|         PrefixRule other = (PrefixRule) o; | ||||
|         return sameValue(forms, other.forms) | ||||
|                 && sameValue(lastLetter, other.lastLetter) | ||||
|                 && sameValue(prefix, other.prefix); | ||||
|     } | ||||
|  | ||||
|     @Override | ||||
|     public int hashCode() { | ||||
|         int hash = hashOrZero(lastLetter); | ||||
|         hash = 31 * hash + hashOrZero(prefix); | ||||
|         hash = 31 * hash + hashOrZero(forms); | ||||
|         return hash; | ||||
|     } | ||||
|  | ||||
|     /** Null-safe equality: two nulls are equal, a null never equals a non-null. */ | ||||
|     private static boolean sameValue(Object left, Object right) { | ||||
|         return left == null ? right == null : left.equals(right); | ||||
|     } | ||||
|  | ||||
|     /** Null-safe hash code: {@code 0} for null. */ | ||||
|     private static int hashOrZero(Object value) { | ||||
|         return value == null ? 0 : value.hashCode(); | ||||
|     } | ||||
| } | ||||
| @@ -22,6 +22,6 @@ import java.io.IOException; | ||||
| public class RussianLuceneMorphology extends LuceneMorphology { | ||||
|  | ||||
|     public RussianLuceneMorphology() throws IOException { | ||||
|         super(RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"),RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/prefixes.info"), new RussianLetterDecoderEncoder()); | ||||
|         super(RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder()); | ||||
|     } | ||||
| } | ||||
| @@ -15,11 +15,11 @@ | ||||
|  */ | ||||
| package org.apache.lucene.morphology.russian; | ||||
|  | ||||
| import org.apache.lucene.morphology.Morphology; | ||||
| import org.apache.lucene.morphology.MorphologyImpl; | ||||
|  | ||||
| import java.io.IOException; | ||||
|  | ||||
| public class RussianMorphology extends Morphology { | ||||
| public class RussianMorphology extends MorphologyImpl { | ||||
|  | ||||
|     public RussianMorphology() throws IOException { | ||||
|         super(RussianMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder()); | ||||
|   | ||||
| @@ -1,60 +0,0 @@ | ||||
| /** | ||||
|  * Copyright 2009 Alexander Kuznetsov | ||||
|  * | ||||
|  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
|  * you may not use this file except in compliance with the License. | ||||
|  * You may obtain a copy of the License at | ||||
|  * | ||||
|  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an "AS IS" BASIS, | ||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|  * See the License for the specific language governing permissions and | ||||
|  * limitations under the License. | ||||
|  */ | ||||
| package org.apache.lucene.morphology.russian; | ||||
|  | ||||
| import org.apache.lucene.analysis.TokenStream; | ||||
| import org.apache.lucene.analysis.Token; | ||||
|  | ||||
| import java.io.IOException; | ||||
| import java.io.FileInputStream; | ||||
| import java.io.InputStreamReader; | ||||
| import java.util.HashSet; | ||||
|  | ||||
| /** | ||||
|  * Created by IntelliJ IDEA. | ||||
|  * User: akuznetsov | ||||
|  * Date: 31.10.2009 | ||||
|  * Time: 14:01:11 | ||||
|  * To change this template use File | Settings | File Templates. | ||||
|  */ | ||||
| public class TestSpeed { | ||||
|  | ||||
|     public static void main(String[] args) throws IOException { | ||||
|         RussianAnalayzer russianAnalayzer = new RussianAnalayzer(); | ||||
|         bookProccess(russianAnalayzer, "C:/tmp/_Aleksandr_Suhov_Tanets_na_raskalennyih_uglyah1.fb2"); | ||||
|         Long stat = System.currentTimeMillis(); | ||||
|         bookProccess(russianAnalayzer, "C:/tmp/_Aleksandr_Suhov_Tanets_na_raskalennyih_uglyah1.fb2"); | ||||
|         System.out.println("Done in " + (System.currentTimeMillis() - stat)); | ||||
|     } | ||||
|  | ||||
|     private static void bookProccess(RussianAnalayzer russianAnalayzer, String bookName) throws IOException { | ||||
|         FileInputStream inputStream = new FileInputStream(bookName); | ||||
|         TokenStream tokenStream = russianAnalayzer.tokenStream(null,new InputStreamReader(inputStream,"UTF-8")); | ||||
|         final Token reusableToken = new Token(); | ||||
|         long count = 0; | ||||
|         Token nextToken; | ||||
|         for (; ;) { | ||||
|             nextToken = tokenStream.next(reusableToken); | ||||
|            // System.out.println(" " + nextToken.term()); | ||||
|             count++; | ||||
|             if (nextToken == null) { | ||||
|                 break; | ||||
|             } | ||||
|  | ||||
|         } | ||||
|         //System.out.println("Words " + count); | ||||
|     } | ||||
| } | ||||
| @@ -1,96 +0,0 @@ | ||||
| 11 | ||||
| наи | ||||
| е | ||||
| 8 | ||||
| 258 | ||||
| 255 | ||||
| 289 | ||||
| 252 | ||||
| 292 | ||||
| 262 | ||||
| 296 | ||||
| 286 | ||||
| наи | ||||
| и | ||||
| 2 | ||||
| 263 | ||||
| 297 | ||||
| наи | ||||
| ю | ||||
| 4 | ||||
| 250 | ||||
| 249 | ||||
| 283 | ||||
| 284 | ||||
| по | ||||
| й | ||||
| 5 | ||||
| 250 | ||||
| 251 | ||||
| 248 | ||||
| 247 | ||||
| 269 | ||||
| по | ||||
| е | ||||
| 3 | ||||
| 255 | ||||
| 252 | ||||
| 269 | ||||
| наи | ||||
| й | ||||
| 12 | ||||
| 239 | ||||
| 273 | ||||
| 250 | ||||
| 251 | ||||
| 248 | ||||
| 277 | ||||
| 247 | ||||
| 282 | ||||
| 281 | ||||
| 243 | ||||
| 285 | ||||
| 284 | ||||
| наи | ||||
| о | ||||
| 6 | ||||
| 274 | ||||
| 253 | ||||
| 276 | ||||
| 287 | ||||
| 242 | ||||
| 240 | ||||
| наи | ||||
| м | ||||
| 10 | ||||
| 256 | ||||
| 290 | ||||
| 257 | ||||
| 291 | ||||
| 279 | ||||
| 278 | ||||
| 294 | ||||
| 260 | ||||
| 244 | ||||
| 245 | ||||
| наи | ||||
| х | ||||
| 6 | ||||
| 259 | ||||
| 293 | ||||
| 261 | ||||
| 295 | ||||
| 264 | ||||
| 298 | ||||
| наи | ||||
| я | ||||
| 2 | ||||
| 246 | ||||
| 280 | ||||
| наи | ||||
| у | ||||
| 4 | ||||
| 275 | ||||
| 254 | ||||
| 288 | ||||
| 241 | ||||
| @@ -33,7 +33,7 @@ public class RussianLuceneMorphTest { | ||||
|  | ||||
|     @Before | ||||
|     public void setUp() throws IOException { | ||||
|         luceneMorph = new RussianLuceneMorphology(); | ||||
|         luceneMorph = new LuceneMorphology(this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder()); | ||||
|     } | ||||
|  | ||||
|     @Test | ||||
|   | ||||
| @@ -1,4 +1,3 @@ | ||||
| наилучший хороший | ||||
| еду еда ехать | ||||
| тестов тест | ||||
| вина вино вина | ||||
| @@ -18,7 +17,3 @@ | ||||
| лучший хороший | ||||
| на на | ||||
| тест тест тесто | ||||
| спам спам | ||||
| спама спам | ||||
| наигранный наигранный | ||||
| наивный наивный | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 alexander.a.kuznetsov
					alexander.a.kuznetsov