Rollback of the wrong version of morphology; adding an interface for morphology
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@88 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
parent
16613c543b
commit
1273cf96ed
@ -31,9 +31,9 @@ import java.util.*;
|
|||||||
public class DictonaryReader {
|
public class DictonaryReader {
|
||||||
private String fileName;
|
private String fileName;
|
||||||
private String fileEncoding = "windows-1251";
|
private String fileEncoding = "windows-1251";
|
||||||
protected List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
|
private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
|
||||||
protected List<List<String>> wordPrefixes = new ArrayList<List<String>>();
|
private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
|
||||||
protected Set<String> ingnoredForm = new HashSet<String>();
|
private Set<String> ingnoredForm = new HashSet<String>();
|
||||||
|
|
||||||
public DictonaryReader(String fileName, Set<String> ingnoredForm) {
|
public DictonaryReader(String fileName, Set<String> ingnoredForm) {
|
||||||
this.fileName = fileName;
|
this.fileName = fileName;
|
||||||
@ -57,7 +57,7 @@ public class DictonaryReader {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
protected void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
|
private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
|
||||||
String s = reader.readLine();
|
String s = reader.readLine();
|
||||||
int count = Integer.valueOf(s);
|
int count = Integer.valueOf(s);
|
||||||
for (int i = 0; i < count; i++) {
|
for (int i = 0; i < count; i++) {
|
||||||
@ -81,7 +81,7 @@ public class DictonaryReader {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
protected void sckipBlock(BufferedReader reader) throws IOException {
|
private void sckipBlock(BufferedReader reader) throws IOException {
|
||||||
String s = reader.readLine();
|
String s = reader.readLine();
|
||||||
int count = Integer.valueOf(s);
|
int count = Integer.valueOf(s);
|
||||||
for (int i = 0; i < count; i++) {
|
for (int i = 0; i < count; i++) {
|
||||||
@ -90,7 +90,7 @@ public class DictonaryReader {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
protected void readPrefix(BufferedReader reader) throws IOException {
|
private void readPrefix(BufferedReader reader) throws IOException {
|
||||||
String s = reader.readLine();
|
String s = reader.readLine();
|
||||||
int count = Integer.valueOf(s);
|
int count = Integer.valueOf(s);
|
||||||
for (int i = 0; i < count; i++) {
|
for (int i = 0; i < count; i++) {
|
||||||
@ -99,7 +99,7 @@ public class DictonaryReader {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void readFlexias(BufferedReader reader) throws IOException {
|
private void readFlexias(BufferedReader reader) throws IOException {
|
||||||
String s = reader.readLine();
|
String s = reader.readLine();
|
||||||
int count = Integer.valueOf(s);
|
int count = Integer.valueOf(s);
|
||||||
for (int i = 0; i < count; i++) {
|
for (int i = 0; i < count; i++) {
|
||||||
@ -112,7 +112,7 @@ public class DictonaryReader {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
|
private void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
|
||||||
String[] fl = line.split("\\*");
|
String[] fl = line.split("\\*");
|
||||||
// we inored all forms thats
|
// we inored all forms thats
|
||||||
if (fl.length == 3) {
|
if (fl.length == 3) {
|
||||||
|
@ -60,28 +60,6 @@ public class FlexiaModel {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return prefix + " " + suffix + " " + code;
|
return prefix + " " + suffix;
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean equals(Object o) {
|
|
||||||
if (this == o) return true;
|
|
||||||
if (o == null || getClass() != o.getClass()) return false;
|
|
||||||
|
|
||||||
FlexiaModel that = (FlexiaModel) o;
|
|
||||||
|
|
||||||
if (code != null ? !code.equals(that.code) : that.code != null) return false;
|
|
||||||
if (prefix != null ? !prefix.equals(that.prefix) : that.prefix != null) return false;
|
|
||||||
if (suffix != null ? !suffix.equals(that.suffix) : that.suffix != null) return false;
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int hashCode() {
|
|
||||||
int result = code != null ? code.hashCode() : 0;
|
|
||||||
result = 31 * result + (suffix != null ? suffix.hashCode() : 0);
|
|
||||||
result = 31 * result + (prefix != null ? prefix.hashCode() : 0);
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,139 +0,0 @@
|
|||||||
/**
|
|
||||||
* Copyright 2009 Alexander Kuznetsov
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.morphology.dictionary;
|
|
||||||
|
|
||||||
import org.apache.lucene.morphology.PrefixRule;
|
|
||||||
|
|
||||||
import java.util.*;
|
|
||||||
import java.io.*;
|
|
||||||
|
|
||||||
|
|
||||||
public class PrefixesRulesBuilder extends DictonaryReader {
|
|
||||||
private GrammaReader grammaInfo;
|
|
||||||
|
|
||||||
private Map<FlexiaModel,Set<FlexiaModel>> rules = new HashMap<FlexiaModel,Set<FlexiaModel>>();
|
|
||||||
|
|
||||||
public PrefixesRulesBuilder(String fileName, String fileEncoding, Set<String> ingnoredForm) throws IOException {
|
|
||||||
super(fileName, fileEncoding, ingnoredForm);
|
|
||||||
grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void proccess(WordProccessor wordProccessor) throws IOException {
|
|
||||||
super.proccess(wordProccessor);
|
|
||||||
System.out.println(rules.size());
|
|
||||||
System.out.println(rules);
|
|
||||||
}
|
|
||||||
|
|
||||||
public List<PrefixRule> getPrefixRules(){
|
|
||||||
List<PrefixRule> prefixRules = new ArrayList<PrefixRule>();
|
|
||||||
for(FlexiaModel key:rules.keySet()){
|
|
||||||
PrefixRule prefixRule = new PrefixRule();
|
|
||||||
prefixRule.setPrefix(key.getPrefix());
|
|
||||||
prefixRule.setLastLetter(key.getSuffix().charAt(0));
|
|
||||||
HashSet<Short> map = new HashSet<Short>();
|
|
||||||
for(FlexiaModel fm:rules.get(key)){
|
|
||||||
int gi = grammaInfo.getGrammInversIndex().get(fm.getCode());
|
|
||||||
map.add((short) gi);
|
|
||||||
}
|
|
||||||
prefixRule.setForms(map);
|
|
||||||
prefixRules.add(prefixRule);
|
|
||||||
}
|
|
||||||
return prefixRules;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
|
|
||||||
sckipBlock(reader);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected void readPrefix(BufferedReader reader) throws IOException {
|
|
||||||
sckipBlock(reader);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected void readFlexias(BufferedReader reader) throws IOException {
|
|
||||||
super.readFlexias(reader);
|
|
||||||
//todo research flesias
|
|
||||||
for(List<FlexiaModel> fmList:wordsFlexias){
|
|
||||||
research(fmList);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void research(List<FlexiaModel> models) {
|
|
||||||
for(FlexiaModel fm:models){
|
|
||||||
if(fm.getPrefix().length() > 0){
|
|
||||||
testFlexia(models, fm);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void testFlexia(List<FlexiaModel> models, FlexiaModel fm) {
|
|
||||||
for(FlexiaModel com:models){
|
|
||||||
if(com.getSuffix().equals(fm.getSuffix()) && com.getPrefix().length() == 0){
|
|
||||||
Set<FlexiaModel> models1 = rules.get(convertForKey(fm));
|
|
||||||
if(models1 == null){
|
|
||||||
models1 = new HashSet<FlexiaModel>();
|
|
||||||
rules.put(convertForKey(fm),models1);
|
|
||||||
}
|
|
||||||
models1.add(convert(com));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private FlexiaModel convert(FlexiaModel fm){
|
|
||||||
String suf = fm.getSuffix();
|
|
||||||
//if(suf.length() == 1) System.out.println(fm);
|
|
||||||
return new FlexiaModel(fm.getCode(),""+ suf.charAt(suf.length()-1),fm.getPrefix());
|
|
||||||
}
|
|
||||||
|
|
||||||
private FlexiaModel convertForKey(FlexiaModel fm){
|
|
||||||
String suf = fm.getSuffix();
|
|
||||||
//if(suf.length() == 1) System.out.println(fm);
|
|
||||||
return new FlexiaModel("pr",""+ suf.charAt(suf.length()-1),fm.getPrefix());
|
|
||||||
}
|
|
||||||
|
|
||||||
protected void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
|
|
||||||
String[] fl = line.split("\\*");
|
|
||||||
if (fl.length == 3) {
|
|
||||||
flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
|
|
||||||
}
|
|
||||||
if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
|
|
||||||
}
|
|
||||||
|
|
||||||
public void savePrefixes(String fileName) throws IOException {
|
|
||||||
OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8");
|
|
||||||
List<PrefixRule> prefixRuleList = getPrefixRules();
|
|
||||||
writer.write(prefixRuleList.size()+"\n");
|
|
||||||
for(PrefixRule pr: prefixRuleList){
|
|
||||||
writePrefixRule(writer, pr);
|
|
||||||
}
|
|
||||||
writer.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
private void writePrefixRule(OutputStreamWriter writer, PrefixRule pr) throws IOException {
|
|
||||||
writer.write(pr.getPrefix()+"\n");
|
|
||||||
writer.write(pr.getLastLetter()+"\n");
|
|
||||||
HashSet<Short> formInfo = pr.getForms();
|
|
||||||
writer.write(formInfo.size()+"\n");
|
|
||||||
for(Short s:formInfo){
|
|
||||||
writer.write(s+"\n");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
@ -19,7 +19,7 @@ package org.apache.lucene.morphology.dictionary;
|
|||||||
|
|
||||||
import org.apache.lucene.morphology.Heuristic;
|
import org.apache.lucene.morphology.Heuristic;
|
||||||
import org.apache.lucene.morphology.LetterDecoderEncoder;
|
import org.apache.lucene.morphology.LetterDecoderEncoder;
|
||||||
import org.apache.lucene.morphology.Morphology;
|
import org.apache.lucene.morphology.MorphologyImpl;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
@ -119,7 +119,7 @@ public class StatiticsCollector implements WordProccessor {
|
|||||||
prevSet = currentSet;
|
prevSet = currentSet;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Morphology morphology = new Morphology(ints, rulesId, heuristics, grammaReader.getGrammaInfoAsArray());
|
MorphologyImpl morphology = new MorphologyImpl(ints, rulesId, heuristics, grammaReader.getGrammaInfoAsArray());
|
||||||
morphology.writeToFile(fileName);
|
morphology.writeToFile(fileName);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,39 +0,0 @@
|
|||||||
/**
|
|
||||||
* Copyright 2009 Alexander Kuznetsov
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.apache.lucene.morphology.generator;
|
|
||||||
|
|
||||||
import org.apache.lucene.morphology.dictionary.*;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.HashSet;
|
|
||||||
|
|
||||||
|
|
||||||
public class RussianPrefixesBuilder {
|
|
||||||
public static void main(String[] args) throws IOException {
|
|
||||||
|
|
||||||
PrefixesRulesBuilder dictonaryReader = new PrefixesRulesBuilder("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", "windows-1251",new HashSet<String>());
|
|
||||||
|
|
||||||
|
|
||||||
dictonaryReader.proccess(new WordProccessor() {
|
|
||||||
public void proccess(WordCard wordCard) throws IOException {
|
|
||||||
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
dictonaryReader.savePrefixes("russian/src/main/resources/org/apache/lucene/morphology/russian/prefixes.info");
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,372 +0,0 @@
|
|||||||
[ ть
|
|
||||||
у
|
|
||||||
ем
|
|
||||||
ешь
|
|
||||||
ете
|
|
||||||
ет
|
|
||||||
ут
|
|
||||||
|
|
||||||
ла
|
|
||||||
ло
|
|
||||||
ли
|
|
||||||
я
|
|
||||||
ши
|
|
||||||
ем
|
|
||||||
емте
|
|
||||||
по ай
|
|
||||||
ь
|
|
||||||
по айте
|
|
||||||
ьте
|
|
||||||
ущий
|
|
||||||
ущего
|
|
||||||
ущему
|
|
||||||
ущего
|
|
||||||
ущий
|
|
||||||
ущим
|
|
||||||
ущем
|
|
||||||
ущая
|
|
||||||
ущей
|
|
||||||
ущей
|
|
||||||
ущую
|
|
||||||
ущей
|
|
||||||
ущею
|
|
||||||
ущей
|
|
||||||
ущее
|
|
||||||
ущего
|
|
||||||
ущему
|
|
||||||
ущее
|
|
||||||
ущим
|
|
||||||
ущем
|
|
||||||
ущие
|
|
||||||
ущих
|
|
||||||
ущим
|
|
||||||
ущих
|
|
||||||
ущие
|
|
||||||
ущими
|
|
||||||
ущих
|
|
||||||
ший
|
|
||||||
шего
|
|
||||||
шему
|
|
||||||
шего
|
|
||||||
ший
|
|
||||||
шим
|
|
||||||
шем
|
|
||||||
шая
|
|
||||||
шей
|
|
||||||
шей
|
|
||||||
шую
|
|
||||||
шей
|
|
||||||
шею
|
|
||||||
шей
|
|
||||||
шее
|
|
||||||
шего
|
|
||||||
шему
|
|
||||||
шее
|
|
||||||
шим
|
|
||||||
шем
|
|
||||||
шие
|
|
||||||
ших
|
|
||||||
шим
|
|
||||||
ших
|
|
||||||
шие
|
|
||||||
шими
|
|
||||||
ших]
|
|
||||||
[ большой
|
|
||||||
большого
|
|
||||||
большому
|
|
||||||
большого
|
|
||||||
большой
|
|
||||||
большим
|
|
||||||
большом
|
|
||||||
большая
|
|
||||||
большой
|
|
||||||
большой
|
|
||||||
большую
|
|
||||||
большой
|
|
||||||
большою
|
|
||||||
большой
|
|
||||||
большое
|
|
||||||
большого
|
|
||||||
большому
|
|
||||||
большое
|
|
||||||
большим
|
|
||||||
большом
|
|
||||||
большие
|
|
||||||
больших
|
|
||||||
большим
|
|
||||||
больших
|
|
||||||
большие
|
|
||||||
большими
|
|
||||||
больших
|
|
||||||
велик
|
|
||||||
велика
|
|
||||||
велико
|
|
||||||
велики
|
|
||||||
больше
|
|
||||||
по больше
|
|
||||||
наи больший
|
|
||||||
наи большего
|
|
||||||
наи большему
|
|
||||||
наи большего
|
|
||||||
наи больший
|
|
||||||
наи большим
|
|
||||||
наи большем
|
|
||||||
наи большая
|
|
||||||
наи большей
|
|
||||||
наи большей
|
|
||||||
наи большую
|
|
||||||
наи большей
|
|
||||||
наи большею
|
|
||||||
наи большей
|
|
||||||
наи большее
|
|
||||||
наи большего
|
|
||||||
наи большему
|
|
||||||
наи большее
|
|
||||||
наи большим
|
|
||||||
наи большем
|
|
||||||
наи большие
|
|
||||||
наи больших
|
|
||||||
наи большим
|
|
||||||
наи больших
|
|
||||||
наи большие
|
|
||||||
наи большими
|
|
||||||
наи больших]
|
|
||||||
[ вероятный
|
|
||||||
вероятного
|
|
||||||
вероятному
|
|
||||||
вероятного
|
|
||||||
вероятный
|
|
||||||
вероятным
|
|
||||||
вероятном
|
|
||||||
вероятная
|
|
||||||
вероятной
|
|
||||||
вероятной
|
|
||||||
вероятную
|
|
||||||
вероятной
|
|
||||||
вероятною
|
|
||||||
вероятной
|
|
||||||
вероятное
|
|
||||||
вероятного
|
|
||||||
вероятному
|
|
||||||
вероятное
|
|
||||||
вероятным
|
|
||||||
вероятном
|
|
||||||
вероятные
|
|
||||||
вероятных
|
|
||||||
вероятным
|
|
||||||
вероятных
|
|
||||||
вероятные
|
|
||||||
вероятными
|
|
||||||
вероятных
|
|
||||||
вероятен
|
|
||||||
вероятна
|
|
||||||
вероятно
|
|
||||||
вероятны
|
|
||||||
вероятнее
|
|
||||||
вероятней
|
|
||||||
по вероятнее
|
|
||||||
по вероятней
|
|
||||||
вероятнейший
|
|
||||||
наи невероятнейший
|
|
||||||
вероятнейшего
|
|
||||||
наи невероятнейшего
|
|
||||||
вероятнейшему
|
|
||||||
наи невероятнейшему
|
|
||||||
вероятнейшего
|
|
||||||
наи невероятнейшего
|
|
||||||
вероятнейший
|
|
||||||
наи невероятнейший
|
|
||||||
вероятнейшим
|
|
||||||
наи невероятнейшим
|
|
||||||
вероятнейшем
|
|
||||||
наи невероятнейшем
|
|
||||||
вероятнейшая
|
|
||||||
наи невероятнейшая
|
|
||||||
вероятнейшей
|
|
||||||
наи невероятнейшей
|
|
||||||
вероятнейшей
|
|
||||||
наи невероятнейшей
|
|
||||||
вероятнейшую
|
|
||||||
наи невероятнейшую
|
|
||||||
вероятнейшей
|
|
||||||
вероятнейшею
|
|
||||||
наи невероятнейшей
|
|
||||||
наи невероятнейшею
|
|
||||||
вероятнейшей
|
|
||||||
наи невероятнейшей
|
|
||||||
вероятнейшее
|
|
||||||
наи невероятнейшее
|
|
||||||
вероятнейшего
|
|
||||||
наи невероятнейшего
|
|
||||||
вероятнейшему
|
|
||||||
наи невероятнейшему
|
|
||||||
вероятнейшее
|
|
||||||
наи невероятнейшее
|
|
||||||
вероятнейшим
|
|
||||||
наи невероятнейшим
|
|
||||||
вероятнейшем
|
|
||||||
наи невероятнейшем
|
|
||||||
вероятнейшие
|
|
||||||
наи невероятнейшие
|
|
||||||
вероятнейших
|
|
||||||
наи невероятнейших
|
|
||||||
вероятнейшим
|
|
||||||
наи невероятнейшим
|
|
||||||
вероятнейших
|
|
||||||
наи невероятнейших
|
|
||||||
вероятнейшие
|
|
||||||
наи невероятнейшие
|
|
||||||
вероятнейшими
|
|
||||||
наи невероятнейшими
|
|
||||||
вероятнейших
|
|
||||||
наи невероятнейших]
|
|
||||||
[ аленький
|
|
||||||
аленького
|
|
||||||
аленькому
|
|
||||||
аленького
|
|
||||||
аленький
|
|
||||||
аленьким
|
|
||||||
аленьком
|
|
||||||
аленькая
|
|
||||||
аленькой
|
|
||||||
аленькой
|
|
||||||
аленькую
|
|
||||||
аленькой
|
|
||||||
аленькою
|
|
||||||
аленькой
|
|
||||||
аленькое
|
|
||||||
аленького
|
|
||||||
аленькому
|
|
||||||
аленькое
|
|
||||||
аленьким
|
|
||||||
аленьком
|
|
||||||
аленькие
|
|
||||||
аленьких
|
|
||||||
аленьким
|
|
||||||
аленьких
|
|
||||||
аленькие
|
|
||||||
аленькими
|
|
||||||
аленьких
|
|
||||||
ал
|
|
||||||
ала
|
|
||||||
ало
|
|
||||||
алы
|
|
||||||
еньше
|
|
||||||
по еньше
|
|
||||||
алейший
|
|
||||||
наи еньший
|
|
||||||
алейшего
|
|
||||||
наи еньшего
|
|
||||||
алейшему
|
|
||||||
наи еньшему
|
|
||||||
алейшего
|
|
||||||
наи еньшего
|
|
||||||
алейший
|
|
||||||
наи еньший
|
|
||||||
алейшим
|
|
||||||
наи еньшим
|
|
||||||
алейшем
|
|
||||||
наи еньшем
|
|
||||||
алейшая
|
|
||||||
наи еньшая
|
|
||||||
алейшей
|
|
||||||
наи еньшей
|
|
||||||
алейшей
|
|
||||||
наи еньшей
|
|
||||||
алейшую
|
|
||||||
наи еньшую
|
|
||||||
алейшей
|
|
||||||
алейшею
|
|
||||||
наи еньшей
|
|
||||||
наи еньшею
|
|
||||||
алейшей
|
|
||||||
наи еньшей
|
|
||||||
алейшее
|
|
||||||
наи еньшее
|
|
||||||
алейшего
|
|
||||||
наи еньшего
|
|
||||||
алейшему
|
|
||||||
наи еньшему
|
|
||||||
алейшее
|
|
||||||
наи еньшее
|
|
||||||
алейшим
|
|
||||||
наи еньшим
|
|
||||||
алейшем
|
|
||||||
наи еньшем
|
|
||||||
алейшие
|
|
||||||
наи еньшие
|
|
||||||
алейших
|
|
||||||
наи еньших
|
|
||||||
алейшим
|
|
||||||
наи еньшим
|
|
||||||
алейших
|
|
||||||
наи еньших
|
|
||||||
алейшие
|
|
||||||
наи еньшие
|
|
||||||
алейшими
|
|
||||||
наи еньшими
|
|
||||||
алейших
|
|
||||||
наи еньших]
|
|
||||||
[ ьный
|
|
||||||
ьного
|
|
||||||
ьному
|
|
||||||
ьного
|
|
||||||
ьный
|
|
||||||
ьным
|
|
||||||
ьном
|
|
||||||
ьная
|
|
||||||
ьной
|
|
||||||
ьной
|
|
||||||
ьную
|
|
||||||
ьной
|
|
||||||
ьною
|
|
||||||
ьной
|
|
||||||
ьное
|
|
||||||
ьного
|
|
||||||
ьному
|
|
||||||
ьное
|
|
||||||
ьным
|
|
||||||
ьном
|
|
||||||
ьные
|
|
||||||
ьных
|
|
||||||
ьным
|
|
||||||
ьных
|
|
||||||
ьные
|
|
||||||
ьными
|
|
||||||
ьных
|
|
||||||
ен
|
|
||||||
ьна
|
|
||||||
ьно
|
|
||||||
ьны
|
|
||||||
ьны
|
|
||||||
ьнее
|
|
||||||
ьней
|
|
||||||
по ьнее
|
|
||||||
по ьней
|
|
||||||
наи ьнейший
|
|
||||||
наи ьнейшего
|
|
||||||
наи ьнейшему
|
|
||||||
наи ьнейшего
|
|
||||||
наи ьнейший
|
|
||||||
наи ьнейшим
|
|
||||||
наи ьнейшем
|
|
||||||
наи ьнейшая
|
|
||||||
наи ьнейшей
|
|
||||||
наи ьнейшей
|
|
||||||
наи ьнейшую
|
|
||||||
наи ьнейшей
|
|
||||||
наи ьнейшею
|
|
||||||
наи ьнейшей
|
|
||||||
наи ьнейшее
|
|
||||||
наи ьнейшего
|
|
||||||
наи ьнейшему
|
|
||||||
наи ьнейшее
|
|
||||||
наи ьнейшим
|
|
||||||
наи ьнейшем
|
|
||||||
наи ьнейшие
|
|
||||||
наи ьнейших
|
|
||||||
наи ьнейшим
|
|
||||||
наи ьнейших
|
|
||||||
наи ьнейшие
|
|
||||||
наи ьнейшими
|
|
||||||
наи ьнейших]
|
|
@ -15,12 +15,12 @@
|
|||||||
*/
|
*/
|
||||||
package org.apache.lucene.morphology.english;
|
package org.apache.lucene.morphology.english;
|
||||||
|
|
||||||
import org.apache.lucene.morphology.Morphology;
|
import org.apache.lucene.morphology.MorphologyImpl;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
|
|
||||||
public class EnglishMorphology extends Morphology {
|
public class EnglishMorphology extends MorphologyImpl {
|
||||||
|
|
||||||
public EnglishMorphology() throws IOException {
|
public EnglishMorphology() throws IOException {
|
||||||
super(EnglishLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder());
|
super(EnglishLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder());
|
||||||
|
@ -23,7 +23,7 @@ import java.util.ArrayList;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
|
||||||
public class LuceneMorphology extends MorphologyWithPrefix {
|
public class LuceneMorphology extends MorphologyImpl {
|
||||||
|
|
||||||
public LuceneMorphology(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException {
|
public LuceneMorphology(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException {
|
||||||
super(fileName, decoderEncoder);
|
super(fileName, decoderEncoder);
|
||||||
@ -33,13 +33,15 @@ public class LuceneMorphology extends MorphologyWithPrefix {
|
|||||||
super(inputStream, decoderEncoder);
|
super(inputStream, decoderEncoder);
|
||||||
}
|
}
|
||||||
|
|
||||||
public LuceneMorphology(InputStream morphFormInputStream, InputStream prefixesInputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
|
|
||||||
super(morphFormInputStream, prefixesInputStream, decoderEncoder);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected String createForm(String form, String grammaInfo) {
|
public List<String> getMorhInfo(String s) {
|
||||||
return form;
|
ArrayList<String> result = new ArrayList<String>();
|
||||||
|
int[] ints = decoderEncoder.encodeToArray(revertWord(s));
|
||||||
|
int ruleId = findRuleId(ints);
|
||||||
|
for (Heuristic h : rules[rulesId[ruleId]]) {
|
||||||
|
result.add(h.transofrmWord(s));
|
||||||
|
}
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void readRules(BufferedReader bufferedReader) throws IOException {
|
protected void readRules(BufferedReader bufferedReader) throws IOException {
|
||||||
|
@ -15,200 +15,11 @@
|
|||||||
*/
|
*/
|
||||||
package org.apache.lucene.morphology;
|
package org.apache.lucene.morphology;
|
||||||
|
|
||||||
|
|
||||||
import java.io.*;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
|
||||||
public class Morphology {
|
public interface Morphology {
|
||||||
protected int[][] separators;
|
|
||||||
protected short[] rulesId;
|
|
||||||
protected Heuristic[][] rules;
|
|
||||||
protected String[] grammaInfo;
|
|
||||||
protected LetterDecoderEncoder decoderEncoder;
|
|
||||||
|
|
||||||
|
List<String> getMorhInfo(String s);
|
||||||
|
|
||||||
public Morphology(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException {
|
|
||||||
readFromFile(fileName);
|
|
||||||
this.decoderEncoder = decoderEncoder;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Morphology(InputStream inputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
|
|
||||||
readFromInputStream(inputStream);
|
|
||||||
this.decoderEncoder = decoderEncoder;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Morphology(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) {
|
|
||||||
this.separators = separators;
|
|
||||||
this.rulesId = rulesId;
|
|
||||||
this.rules = rules;
|
|
||||||
this.grammaInfo = grammaInfo;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int[][] getSeparators() {
|
|
||||||
return separators;
|
|
||||||
}
|
|
||||||
|
|
||||||
public short[] getRulesId() {
|
|
||||||
return rulesId;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Heuristic[][] getRules() {
|
|
||||||
return rules;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String[] getGrammaInfo() {
|
|
||||||
return grammaInfo;
|
|
||||||
}
|
|
||||||
|
|
||||||
public List<String> getMorhInfo(String s) {
|
|
||||||
ArrayList<String> result = new ArrayList<String>();
|
|
||||||
int[] ints = decoderEncoder.encodeToArray(revertWord(s));
|
|
||||||
int ruleId = findRuleId(ints);
|
|
||||||
for (Heuristic h : rules[rulesId[ruleId]]) {
|
|
||||||
result.add(createForm(h.transofrmWord(s),grammaInfo[h.getFormMorphInfo()]));
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected String createForm(String form,String grammaInfo){
|
|
||||||
return form+"|"+grammaInfo;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected int findRuleId(int[] ints) {
|
|
||||||
int low = 0;
|
|
||||||
int high = separators.length - 1;
|
|
||||||
int mid = 0;
|
|
||||||
while (low <= high) {
|
|
||||||
mid = (low + high) >>> 1;
|
|
||||||
int[] midVal = separators[mid];
|
|
||||||
|
|
||||||
int comResult = compareToInts(ints, midVal);
|
|
||||||
if (comResult > 0)
|
|
||||||
low = mid + 1;
|
|
||||||
else if (comResult < 0)
|
|
||||||
high = mid - 1;
|
|
||||||
else
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (compareToInts(ints, separators[mid]) >= 0) {
|
|
||||||
return mid;
|
|
||||||
} else {
|
|
||||||
return mid - 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
private int compareToInts(int[] i1, int[] i2) {
|
|
||||||
int minLength = Math.min(i1.length, i2.length);
|
|
||||||
for (int i = 0; i < minLength; i++) {
|
|
||||||
int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1);
|
|
||||||
if (i3 != 0) return i3;
|
|
||||||
}
|
|
||||||
return i1.length - i2.length;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void writeToFile(String fileName) throws IOException {
|
|
||||||
OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8");
|
|
||||||
writer.write(separators.length + "\n");
|
|
||||||
for (int[] i : separators) {
|
|
||||||
writer.write(i.length + "\n");
|
|
||||||
for (int j : i) {
|
|
||||||
writer.write(j + "\n");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (short i : rulesId) {
|
|
||||||
writer.write(i + "\n");
|
|
||||||
}
|
|
||||||
writer.write(rules.length + "\n");
|
|
||||||
for (Heuristic[] heuristics : rules) {
|
|
||||||
writer.write(heuristics.length + "\n");
|
|
||||||
for (Heuristic heuristic : heuristics) {
|
|
||||||
writer.write(heuristic.toString() + "\n");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
writer.write(grammaInfo.length + "\n");
|
|
||||||
for (String s : grammaInfo) {
|
|
||||||
writer.write(s + "\n");
|
|
||||||
}
|
|
||||||
writer.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void readFromFile(String fileName) throws IOException {
|
|
||||||
FileInputStream inputStream = new FileInputStream(fileName);
|
|
||||||
readFromInputStream(inputStream);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void readFromInputStream(InputStream inputStream) throws IOException {
|
|
||||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
|
|
||||||
String s = bufferedReader.readLine();
|
|
||||||
Integer amount = Integer.valueOf(s);
|
|
||||||
|
|
||||||
readSeparators(bufferedReader, amount);
|
|
||||||
|
|
||||||
readRulesId(bufferedReader, amount);
|
|
||||||
|
|
||||||
readRules(bufferedReader);
|
|
||||||
readGrammaInfo(bufferedReader);
|
|
||||||
bufferedReader.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
private void readGrammaInfo(BufferedReader bufferedReader) throws IOException {
|
|
||||||
String s;
|
|
||||||
Integer amount;
|
|
||||||
s = bufferedReader.readLine();
|
|
||||||
amount = Integer.valueOf(s);
|
|
||||||
grammaInfo = new String[amount];
|
|
||||||
for (int i = 0; i < amount; i++) {
|
|
||||||
grammaInfo[i] = bufferedReader.readLine();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
protected void readRules(BufferedReader bufferedReader) throws IOException {
|
|
||||||
String s;
|
|
||||||
Integer amount;
|
|
||||||
s = bufferedReader.readLine();
|
|
||||||
amount = Integer.valueOf(s);
|
|
||||||
rules = new Heuristic[amount][];
|
|
||||||
for (int i = 0; i < amount; i++) {
|
|
||||||
String s1 = bufferedReader.readLine();
|
|
||||||
Integer ruleLenght = Integer.valueOf(s1);
|
|
||||||
rules[i] = new Heuristic[ruleLenght];
|
|
||||||
for (int j = 0; j < ruleLenght; j++) {
|
|
||||||
rules[i][j] = new Heuristic(bufferedReader.readLine());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void readRulesId(BufferedReader bufferedReader, Integer amount) throws IOException {
|
|
||||||
rulesId = new short[amount];
|
|
||||||
for (int i = 0; i < amount; i++) {
|
|
||||||
String s1 = bufferedReader.readLine();
|
|
||||||
rulesId[i] = Short.valueOf(s1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void readSeparators(BufferedReader bufferedReader, Integer amount) throws IOException {
|
|
||||||
HashSet intetger = new HashSet<Integer>();
|
|
||||||
separators = new int[amount][];
|
|
||||||
for (int i = 0; i < amount; i++) {
|
|
||||||
String s1 = bufferedReader.readLine();
|
|
||||||
Integer wordLenght = Integer.valueOf(s1);
|
|
||||||
separators[i] = new int[wordLenght];
|
|
||||||
for (int j = 0; j < wordLenght; j++) {
|
|
||||||
separators[i][j] = Integer.valueOf(bufferedReader.readLine());
|
|
||||||
}
|
|
||||||
intetger.add(separators[i][0]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
protected String revertWord(String s) {
|
|
||||||
String result = "";
|
|
||||||
for (int i = 1; i <= s.length(); i++) {
|
|
||||||
result += s.charAt(s.length() - i);
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,210 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.morphology;
|
||||||
|
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
|
||||||
|
public class MorphologyImpl implements Morphology {
|
||||||
|
protected int[][] separators;
|
||||||
|
protected short[] rulesId;
|
||||||
|
protected Heuristic[][] rules;
|
||||||
|
protected String[] grammaInfo;
|
||||||
|
protected LetterDecoderEncoder decoderEncoder;
|
||||||
|
|
||||||
|
|
||||||
|
public MorphologyImpl(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException {
|
||||||
|
readFromFile(fileName);
|
||||||
|
this.decoderEncoder = decoderEncoder;
|
||||||
|
}
|
||||||
|
|
||||||
|
public MorphologyImpl(InputStream inputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
|
||||||
|
readFromInputStream(inputStream);
|
||||||
|
this.decoderEncoder = decoderEncoder;
|
||||||
|
}
|
||||||
|
|
||||||
|
public MorphologyImpl(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) {
|
||||||
|
this.separators = separators;
|
||||||
|
this.rulesId = rulesId;
|
||||||
|
this.rules = rules;
|
||||||
|
this.grammaInfo = grammaInfo;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int[][] getSeparators() {
|
||||||
|
return separators;
|
||||||
|
}
|
||||||
|
|
||||||
|
public short[] getRulesId() {
|
||||||
|
return rulesId;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Heuristic[][] getRules() {
|
||||||
|
return rules;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String[] getGrammaInfo() {
|
||||||
|
return grammaInfo;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getMorhInfo(String s) {
|
||||||
|
ArrayList<String> result = new ArrayList<String>();
|
||||||
|
int[] ints = decoderEncoder.encodeToArray(revertWord(s));
|
||||||
|
int ruleId = findRuleId(ints);
|
||||||
|
for (Heuristic h : rules[rulesId[ruleId]]) {
|
||||||
|
result.add(h.transofrmWord(s) + "|" + grammaInfo[h.getFormMorphInfo()]);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected int findRuleId(int[] ints) {
|
||||||
|
int low = 0;
|
||||||
|
int high = separators.length - 1;
|
||||||
|
int mid = 0;
|
||||||
|
while (low <= high) {
|
||||||
|
mid = (low + high) >>> 1;
|
||||||
|
int[] midVal = separators[mid];
|
||||||
|
|
||||||
|
int comResult = compareToInts(ints, midVal);
|
||||||
|
if (comResult > 0)
|
||||||
|
low = mid + 1;
|
||||||
|
else if (comResult < 0)
|
||||||
|
high = mid - 1;
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (compareToInts(ints, separators[mid]) >= 0) {
|
||||||
|
return mid;
|
||||||
|
} else {
|
||||||
|
return mid - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private int compareToInts(int[] i1, int[] i2) {
|
||||||
|
int minLength = Math.min(i1.length, i2.length);
|
||||||
|
for (int i = 0; i < minLength; i++) {
|
||||||
|
int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1);
|
||||||
|
if (i3 != 0) return i3;
|
||||||
|
}
|
||||||
|
return i1.length - i2.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void writeToFile(String fileName) throws IOException {
|
||||||
|
OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8");
|
||||||
|
writer.write(separators.length + "\n");
|
||||||
|
for (int[] i : separators) {
|
||||||
|
writer.write(i.length + "\n");
|
||||||
|
for (int j : i) {
|
||||||
|
writer.write(j + "\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (short i : rulesId) {
|
||||||
|
writer.write(i + "\n");
|
||||||
|
}
|
||||||
|
writer.write(rules.length + "\n");
|
||||||
|
for (Heuristic[] heuristics : rules) {
|
||||||
|
writer.write(heuristics.length + "\n");
|
||||||
|
for (Heuristic heuristic : heuristics) {
|
||||||
|
writer.write(heuristic.toString() + "\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
writer.write(grammaInfo.length + "\n");
|
||||||
|
for (String s : grammaInfo) {
|
||||||
|
writer.write(s + "\n");
|
||||||
|
}
|
||||||
|
writer.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void readFromFile(String fileName) throws IOException {
|
||||||
|
FileInputStream inputStream = new FileInputStream(fileName);
|
||||||
|
readFromInputStream(inputStream);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void readFromInputStream(InputStream inputStream) throws IOException {
|
||||||
|
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
|
||||||
|
String s = bufferedReader.readLine();
|
||||||
|
Integer amount = Integer.valueOf(s);
|
||||||
|
|
||||||
|
readSeparators(bufferedReader, amount);
|
||||||
|
|
||||||
|
readRulesId(bufferedReader, amount);
|
||||||
|
|
||||||
|
readRules(bufferedReader);
|
||||||
|
readGrammaInfo(bufferedReader);
|
||||||
|
bufferedReader.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void readGrammaInfo(BufferedReader bufferedReader) throws IOException {
|
||||||
|
String s;
|
||||||
|
Integer amount;
|
||||||
|
s = bufferedReader.readLine();
|
||||||
|
amount = Integer.valueOf(s);
|
||||||
|
grammaInfo = new String[amount];
|
||||||
|
for (int i = 0; i < amount; i++) {
|
||||||
|
grammaInfo[i] = bufferedReader.readLine();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void readRules(BufferedReader bufferedReader) throws IOException {
|
||||||
|
String s;
|
||||||
|
Integer amount;
|
||||||
|
s = bufferedReader.readLine();
|
||||||
|
amount = Integer.valueOf(s);
|
||||||
|
rules = new Heuristic[amount][];
|
||||||
|
for (int i = 0; i < amount; i++) {
|
||||||
|
String s1 = bufferedReader.readLine();
|
||||||
|
Integer ruleLenght = Integer.valueOf(s1);
|
||||||
|
rules[i] = new Heuristic[ruleLenght];
|
||||||
|
for (int j = 0; j < ruleLenght; j++) {
|
||||||
|
rules[i][j] = new Heuristic(bufferedReader.readLine());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void readRulesId(BufferedReader bufferedReader, Integer amount) throws IOException {
|
||||||
|
rulesId = new short[amount];
|
||||||
|
for (int i = 0; i < amount; i++) {
|
||||||
|
String s1 = bufferedReader.readLine();
|
||||||
|
rulesId[i] = Short.valueOf(s1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void readSeparators(BufferedReader bufferedReader, Integer amount) throws IOException {
|
||||||
|
HashSet intetger = new HashSet<Integer>();
|
||||||
|
separators = new int[amount][];
|
||||||
|
for (int i = 0; i < amount; i++) {
|
||||||
|
String s1 = bufferedReader.readLine();
|
||||||
|
Integer wordLenght = Integer.valueOf(s1);
|
||||||
|
separators[i] = new int[wordLenght];
|
||||||
|
for (int j = 0; j < wordLenght; j++) {
|
||||||
|
separators[i][j] = Integer.valueOf(bufferedReader.readLine());
|
||||||
|
}
|
||||||
|
intetger.add(separators[i][0]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected String revertWord(String s) {
|
||||||
|
String result = "";
|
||||||
|
for (int i = 1; i <= s.length(); i++) {
|
||||||
|
result += s.charAt(s.length() - i);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
@ -1,96 +0,0 @@
|
|||||||
/**
|
|
||||||
* Copyright 2009 Alexander Kuznetsov
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.morphology;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStream;
|
|
||||||
import java.io.BufferedReader;
|
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.util.*;
|
|
||||||
|
|
||||||
|
|
||||||
public class MorphologyWithPrefix extends Morphology {
|
|
||||||
private Map<String, PrefixRule> prefixRuleMap = new HashMap<String, PrefixRule>();
|
|
||||||
|
|
||||||
public MorphologyWithPrefix(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException {
|
|
||||||
super(fileName, decoderEncoder);
|
|
||||||
}
|
|
||||||
|
|
||||||
public MorphologyWithPrefix(InputStream morphFormInputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
|
|
||||||
super(morphFormInputStream, decoderEncoder);
|
|
||||||
}
|
|
||||||
|
|
||||||
public MorphologyWithPrefix(InputStream morphFormInputStream,InputStream prefixesInputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
|
|
||||||
super(morphFormInputStream, decoderEncoder);
|
|
||||||
readPrefixes(prefixesInputStream);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void readPrefixes(InputStream inputStream) throws IOException {
|
|
||||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
|
|
||||||
Integer prefixAmount = Integer.parseInt(bufferedReader.readLine());
|
|
||||||
for(int i = 0; i < prefixAmount;i++){
|
|
||||||
PrefixRule prefixRule = readPrefix(bufferedReader);
|
|
||||||
prefixRuleMap.put(prefixRule.getHashString(),prefixRule);
|
|
||||||
}
|
|
||||||
bufferedReader.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
private PrefixRule readPrefix(BufferedReader bufferedReader) throws IOException {
|
|
||||||
PrefixRule prefixRule = new PrefixRule();
|
|
||||||
String s = bufferedReader.readLine();
|
|
||||||
prefixRule.setPrefix(s);
|
|
||||||
s = bufferedReader.readLine();
|
|
||||||
prefixRule.setLastLetter(s.charAt(0));
|
|
||||||
HashSet<Short> morph = new HashSet<Short>();
|
|
||||||
int formAmount = Integer.valueOf(bufferedReader.readLine());
|
|
||||||
for(int i = 0; i < formAmount; i++){
|
|
||||||
morph.add(Short.valueOf(bufferedReader.readLine()));
|
|
||||||
}
|
|
||||||
prefixRule.setForms(morph);
|
|
||||||
return prefixRule;
|
|
||||||
}
|
|
||||||
|
|
||||||
public MorphologyWithPrefix(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) {
|
|
||||||
super(separators, rulesId, rules, grammaInfo);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public List<String> getMorhInfo(String s) {
|
|
||||||
if (prefixRuleMap.size() == 0 || s.length() < 4) {
|
|
||||||
return super.getMorhInfo(s);
|
|
||||||
}
|
|
||||||
String ruleIndex = "" + s.charAt(0) + s.charAt(s.length() - 1);
|
|
||||||
PrefixRule prefixRule = prefixRuleMap.get(ruleIndex);
|
|
||||||
if (prefixRule == null) {
|
|
||||||
return super.getMorhInfo(s);
|
|
||||||
}
|
|
||||||
if (!s.startsWith(prefixRule.getPrefix())) {
|
|
||||||
return super.getMorhInfo(s);
|
|
||||||
}
|
|
||||||
String sWithoutPrefix = s.substring(prefixRule.getPrefix().length());
|
|
||||||
|
|
||||||
int[] ints = decoderEncoder.encodeToArray(revertWord(sWithoutPrefix));
|
|
||||||
int ruleId = findRuleId(ints);
|
|
||||||
ArrayList<String> result = new ArrayList<String>();
|
|
||||||
for (Heuristic h : rules[rulesId[ruleId]]) {
|
|
||||||
//String morphInfo = grammaInfo[];
|
|
||||||
if(prefixRule.getForms().contains(h.getFormMorphInfo())){
|
|
||||||
result.add(createForm(h.transofrmWord(sWithoutPrefix),"pr"));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return result.size() > 0 ? result : super.getMorhInfo(s);
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,76 +0,0 @@
|
|||||||
/**
|
|
||||||
* Copyright 2009 Alexander Kuznetsov
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.morphology;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.HashSet;
|
|
||||||
|
|
||||||
|
|
||||||
public class PrefixRule implements Serializable {
|
|
||||||
private Character lastLetter;
|
|
||||||
private String prefix;
|
|
||||||
private HashSet<Short> forms;
|
|
||||||
|
|
||||||
public Character getLastLetter() {
|
|
||||||
return lastLetter;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setLastLetter(Character lastLetter) {
|
|
||||||
this.lastLetter = lastLetter;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getPrefix() {
|
|
||||||
return prefix;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setPrefix(String prefix) {
|
|
||||||
this.prefix = prefix;
|
|
||||||
}
|
|
||||||
|
|
||||||
public HashSet<Short> getForms() {
|
|
||||||
return forms;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setForms(HashSet<Short> forms) {
|
|
||||||
this.forms = forms;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getHashString() {
|
|
||||||
return "" + prefix.charAt(0) + lastLetter;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean equals(Object o) {
|
|
||||||
if (this == o) return true;
|
|
||||||
if (o == null || getClass() != o.getClass()) return false;
|
|
||||||
|
|
||||||
PrefixRule that = (PrefixRule) o;
|
|
||||||
|
|
||||||
if (forms != null ? !forms.equals(that.forms) : that.forms != null) return false;
|
|
||||||
if (lastLetter != null ? !lastLetter.equals(that.lastLetter) : that.lastLetter != null) return false;
|
|
||||||
if (prefix != null ? !prefix.equals(that.prefix) : that.prefix != null) return false;
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int hashCode() {
|
|
||||||
int result = lastLetter != null ? lastLetter.hashCode() : 0;
|
|
||||||
result = 31 * result + (prefix != null ? prefix.hashCode() : 0);
|
|
||||||
result = 31 * result + (forms != null ? forms.hashCode() : 0);
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
}
|
|
@ -22,6 +22,6 @@ import java.io.IOException;
|
|||||||
public class RussianLuceneMorphology extends LuceneMorphology {
|
public class RussianLuceneMorphology extends LuceneMorphology {
|
||||||
|
|
||||||
public RussianLuceneMorphology() throws IOException {
|
public RussianLuceneMorphology() throws IOException {
|
||||||
super(RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"),RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/prefixes.info"), new RussianLetterDecoderEncoder());
|
super(RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder());
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -15,11 +15,11 @@
|
|||||||
*/
|
*/
|
||||||
package org.apache.lucene.morphology.russian;
|
package org.apache.lucene.morphology.russian;
|
||||||
|
|
||||||
import org.apache.lucene.morphology.Morphology;
|
import org.apache.lucene.morphology.MorphologyImpl;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
public class RussianMorphology extends Morphology {
|
public class RussianMorphology extends MorphologyImpl {
|
||||||
|
|
||||||
public RussianMorphology() throws IOException {
|
public RussianMorphology() throws IOException {
|
||||||
super(RussianMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder());
|
super(RussianMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder());
|
||||||
|
@ -1,60 +0,0 @@
|
|||||||
/**
|
|
||||||
* Copyright 2009 Alexander Kuznetsov
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.morphology.russian;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
|
||||||
import org.apache.lucene.analysis.Token;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.FileInputStream;
|
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.util.HashSet;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Created by IntelliJ IDEA.
|
|
||||||
* User: akuznetsov
|
|
||||||
* Date: 31.10.2009
|
|
||||||
* Time: 14:01:11
|
|
||||||
* To change this template use File | Settings | File Templates.
|
|
||||||
*/
|
|
||||||
public class TestSpeed {
|
|
||||||
|
|
||||||
public static void main(String[] args) throws IOException {
|
|
||||||
RussianAnalayzer russianAnalayzer = new RussianAnalayzer();
|
|
||||||
bookProccess(russianAnalayzer, "C:/tmp/_Aleksandr_Suhov_Tanets_na_raskalennyih_uglyah1.fb2");
|
|
||||||
Long stat = System.currentTimeMillis();
|
|
||||||
bookProccess(russianAnalayzer, "C:/tmp/_Aleksandr_Suhov_Tanets_na_raskalennyih_uglyah1.fb2");
|
|
||||||
System.out.println("Done in " + (System.currentTimeMillis() - stat));
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void bookProccess(RussianAnalayzer russianAnalayzer, String bookName) throws IOException {
|
|
||||||
FileInputStream inputStream = new FileInputStream(bookName);
|
|
||||||
TokenStream tokenStream = russianAnalayzer.tokenStream(null,new InputStreamReader(inputStream,"UTF-8"));
|
|
||||||
final Token reusableToken = new Token();
|
|
||||||
long count = 0;
|
|
||||||
Token nextToken;
|
|
||||||
for (; ;) {
|
|
||||||
nextToken = tokenStream.next(reusableToken);
|
|
||||||
// System.out.println(" " + nextToken.term());
|
|
||||||
count++;
|
|
||||||
if (nextToken == null) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
//System.out.println("Words " + count);
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,96 +0,0 @@
|
|||||||
11
|
|
||||||
наи
|
|
||||||
е
|
|
||||||
8
|
|
||||||
258
|
|
||||||
255
|
|
||||||
289
|
|
||||||
252
|
|
||||||
292
|
|
||||||
262
|
|
||||||
296
|
|
||||||
286
|
|
||||||
наи
|
|
||||||
и
|
|
||||||
2
|
|
||||||
263
|
|
||||||
297
|
|
||||||
наи
|
|
||||||
ю
|
|
||||||
4
|
|
||||||
250
|
|
||||||
249
|
|
||||||
283
|
|
||||||
284
|
|
||||||
по
|
|
||||||
й
|
|
||||||
5
|
|
||||||
250
|
|
||||||
251
|
|
||||||
248
|
|
||||||
247
|
|
||||||
269
|
|
||||||
по
|
|
||||||
е
|
|
||||||
3
|
|
||||||
255
|
|
||||||
252
|
|
||||||
269
|
|
||||||
наи
|
|
||||||
й
|
|
||||||
12
|
|
||||||
239
|
|
||||||
273
|
|
||||||
250
|
|
||||||
251
|
|
||||||
248
|
|
||||||
277
|
|
||||||
247
|
|
||||||
282
|
|
||||||
281
|
|
||||||
243
|
|
||||||
285
|
|
||||||
284
|
|
||||||
наи
|
|
||||||
о
|
|
||||||
6
|
|
||||||
274
|
|
||||||
253
|
|
||||||
276
|
|
||||||
287
|
|
||||||
242
|
|
||||||
240
|
|
||||||
наи
|
|
||||||
м
|
|
||||||
10
|
|
||||||
256
|
|
||||||
290
|
|
||||||
257
|
|
||||||
291
|
|
||||||
279
|
|
||||||
278
|
|
||||||
294
|
|
||||||
260
|
|
||||||
244
|
|
||||||
245
|
|
||||||
наи
|
|
||||||
х
|
|
||||||
6
|
|
||||||
259
|
|
||||||
293
|
|
||||||
261
|
|
||||||
295
|
|
||||||
264
|
|
||||||
298
|
|
||||||
наи
|
|
||||||
я
|
|
||||||
2
|
|
||||||
246
|
|
||||||
280
|
|
||||||
наи
|
|
||||||
у
|
|
||||||
4
|
|
||||||
275
|
|
||||||
254
|
|
||||||
288
|
|
||||||
241
|
|
@ -33,7 +33,7 @@ public class RussianLuceneMorphTest {
|
|||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void setUp() throws IOException {
|
public void setUp() throws IOException {
|
||||||
luceneMorph = new RussianLuceneMorphology();
|
luceneMorph = new LuceneMorphology(this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
наилучший хороший
|
|
||||||
еду еда ехать
|
еду еда ехать
|
||||||
тестов тест
|
тестов тест
|
||||||
вина вино вина
|
вина вино вина
|
||||||
@ -18,7 +17,3 @@
|
|||||||
лучший хороший
|
лучший хороший
|
||||||
на на
|
на на
|
||||||
тест тест тесто
|
тест тест тесто
|
||||||
спам спам
|
|
||||||
спама спам
|
|
||||||
наигранный наигранный
|
|
||||||
наивный наивный
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user