Rollback of the wrong version of morphology; adding an interface for morphology

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@88 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
alexander.a.kuznetsov 2009-11-17 14:03:59 +00:00
parent 16613c543b
commit 1273cf96ed
19 changed files with 263 additions and 1145 deletions

0
1.txt
View File

View File

@ -31,9 +31,9 @@ import java.util.*;
public class DictonaryReader { public class DictonaryReader {
private String fileName; private String fileName;
private String fileEncoding = "windows-1251"; private String fileEncoding = "windows-1251";
protected List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>(); private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
protected List<List<String>> wordPrefixes = new ArrayList<List<String>>(); private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
protected Set<String> ingnoredForm = new HashSet<String>(); private Set<String> ingnoredForm = new HashSet<String>();
public DictonaryReader(String fileName, Set<String> ingnoredForm) { public DictonaryReader(String fileName, Set<String> ingnoredForm) {
this.fileName = fileName; this.fileName = fileName;
@ -57,7 +57,7 @@ public class DictonaryReader {
} }
protected void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException { private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
String s = reader.readLine(); String s = reader.readLine();
int count = Integer.valueOf(s); int count = Integer.valueOf(s);
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
@ -81,7 +81,7 @@ public class DictonaryReader {
} }
protected void sckipBlock(BufferedReader reader) throws IOException { private void sckipBlock(BufferedReader reader) throws IOException {
String s = reader.readLine(); String s = reader.readLine();
int count = Integer.valueOf(s); int count = Integer.valueOf(s);
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
@ -90,7 +90,7 @@ public class DictonaryReader {
} }
protected void readPrefix(BufferedReader reader) throws IOException { private void readPrefix(BufferedReader reader) throws IOException {
String s = reader.readLine(); String s = reader.readLine();
int count = Integer.valueOf(s); int count = Integer.valueOf(s);
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
@ -99,7 +99,7 @@ public class DictonaryReader {
} }
} }
protected void readFlexias(BufferedReader reader) throws IOException { private void readFlexias(BufferedReader reader) throws IOException {
String s = reader.readLine(); String s = reader.readLine();
int count = Integer.valueOf(s); int count = Integer.valueOf(s);
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
@ -112,7 +112,7 @@ public class DictonaryReader {
} }
} }
protected void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) { private void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
String[] fl = line.split("\\*"); String[] fl = line.split("\\*");
// we inored all forms thats // we inored all forms thats
if (fl.length == 3) { if (fl.length == 3) {

View File

@ -60,28 +60,6 @@ public class FlexiaModel {
@Override @Override
public String toString() { public String toString() {
return prefix + " " + suffix + " " + code; return prefix + " " + suffix;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
FlexiaModel that = (FlexiaModel) o;
if (code != null ? !code.equals(that.code) : that.code != null) return false;
if (prefix != null ? !prefix.equals(that.prefix) : that.prefix != null) return false;
if (suffix != null ? !suffix.equals(that.suffix) : that.suffix != null) return false;
return true;
}
@Override
public int hashCode() {
int result = code != null ? code.hashCode() : 0;
result = 31 * result + (suffix != null ? suffix.hashCode() : 0);
result = 31 * result + (prefix != null ? prefix.hashCode() : 0);
return result;
} }
} }

View File

@ -1,139 +0,0 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.dictionary;
import org.apache.lucene.morphology.PrefixRule;
import java.util.*;
import java.io.*;
/**
 * Dictionary reader that only looks at the flexia section of the dictionary and derives
 * {@link PrefixRule}s from it.
 *
 * <p>For every flexia that carries a prefix, the flexias of the same paradigm that have the same
 * suffix but no prefix are collected. Rules are keyed by (prefix, last letter of the suffix); the
 * map relies on {@link FlexiaModel} value equality.
 */
public class PrefixesRulesBuilder extends DictonaryReader {
    private GrammaReader grammaInfo;
    /** Key: synthetic "pr" flexia (prefix + last suffix letter); value: matching prefix-less flexias. */
    private Map<FlexiaModel, Set<FlexiaModel>> rules = new HashMap<FlexiaModel, Set<FlexiaModel>>();

    /**
     * @param fileName     dictionary file to read
     * @param fileEncoding encoding of the dictionary file
     * @param ingnoredForm forms to ignore, passed through to the parent reader
     * @throws IOException if the grammar table cannot be read
     */
    public PrefixesRulesBuilder(String fileName, String fileEncoding, Set<String> ingnoredForm) throws IOException {
        super(fileName, fileEncoding, ingnoredForm);
        grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
    }

    /**
     * Runs the parent processing, then prints the number of collected rules and the rules
     * themselves to standard output.
     */
    @Override
    public void proccess(WordProccessor wordProccessor) throws IOException {
        super.proccess(wordProccessor);
        System.out.println(rules.size());
        System.out.println(rules);
    }

    /**
     * Converts the collected rule map into a list of {@link PrefixRule}s.
     *
     * @return one rule per collected key; the forms of a rule are the grammar indexes of the
     *         flexias gathered for that key
     */
    public List<PrefixRule> getPrefixRules() {
        List<PrefixRule> prefixRules = new ArrayList<PrefixRule>();
        for (Map.Entry<FlexiaModel, Set<FlexiaModel>> entry : rules.entrySet()) {
            FlexiaModel key = entry.getKey();
            PrefixRule prefixRule = new PrefixRule();
            prefixRule.setPrefix(key.getPrefix());
            // Keys are built by convertForKey, whose suffix is a single letter.
            prefixRule.setLastLetter(key.getSuffix().charAt(0));
            HashSet<Short> forms = new HashSet<Short>();
            for (FlexiaModel fm : entry.getValue()) {
                int gi = grammaInfo.getGrammInversIndex().get(fm.getCode());
                forms.add((short) gi);
            }
            prefixRule.setForms(forms);
            prefixRules.add(prefixRule);
        }
        return prefixRules;
    }

    /** The word section is not needed for prefix rules, so it is skipped. */
    @Override
    protected void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
        sckipBlock(reader);
    }

    /** The prefix section is not needed for prefix rules, so it is skipped. */
    @Override
    protected void readPrefix(BufferedReader reader) throws IOException {
        sckipBlock(reader);
    }

    /** Reads the flexias via the parent, then analyses every paradigm for prefixed forms. */
    @Override
    protected void readFlexias(BufferedReader reader) throws IOException {
        super.readFlexias(reader);
        //todo research flexias
        for (List<FlexiaModel> fmList : wordsFlexias) {
            research(fmList);
        }
    }

    /** Checks every flexia of one paradigm that has a non-empty prefix. */
    private void research(List<FlexiaModel> models) {
        for (FlexiaModel fm : models) {
            if (fm.getPrefix().length() > 0) {
                testFlexia(models, fm);
            }
        }
    }

    /**
     * Records, under the key of the prefixed flexia {@code fm}, every flexia of the same paradigm
     * that has the same suffix and no prefix.
     */
    private void testFlexia(List<FlexiaModel> models, FlexiaModel fm) {
        for (FlexiaModel com : models) {
            if (com.getSuffix().equals(fm.getSuffix()) && com.getPrefix().length() == 0) {
                // Built only on a match, exactly where the key is needed.
                FlexiaModel key = convertForKey(fm);
                Set<FlexiaModel> matches = rules.get(key);
                if (matches == null) {
                    matches = new HashSet<FlexiaModel>();
                    rules.put(key, matches);
                }
                matches.add(convert(com));
            }
        }
    }

    /**
     * Reduces a flexia to (code, last suffix letter, prefix).
     * Throws StringIndexOutOfBoundsException when the suffix is empty.
     */
    private FlexiaModel convert(FlexiaModel fm) {
        String suf = fm.getSuffix();
        return new FlexiaModel(fm.getCode(), "" + suf.charAt(suf.length() - 1), fm.getPrefix());
    }

    /**
     * Same reduction as {@link #convert(FlexiaModel)} but with the fixed code {@code "pr"}, so
     * that flexias differing only in grammar code share one key.
     */
    private FlexiaModel convertForKey(FlexiaModel fm) {
        String suf = fm.getSuffix();
        return new FlexiaModel("pr", "" + suf.charAt(suf.length() - 1), fm.getPrefix());
    }

    /**
     * Parses one {@code *}-separated flexia line. Three fields give a flexia with a prefix, two
     * fields give a flexia with an empty prefix; any other field count is ignored.
     */
    protected void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
        String[] fl = line.split("\\*");
        if (fl.length == 3) {
            flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
        }
        if (fl.length == 2) {
            flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
        }
    }

    /**
     * Writes the prefix rules to {@code fileName} as UTF-8 text: the rule count on the first
     * line, followed by each rule.
     *
     * @param fileName path of the file to create or overwrite
     * @throws IOException if the file cannot be written
     */
    public void savePrefixes(String fileName) throws IOException {
        // Build the rules first so that a failure here does not leave a truncated output file.
        List<PrefixRule> prefixRuleList = getPrefixRules();
        OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8");
        try {
            writer.write(prefixRuleList.size() + "\n");
            for (PrefixRule pr : prefixRuleList) {
                writePrefixRule(writer, pr);
            }
        } finally {
            // Close even when a write fails, so the file handle is not leaked.
            writer.close();
        }
    }

    /** Writes one rule: prefix, last letter, number of forms, then one form per line. */
    private void writePrefixRule(OutputStreamWriter writer, PrefixRule pr) throws IOException {
        writer.write(pr.getPrefix() + "\n");
        writer.write(pr.getLastLetter() + "\n");
        HashSet<Short> formInfo = pr.getForms();
        writer.write(formInfo.size() + "\n");
        for (Short s : formInfo) {
            writer.write(s + "\n");
        }
    }
}

View File

@ -19,7 +19,7 @@ package org.apache.lucene.morphology.dictionary;
import org.apache.lucene.morphology.Heuristic; import org.apache.lucene.morphology.Heuristic;
import org.apache.lucene.morphology.LetterDecoderEncoder; import org.apache.lucene.morphology.LetterDecoderEncoder;
import org.apache.lucene.morphology.Morphology; import org.apache.lucene.morphology.MorphologyImpl;
import java.io.IOException; import java.io.IOException;
import java.util.*; import java.util.*;
@ -119,7 +119,7 @@ public class StatiticsCollector implements WordProccessor {
prevSet = currentSet; prevSet = currentSet;
} }
} }
Morphology morphology = new Morphology(ints, rulesId, heuristics, grammaReader.getGrammaInfoAsArray()); MorphologyImpl morphology = new MorphologyImpl(ints, rulesId, heuristics, grammaReader.getGrammaInfoAsArray());
morphology.writeToFile(fileName); morphology.writeToFile(fileName);
} }

View File

@ -1,39 +0,0 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.generator;
import org.apache.lucene.morphology.dictionary.*;
import java.io.IOException;
import java.util.HashSet;
/**
 * Command-line entry point that reads the Russian source dictionary and writes the derived
 * prefix rules into the resources of the {@code russian} module.
 */
public class RussianPrefixesBuilder {

    /**
     * Builds the prefix rules and saves them.
     *
     * @param args not used
     * @throws IOException if the dictionary cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws IOException {
        HashSet<String> noIgnoredForms = new HashSet<String>();
        PrefixesRulesBuilder rulesBuilder = new PrefixesRulesBuilder(
                "dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", "windows-1251", noIgnoredForms);

        // The builder needs a word processor, but this tool does nothing with the words.
        WordProccessor ignoreWords = new WordProccessor() {
            public void proccess(WordCard wordCard) throws IOException {
            }
        };
        rulesBuilder.proccess(ignoreWords);

        rulesBuilder.savePrefixes("russian/src/main/resources/org/apache/lucene/morphology/russian/prefixes.info");
    }
}

View File

@ -1,372 +0,0 @@
[ ть
у
ем
ешь
ете
ет
ут
ла
ло
ли
я
ши
ем
емте
по ай
ь
по айте
ьте
ущий
ущего
ущему
ущего
ущий
ущим
ущем
ущая
ущей
ущей
ущую
ущей
ущею
ущей
ущее
ущего
ущему
ущее
ущим
ущем
ущие
ущих
ущим
ущих
ущие
ущими
ущих
ший
шего
шему
шего
ший
шим
шем
шая
шей
шей
шую
шей
шею
шей
шее
шего
шему
шее
шим
шем
шие
ших
шим
ших
шие
шими
ших]
[ большой
большого
большому
большого
большой
большим
большом
большая
большой
большой
большую
большой
большою
большой
большое
большого
большому
большое
большим
большом
большие
больших
большим
больших
большие
большими
больших
велик
велика
велико
велики
больше
по больше
наи больший
наи большего
наи большему
наи большего
наи больший
наи большим
наи большем
наи большая
наи большей
наи большей
наи большую
наи большей
наи большею
наи большей
наи большее
наи большего
наи большему
наи большее
наи большим
наи большем
наи большие
наи больших
наи большим
наи больших
наи большие
наи большими
наи больших]
[ вероятный
вероятного
вероятному
вероятного
вероятный
вероятным
вероятном
вероятная
вероятной
вероятной
вероятную
вероятной
вероятною
вероятной
вероятное
вероятного
вероятному
вероятное
вероятным
вероятном
вероятные
вероятных
вероятным
вероятных
вероятные
вероятными
вероятных
вероятен
вероятна
вероятно
вероятны
вероятнее
вероятней
по вероятнее
по вероятней
вероятнейший
наи невероятнейший
вероятнейшего
наи невероятнейшего
вероятнейшему
наи невероятнейшему
вероятнейшего
наи невероятнейшего
вероятнейший
наи невероятнейший
вероятнейшим
наи невероятнейшим
вероятнейшем
наи невероятнейшем
вероятнейшая
наи невероятнейшая
вероятнейшей
наи невероятнейшей
вероятнейшей
наи невероятнейшей
вероятнейшую
наи невероятнейшую
вероятнейшей
вероятнейшею
наи невероятнейшей
наи невероятнейшею
вероятнейшей
наи невероятнейшей
вероятнейшее
наи невероятнейшее
вероятнейшего
наи невероятнейшего
вероятнейшему
наи невероятнейшему
вероятнейшее
наи невероятнейшее
вероятнейшим
наи невероятнейшим
вероятнейшем
наи невероятнейшем
вероятнейшие
наи невероятнейшие
вероятнейших
наи невероятнейших
вероятнейшим
наи невероятнейшим
вероятнейших
наи невероятнейших
вероятнейшие
наи невероятнейшие
вероятнейшими
наи невероятнейшими
вероятнейших
наи невероятнейших]
[ аленький
аленького
аленькому
аленького
аленький
аленьким
аленьком
аленькая
аленькой
аленькой
аленькую
аленькой
аленькою
аленькой
аленькое
аленького
аленькому
аленькое
аленьким
аленьком
аленькие
аленьких
аленьким
аленьких
аленькие
аленькими
аленьких
ал
ала
ало
алы
еньше
по еньше
алейший
наи еньший
алейшего
наи еньшего
алейшему
наи еньшему
алейшего
наи еньшего
алейший
наи еньший
алейшим
наи еньшим
алейшем
наи еньшем
алейшая
наи еньшая
алейшей
наи еньшей
алейшей
наи еньшей
алейшую
наи еньшую
алейшей
алейшею
наи еньшей
наи еньшею
алейшей
наи еньшей
алейшее
наи еньшее
алейшего
наи еньшего
алейшему
наи еньшему
алейшее
наи еньшее
алейшим
наи еньшим
алейшем
наи еньшем
алейшие
наи еньшие
алейших
наи еньших
алейшим
наи еньшим
алейших
наи еньших
алейшие
наи еньшие
алейшими
наи еньшими
алейших
наи еньших]
[ ьный
ьного
ьному
ьного
ьный
ьным
ьном
ьная
ьной
ьной
ьную
ьной
ьною
ьной
ьное
ьного
ьному
ьное
ьным
ьном
ьные
ьных
ьным
ьных
ьные
ьными
ьных
ен
ьна
ьно
ьны
ьны
ьнее
ьней
по ьнее
по ьней
наи ьнейший
наи ьнейшего
наи ьнейшему
наи ьнейшего
наи ьнейший
наи ьнейшим
наи ьнейшем
наи ьнейшая
наи ьнейшей
наи ьнейшей
наи ьнейшую
наи ьнейшей
наи ьнейшею
наи ьнейшей
наи ьнейшее
наи ьнейшего
наи ьнейшему
наи ьнейшее
наи ьнейшим
наи ьнейшем
наи ьнейшие
наи ьнейших
наи ьнейшим
наи ьнейших
наи ьнейшие
наи ьнейшими
наи ьнейших]

View File

@ -15,12 +15,12 @@
*/ */
package org.apache.lucene.morphology.english; package org.apache.lucene.morphology.english;
import org.apache.lucene.morphology.Morphology; import org.apache.lucene.morphology.MorphologyImpl;
import java.io.IOException; import java.io.IOException;
public class EnglishMorphology extends Morphology { public class EnglishMorphology extends MorphologyImpl {
public EnglishMorphology() throws IOException { public EnglishMorphology() throws IOException {
super(EnglishLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder()); super(EnglishLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder());

View File

@ -23,7 +23,7 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
public class LuceneMorphology extends MorphologyWithPrefix { public class LuceneMorphology extends MorphologyImpl {
public LuceneMorphology(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException { public LuceneMorphology(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException {
super(fileName, decoderEncoder); super(fileName, decoderEncoder);
@ -33,13 +33,15 @@ public class LuceneMorphology extends MorphologyWithPrefix {
super(inputStream, decoderEncoder); super(inputStream, decoderEncoder);
} }
public LuceneMorphology(InputStream morphFormInputStream, InputStream prefixesInputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
super(morphFormInputStream, prefixesInputStream, decoderEncoder);
}
@Override @Override
protected String createForm(String form, String grammaInfo) { public List<String> getMorhInfo(String s) {
return form; ArrayList<String> result = new ArrayList<String>();
int[] ints = decoderEncoder.encodeToArray(revertWord(s));
int ruleId = findRuleId(ints);
for (Heuristic h : rules[rulesId[ruleId]]) {
result.add(h.transofrmWord(s));
}
return result;
} }
protected void readRules(BufferedReader bufferedReader) throws IOException { protected void readRules(BufferedReader bufferedReader) throws IOException {

View File

@ -15,200 +15,11 @@
*/ */
package org.apache.lucene.morphology; package org.apache.lucene.morphology;
import java.io.*;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List; import java.util.List;
public class Morphology { public interface Morphology {
protected int[][] separators;
protected short[] rulesId;
protected Heuristic[][] rules;
protected String[] grammaInfo;
protected LetterDecoderEncoder decoderEncoder;
List<String> getMorhInfo(String s);
public Morphology(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException {
readFromFile(fileName);
this.decoderEncoder = decoderEncoder;
}
public Morphology(InputStream inputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
readFromInputStream(inputStream);
this.decoderEncoder = decoderEncoder;
}
public Morphology(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) {
this.separators = separators;
this.rulesId = rulesId;
this.rules = rules;
this.grammaInfo = grammaInfo;
}
public int[][] getSeparators() {
return separators;
}
public short[] getRulesId() {
return rulesId;
}
public Heuristic[][] getRules() {
return rules;
}
public String[] getGrammaInfo() {
return grammaInfo;
}
public List<String> getMorhInfo(String s) {
ArrayList<String> result = new ArrayList<String>();
int[] ints = decoderEncoder.encodeToArray(revertWord(s));
int ruleId = findRuleId(ints);
for (Heuristic h : rules[rulesId[ruleId]]) {
result.add(createForm(h.transofrmWord(s),grammaInfo[h.getFormMorphInfo()]));
}
return result;
}
protected String createForm(String form,String grammaInfo){
return form+"|"+grammaInfo;
}
protected int findRuleId(int[] ints) {
int low = 0;
int high = separators.length - 1;
int mid = 0;
while (low <= high) {
mid = (low + high) >>> 1;
int[] midVal = separators[mid];
int comResult = compareToInts(ints, midVal);
if (comResult > 0)
low = mid + 1;
else if (comResult < 0)
high = mid - 1;
else
break;
}
if (compareToInts(ints, separators[mid]) >= 0) {
return mid;
} else {
return mid - 1;
}
} }
private int compareToInts(int[] i1, int[] i2) {
int minLength = Math.min(i1.length, i2.length);
for (int i = 0; i < minLength; i++) {
int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1);
if (i3 != 0) return i3;
}
return i1.length - i2.length;
}
public void writeToFile(String fileName) throws IOException {
OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8");
writer.write(separators.length + "\n");
for (int[] i : separators) {
writer.write(i.length + "\n");
for (int j : i) {
writer.write(j + "\n");
}
}
for (short i : rulesId) {
writer.write(i + "\n");
}
writer.write(rules.length + "\n");
for (Heuristic[] heuristics : rules) {
writer.write(heuristics.length + "\n");
for (Heuristic heuristic : heuristics) {
writer.write(heuristic.toString() + "\n");
}
}
writer.write(grammaInfo.length + "\n");
for (String s : grammaInfo) {
writer.write(s + "\n");
}
writer.close();
}
public void readFromFile(String fileName) throws IOException {
FileInputStream inputStream = new FileInputStream(fileName);
readFromInputStream(inputStream);
}
private void readFromInputStream(InputStream inputStream) throws IOException {
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
String s = bufferedReader.readLine();
Integer amount = Integer.valueOf(s);
readSeparators(bufferedReader, amount);
readRulesId(bufferedReader, amount);
readRules(bufferedReader);
readGrammaInfo(bufferedReader);
bufferedReader.close();
}
private void readGrammaInfo(BufferedReader bufferedReader) throws IOException {
String s;
Integer amount;
s = bufferedReader.readLine();
amount = Integer.valueOf(s);
grammaInfo = new String[amount];
for (int i = 0; i < amount; i++) {
grammaInfo[i] = bufferedReader.readLine();
}
}
protected void readRules(BufferedReader bufferedReader) throws IOException {
String s;
Integer amount;
s = bufferedReader.readLine();
amount = Integer.valueOf(s);
rules = new Heuristic[amount][];
for (int i = 0; i < amount; i++) {
String s1 = bufferedReader.readLine();
Integer ruleLenght = Integer.valueOf(s1);
rules[i] = new Heuristic[ruleLenght];
for (int j = 0; j < ruleLenght; j++) {
rules[i][j] = new Heuristic(bufferedReader.readLine());
}
}
}
private void readRulesId(BufferedReader bufferedReader, Integer amount) throws IOException {
rulesId = new short[amount];
for (int i = 0; i < amount; i++) {
String s1 = bufferedReader.readLine();
rulesId[i] = Short.valueOf(s1);
}
}
private void readSeparators(BufferedReader bufferedReader, Integer amount) throws IOException {
HashSet intetger = new HashSet<Integer>();
separators = new int[amount][];
for (int i = 0; i < amount; i++) {
String s1 = bufferedReader.readLine();
Integer wordLenght = Integer.valueOf(s1);
separators[i] = new int[wordLenght];
for (int j = 0; j < wordLenght; j++) {
separators[i][j] = Integer.valueOf(bufferedReader.readLine());
}
intetger.add(separators[i][0]);
}
}
protected String revertWord(String s) {
String result = "";
for (int i = 1; i <= s.length(); i++) {
result += s.charAt(s.length() - i);
}
return result;
}
}

View File

@ -0,0 +1,210 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology;
import java.io.*;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
/**
 * Array-backed implementation of {@link Morphology}.
 *
 * <p>A word is reversed, encoded to an {@code int[]} and located in {@link #separators} by binary
 * search; the heuristics attached to the located entry produce the result strings. The whole
 * model can be written to and read from a line-oriented UTF-8 text file.
 */
public class MorphologyImpl implements Morphology {
    /** Encoded words searched by {@link #findRuleId(int[])}; expected to be in ascending order. */
    protected int[][] separators;
    /** For each entry of {@link #separators}, the row of {@link #rules} to apply. */
    protected short[] rulesId;
    protected Heuristic[][] rules;
    /** Grammar descriptions, indexed by {@code Heuristic.getFormMorphInfo()}. */
    protected String[] grammaInfo;
    protected LetterDecoderEncoder decoderEncoder;

    /**
     * Loads the model from a file.
     *
     * @param fileName       path of a model file in the format produced by {@link #writeToFile(String)}
     * @param decoderEncoder encoder used to turn words into {@code int[]} keys
     * @throws IOException if the file cannot be read
     */
    public MorphologyImpl(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException {
        readFromFile(fileName);
        this.decoderEncoder = decoderEncoder;
    }

    /**
     * Loads the model from a stream; the stream is closed afterwards.
     *
     * @param inputStream    model data in the format produced by {@link #writeToFile(String)}
     * @param decoderEncoder encoder used to turn words into {@code int[]} keys
     * @throws IOException if the stream cannot be read
     */
    public MorphologyImpl(InputStream inputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
        readFromInputStream(inputStream);
        this.decoderEncoder = decoderEncoder;
    }

    /**
     * Wraps already-built tables. No decoder/encoder is set by this constructor, so
     * {@link #getMorhInfo(String)} cannot be used on the resulting instance.
     */
    public MorphologyImpl(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) {
        this.separators = separators;
        this.rulesId = rulesId;
        this.rules = rules;
        this.grammaInfo = grammaInfo;
    }

    public int[][] getSeparators() {
        return separators;
    }

    public short[] getRulesId() {
        return rulesId;
    }

    public Heuristic[][] getRules() {
        return rules;
    }

    public String[] getGrammaInfo() {
        return grammaInfo;
    }

    /**
     * Returns one {@code "<transformed word>|<grammar info>"} string per heuristic that applies
     * to {@code s}.
     *
     * @param s the word to analyse
     * @return the result strings, in heuristic order
     */
    public List<String> getMorhInfo(String s) {
        ArrayList<String> result = new ArrayList<String>();
        int[] ints = decoderEncoder.encodeToArray(revertWord(s));
        int ruleId = findRuleId(ints);
        for (Heuristic h : rules[rulesId[ruleId]]) {
            result.add(h.transofrmWord(s) + "|" + grammaInfo[h.getFormMorphInfo()]);
        }
        return result;
    }

    /**
     * Binary search for the greatest separator that is less than or equal to {@code ints}.
     *
     * @param ints encoded (reversed) word
     * @return index into {@link #separators}, or {@code -1} when {@code ints} sorts before every
     *         separator; callers in this class use the result as an array index without checking
     */
    protected int findRuleId(int[] ints) {
        int low = 0;
        int high = separators.length - 1;
        int mid = 0;
        while (low <= high) {
            mid = (low + high) >>> 1;
            int[] midVal = separators[mid];
            int comResult = compareToInts(ints, midVal);
            if (comResult > 0) {
                low = mid + 1;
            } else if (comResult < 0) {
                high = mid - 1;
            } else {
                break;
            }
        }
        // mid is the last probed position: it is either the floor entry or the one just above it.
        if (compareToInts(ints, separators[mid]) >= 0) {
            return mid;
        } else {
            return mid - 1;
        }
    }

    /** Lexicographic comparison of two int arrays; a proper prefix sorts first. */
    private int compareToInts(int[] i1, int[] i2) {
        int minLength = Math.min(i1.length, i2.length);
        for (int i = 0; i < minLength; i++) {
            int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1);
            if (i3 != 0) {
                return i3;
            }
        }
        return i1.length - i2.length;
    }

    /**
     * Writes the model as UTF-8 text, one value per line: separators (count, then length and
     * values of each), rule ids, rules (count, then length and heuristics of each row) and the
     * grammar info (count, then entries).
     *
     * @param fileName path of the file to create or overwrite
     * @throws IOException if the file cannot be written
     */
    public void writeToFile(String fileName) throws IOException {
        OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8");
        try {
            writer.write(separators.length + "\n");
            for (int[] i : separators) {
                writer.write(i.length + "\n");
                for (int j : i) {
                    writer.write(j + "\n");
                }
            }
            for (short i : rulesId) {
                writer.write(i + "\n");
            }
            writer.write(rules.length + "\n");
            for (Heuristic[] heuristics : rules) {
                writer.write(heuristics.length + "\n");
                for (Heuristic heuristic : heuristics) {
                    writer.write(heuristic.toString() + "\n");
                }
            }
            writer.write(grammaInfo.length + "\n");
            for (String s : grammaInfo) {
                writer.write(s + "\n");
            }
        } finally {
            // Close even when a write fails, so the file handle is not leaked.
            writer.close();
        }
    }

    /**
     * Replaces the tables of this instance with the content of {@code fileName}.
     *
     * @throws IOException if the file cannot be read
     */
    public void readFromFile(String fileName) throws IOException {
        readFromInputStream(new FileInputStream(fileName));
    }

    /** Reads all four sections in file order and always closes the stream. */
    private void readFromInputStream(InputStream inputStream) throws IOException {
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
        try {
            // The first line is the number of separators, which is also the number of rule ids.
            int amount = Integer.parseInt(bufferedReader.readLine());
            readSeparators(bufferedReader, amount);
            readRulesId(bufferedReader, amount);
            readRules(bufferedReader);
            readGrammaInfo(bufferedReader);
        } finally {
            // Closing the reader closes the underlying stream, also on a parse or read failure.
            bufferedReader.close();
        }
    }

    /** Reads the grammar info section: a count followed by that many lines. */
    private void readGrammaInfo(BufferedReader bufferedReader) throws IOException {
        int amount = Integer.parseInt(bufferedReader.readLine());
        grammaInfo = new String[amount];
        for (int i = 0; i < amount; i++) {
            grammaInfo[i] = bufferedReader.readLine();
        }
    }

    /**
     * Reads the rules section: a row count, then for each row its length followed by one
     * heuristic per line.
     */
    protected void readRules(BufferedReader bufferedReader) throws IOException {
        int amount = Integer.parseInt(bufferedReader.readLine());
        rules = new Heuristic[amount][];
        for (int i = 0; i < amount; i++) {
            int ruleLenght = Integer.parseInt(bufferedReader.readLine());
            rules[i] = new Heuristic[ruleLenght];
            for (int j = 0; j < ruleLenght; j++) {
                rules[i][j] = new Heuristic(bufferedReader.readLine());
            }
        }
    }

    /** Reads {@code amount} rule ids, one per line. */
    private void readRulesId(BufferedReader bufferedReader, int amount) throws IOException {
        rulesId = new short[amount];
        for (int i = 0; i < amount; i++) {
            rulesId[i] = Short.parseShort(bufferedReader.readLine());
        }
    }

    /** Reads {@code amount} separators; each is a length followed by that many int lines. */
    private void readSeparators(BufferedReader bufferedReader, int amount) throws IOException {
        separators = new int[amount][];
        for (int i = 0; i < amount; i++) {
            int wordLenght = Integer.parseInt(bufferedReader.readLine());
            separators[i] = new int[wordLenght];
            for (int j = 0; j < wordLenght; j++) {
                separators[i][j] = Integer.parseInt(bufferedReader.readLine());
            }
        }
    }

    /**
     * Returns the characters of {@code s} in reverse order.
     *
     * @param s the word to reverse
     * @return the reversed word
     */
    protected String revertWord(String s) {
        // Reversed char by char on purpose: StringBuilder.reverse() keeps surrogate pairs in
        // order and would therefore give a different result for supplementary characters.
        StringBuilder reversed = new StringBuilder(s.length());
        for (int i = s.length() - 1; i >= 0; i--) {
            reversed.append(s.charAt(i));
        }
        return reversed.toString();
    }
}

View File

@ -1,96 +0,0 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology;
import java.io.IOException;
import java.io.InputStream;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.*;
/**
 * {@link Morphology} that can additionally strip a known prefix from a word before looking it up.
 *
 * <p>Prefix rules are indexed by the two-character string "first letter + last letter", the same
 * value that {@code PrefixRule.getHashString()} produces, so at most one rule is kept per such
 * pair.
 */
public class MorphologyWithPrefix extends Morphology {
    private final Map<String, PrefixRule> prefixRuleMap = new HashMap<String, PrefixRule>();

    public MorphologyWithPrefix(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException {
        super(fileName, decoderEncoder);
    }

    public MorphologyWithPrefix(InputStream morphFormInputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
        super(morphFormInputStream, decoderEncoder);
    }

    /**
     * Loads the morphology and then the prefix rules; both streams are closed.
     *
     * @param morphFormInputStream morphology model data
     * @param prefixesInputStream  prefix rules as UTF-8 text
     * @param decoderEncoder       encoder used to turn words into lookup keys
     * @throws IOException if either stream cannot be read
     */
    public MorphologyWithPrefix(InputStream morphFormInputStream, InputStream prefixesInputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
        super(morphFormInputStream, decoderEncoder);
        readPrefixes(prefixesInputStream);
    }

    public MorphologyWithPrefix(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) {
        super(separators, rulesId, rules, grammaInfo);
    }

    /** Reads the rule count followed by that many rules and always closes the stream. */
    private void readPrefixes(InputStream inputStream) throws IOException {
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
        try {
            int prefixAmount = Integer.parseInt(bufferedReader.readLine());
            for (int i = 0; i < prefixAmount; i++) {
                PrefixRule prefixRule = readPrefix(bufferedReader);
                // A later rule with the same hash string replaces an earlier one.
                prefixRuleMap.put(prefixRule.getHashString(), prefixRule);
            }
        } finally {
            // Close even when parsing fails, so the stream is not leaked.
            bufferedReader.close();
        }
    }

    /** Reads one rule: prefix line, last-letter line, form count, then one form per line. */
    private PrefixRule readPrefix(BufferedReader bufferedReader) throws IOException {
        PrefixRule prefixRule = new PrefixRule();
        prefixRule.setPrefix(bufferedReader.readLine());
        prefixRule.setLastLetter(bufferedReader.readLine().charAt(0));
        HashSet<Short> morph = new HashSet<Short>();
        int formAmount = Integer.parseInt(bufferedReader.readLine());
        for (int i = 0; i < formAmount; i++) {
            morph.add(Short.valueOf(bufferedReader.readLine()));
        }
        prefixRule.setForms(morph);
        return prefixRule;
    }

    /**
     * Looks the word up with its prefix removed when a prefix rule matches, and falls back to the
     * plain lookup otherwise.
     *
     * <p>The fallback is used when no prefix rules are loaded, the word is shorter than four
     * characters, no rule exists for its first/last letter pair, the word does not start with
     * the rule's prefix, or no heuristic of the stripped word has a form listed in the rule.
     *
     * @param s the word to analyse
     * @return the forms created for the stripped word, or the result of the plain lookup
     */
    @Override
    public List<String> getMorhInfo(String s) {
        if (prefixRuleMap.size() == 0 || s.length() < 4) {
            return super.getMorhInfo(s);
        }
        String ruleIndex = "" + s.charAt(0) + s.charAt(s.length() - 1);
        PrefixRule prefixRule = prefixRuleMap.get(ruleIndex);
        if (prefixRule == null) {
            return super.getMorhInfo(s);
        }
        if (!s.startsWith(prefixRule.getPrefix())) {
            return super.getMorhInfo(s);
        }
        String sWithoutPrefix = s.substring(prefixRule.getPrefix().length());
        int[] ints = decoderEncoder.encodeToArray(revertWord(sWithoutPrefix));
        int ruleId = findRuleId(ints);
        ArrayList<String> result = new ArrayList<String>();
        for (Heuristic h : rules[rulesId[ruleId]]) {
            // NOTE(review): forms is a HashSet<Short>; this only matches if getFormMorphInfo()
            // returns short/Short (an int would be boxed to Integer and never match) - confirm.
            if (prefixRule.getForms().contains(h.getFormMorphInfo())) {
                result.add(createForm(h.transofrmWord(sWithoutPrefix), "pr"));
            }
        }
        return result.size() > 0 ? result : super.getMorhInfo(s);
    }
}

View File

@ -1,76 +0,0 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology;
import java.io.Serializable;
import java.util.HashSet;
public class PrefixRule implements Serializable {
private Character lastLetter;
private String prefix;
private HashSet<Short> forms;
public Character getLastLetter() {
return lastLetter;
}
public void setLastLetter(Character lastLetter) {
this.lastLetter = lastLetter;
}
public String getPrefix() {
return prefix;
}
public void setPrefix(String prefix) {
this.prefix = prefix;
}
public HashSet<Short> getForms() {
return forms;
}
public void setForms(HashSet<Short> forms) {
this.forms = forms;
}
public String getHashString() {
return "" + prefix.charAt(0) + lastLetter;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
PrefixRule that = (PrefixRule) o;
if (forms != null ? !forms.equals(that.forms) : that.forms != null) return false;
if (lastLetter != null ? !lastLetter.equals(that.lastLetter) : that.lastLetter != null) return false;
if (prefix != null ? !prefix.equals(that.prefix) : that.prefix != null) return false;
return true;
}
@Override
public int hashCode() {
int result = lastLetter != null ? lastLetter.hashCode() : 0;
result = 31 * result + (prefix != null ? prefix.hashCode() : 0);
result = 31 * result + (forms != null ? forms.hashCode() : 0);
return result;
}
}

View File

@ -22,6 +22,6 @@ import java.io.IOException;
public class RussianLuceneMorphology extends LuceneMorphology { public class RussianLuceneMorphology extends LuceneMorphology {
public RussianLuceneMorphology() throws IOException { public RussianLuceneMorphology() throws IOException {
super(RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"),RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/prefixes.info"), new RussianLetterDecoderEncoder()); super(RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder());
} }
} }

View File

@ -15,11 +15,11 @@
*/ */
package org.apache.lucene.morphology.russian; package org.apache.lucene.morphology.russian;
import org.apache.lucene.morphology.Morphology; import org.apache.lucene.morphology.MorphologyImpl;
import java.io.IOException; import java.io.IOException;
public class RussianMorphology extends Morphology { public class RussianMorphology extends MorphologyImpl {
public RussianMorphology() throws IOException { public RussianMorphology() throws IOException {
super(RussianMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder()); super(RussianMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder());

View File

@ -1,60 +0,0 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.russian;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import java.io.IOException;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.HashSet;
/**
 * Ad-hoc timing harness: runs a book file through {@code RussianAnalayzer}
 * twice and prints the wall-clock time of the second pass in milliseconds.
 *
 * <p>The first pass is not timed (presumably a warm-up run).
 *
 * <p>Author: akuznetsov, created 31.10.2009.
 */
public class TestSpeed {
    /** Book processed when no path is supplied on the command line. */
    private static final String DEFAULT_BOOK = "C:/tmp/_Aleksandr_Suhov_Tanets_na_raskalennyih_uglyah1.fb2";

    /**
     * Processes the book twice and reports the duration of the second run.
     *
     * @param args optional; {@code args[0]} overrides the default book path
     * @throws IOException if the book cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        String bookName = (args != null && args.length > 0) ? args[0] : DEFAULT_BOOK;
        RussianAnalayzer russianAnalayzer = new RussianAnalayzer();

        // Untimed first pass.
        bookProccess(russianAnalayzer, bookName);

        long start = System.currentTimeMillis();
        bookProccess(russianAnalayzer, bookName);
        System.out.println("Done in " + (System.currentTimeMillis() - start));
    }

    /**
     * Reads the given file as UTF-8 and pulls every token out of the
     * analyzer's token stream, discarding the tokens.
     *
     * @param russianAnalayzer analyzer used to build the token stream
     * @param bookName         path of the file to analyze
     * @throws IOException if the file cannot be opened or read
     */
    private static void bookProccess(RussianAnalayzer russianAnalayzer, String bookName) throws IOException {
        FileInputStream inputStream = new FileInputStream(bookName);
        try {
            TokenStream tokenStream = russianAnalayzer.tokenStream(null, new InputStreamReader(inputStream, "UTF-8"));
            final Token reusableToken = new Token();
            // Drain the stream; a null token marks the end of input.
            while (tokenStream.next(reusableToken) != null) {
                // tokens are intentionally ignored - only throughput matters here
            }
        } finally {
            // Always release the file handle, even if analysis fails midway.
            inputStream.close();
        }
    }
}

View File

@ -1,96 +0,0 @@
11
наи
е
8
258
255
289
252
292
262
296
286
наи
и
2
263
297
наи
ю
4
250
249
283
284
по
й
5
250
251
248
247
269
по
е
3
255
252
269
наи
й
12
239
273
250
251
248
277
247
282
281
243
285
284
наи
о
6
274
253
276
287
242
240
наи
м
10
256
290
257
291
279
278
294
260
244
245
наи
х
6
259
293
261
295
264
298
наи
я
2
246
280
наи
у
4
275
254
288
241

View File

@ -33,7 +33,7 @@ public class RussianLuceneMorphTest {
@Before @Before
public void setUp() throws IOException { public void setUp() throws IOException {
luceneMorph = new RussianLuceneMorphology(); luceneMorph = new LuceneMorphology(this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder());
} }
@Test @Test

View File

@ -1,4 +1,3 @@
наилучший хороший
еду еда ехать еду еда ехать
тестов тест тестов тест
вина вино вина вина вино вина
@ -18,7 +17,3 @@
лучший хороший лучший хороший
на на на на
тест тест тесто тест тест тесто
спам спам
спама спам
наигранный наигранный
наивный наивный