rollback of wrong version of morphology, adding interface for morphology

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@88 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
alexander.a.kuznetsov
2009-11-17 14:03:59 +00:00
parent 16613c543b
commit 1273cf96ed
19 changed files with 263 additions and 1145 deletions

View File

@ -23,7 +23,7 @@ import java.util.ArrayList;
import java.util.List;
public class LuceneMorphology extends MorphologyWithPrefix {
public class LuceneMorphology extends MorphologyImpl {
public LuceneMorphology(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException {
super(fileName, decoderEncoder);
@ -33,13 +33,15 @@ public class LuceneMorphology extends MorphologyWithPrefix {
super(inputStream, decoderEncoder);
}
public LuceneMorphology(InputStream morphFormInputStream, InputStream prefixesInputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
super(morphFormInputStream, prefixesInputStream, decoderEncoder);
}
@Override
protected String createForm(String form, String grammaInfo) {
return form;
public List<String> getMorhInfo(String s) {
ArrayList<String> result = new ArrayList<String>();
int[] ints = decoderEncoder.encodeToArray(revertWord(s));
int ruleId = findRuleId(ints);
for (Heuristic h : rules[rulesId[ruleId]]) {
result.add(h.transofrmWord(s));
}
return result;
}
protected void readRules(BufferedReader bufferedReader) throws IOException {

View File

@ -1,214 +1,25 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology;
import java.io.*;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
/**
 * Morphology dictionary backed by four tables that are either loaded from a
 * text file / stream or supplied directly.
 *
 * <p>File layout, one value per line (see {@link #writeToFile(String)}):
 * <ol>
 * <li>the number of separators;</li>
 * <li>for every separator: its length followed by its int values;</li>
 * <li>one rule id per separator (no count line; the separator count is reused);</li>
 * <li>the number of rule groups, then for every group its size followed by
 * one {@code Heuristic} string per line;</li>
 * <li>the number of grammar-info strings followed by the strings.</li>
 * </ol>
 */
public class Morphology {
    protected int[][] separators;
    protected short[] rulesId;
    protected Heuristic[][] rules;
    protected String[] grammaInfo;
    protected LetterDecoderEncoder decoderEncoder;

    /**
     * Loads the tables from the named file.
     *
     * <p>The file is read before {@code decoderEncoder} is assigned, so an
     * overriding {@link #readRules(BufferedReader)} runs while that field is
     * still unset.
     *
     * @param fileName       path of the morphology file
     * @param decoderEncoder encoder used by {@link #getMorhInfo(String)}
     * @throws IOException if the file cannot be opened or read
     */
    public Morphology(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException {
        readFromFile(fileName);
        this.decoderEncoder = decoderEncoder;
    }

    /**
     * Loads the tables from the given stream; the stream is closed afterwards.
     *
     * @param inputStream    UTF-8 encoded morphology data
     * @param decoderEncoder encoder used by {@link #getMorhInfo(String)}
     * @throws IOException if the stream cannot be read
     */
    public Morphology(InputStream inputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
        readFromInputStream(inputStream);
        this.decoderEncoder = decoderEncoder;
    }

    /**
     * Wraps already built tables. The arrays are stored as given (no copies),
     * and {@code decoderEncoder} is left unset by this constructor.
     */
    public Morphology(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) {
        this.separators = separators;
        this.rulesId = rulesId;
        this.rules = rules;
        this.grammaInfo = grammaInfo;
    }

    /** @return the internal separator table (not a copy) */
    public int[][] getSeparators() {
        return separators;
    }

    /** @return the internal rule-id table (not a copy) */
    public short[] getRulesId() {
        return rulesId;
    }

    /** @return the internal rule groups (not a copy) */
    public Heuristic[][] getRules() {
        return rules;
    }

    /** @return the internal grammar-info strings (not a copy) */
    public String[] getGrammaInfo() {
        return grammaInfo;
    }

    /**
     * Applies every heuristic of the rule group selected for {@code s}.
     *
     * <p>The word is reversed and encoded, the matching separator is located
     * with {@link #findRuleId(int[])}, and each heuristic of the associated
     * group contributes one entry built by {@link #createForm(String, String)}.
     *
     * @param s the word to analyse
     * @return one entry per heuristic of the selected rule group
     */
    public List<String> getMorhInfo(String s) {
        List<String> result = new ArrayList<String>();
        int[] ints = decoderEncoder.encodeToArray(revertWord(s));
        int ruleId = findRuleId(ints);
        for (Heuristic h : rules[rulesId[ruleId]]) {
            result.add(createForm(h.transofrmWord(s), grammaInfo[h.getFormMorphInfo()]));
        }
        return result;
    }

    /**
     * Builds a result entry; this implementation joins the form and its
     * grammar info with {@code '|'}. Subclasses may override the format.
     */
    protected String createForm(String form, String grammaInfo) {
        return form + "|" + grammaInfo;
    }

    /**
     * Binary search over {@code separators}.
     *
     * <p>Assuming the table is sorted ascending under the ordering of
     * {@code compareToInts}, the result is the index of the last separator
     * that is not greater than {@code ints}. It is {@code -1} when
     * {@code ints} sorts before the first separator; callers index
     * {@code rulesId} with the result, so that case is not handled here.
     *
     * @param ints encoded word to look up
     * @return index into {@code separators} / {@code rulesId}, or {@code -1}
     */
    protected int findRuleId(int[] ints) {
        int low = 0;
        int high = separators.length - 1;
        int mid = 0;
        while (low <= high) {
            mid = (low + high) >>> 1;
            int[] midVal = separators[mid];
            int comResult = compareToInts(ints, midVal);
            if (comResult > 0) {
                low = mid + 1;
            } else if (comResult < 0) {
                high = mid - 1;
            } else {
                break;
            }
        }
        // mid is the last probed slot; step back one if the key sorts before it.
        if (compareToInts(ints, separators[mid]) >= 0) {
            return mid;
        } else {
            return mid - 1;
        }
    }

    /**
     * Lexicographic comparison of two int arrays; when one is a prefix of the
     * other, the shorter array sorts first.
     */
    private int compareToInts(int[] i1, int[] i2) {
        int minLength = Math.min(i1.length, i2.length);
        for (int i = 0; i < minLength; i++) {
            int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1);
            if (i3 != 0) {
                return i3;
            }
        }
        return i1.length - i2.length;
    }

    /**
     * Writes all tables to {@code fileName} as UTF-8 text, one value per line,
     * in the layout described on the class.
     *
     * @param fileName destination file
     * @throws IOException if the file cannot be created or written
     */
    public void writeToFile(String fileName) throws IOException {
        OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8");
        try {
            writer.write(separators.length + "\n");
            for (int[] separator : separators) {
                writer.write(separator.length + "\n");
                for (int value : separator) {
                    writer.write(value + "\n");
                }
            }
            // No count line here: the reader reuses the separator count.
            for (short id : rulesId) {
                writer.write(id + "\n");
            }
            writer.write(rules.length + "\n");
            for (Heuristic[] heuristics : rules) {
                writer.write(heuristics.length + "\n");
                for (Heuristic heuristic : heuristics) {
                    writer.write(heuristic.toString() + "\n");
                }
            }
            writer.write(grammaInfo.length + "\n");
            for (String s : grammaInfo) {
                writer.write(s + "\n");
            }
        } finally {
            // Release the file handle even when a write fails.
            writer.close();
        }
    }

    /**
     * Replaces all tables with the content of {@code fileName}.
     *
     * @param fileName morphology file to read
     * @throws IOException if the file cannot be opened or read
     */
    public void readFromFile(String fileName) throws IOException {
        FileInputStream inputStream = new FileInputStream(fileName);
        try {
            readFromInputStream(inputStream);
        } finally {
            // Guarantees the descriptor is released on every path.
            inputStream.close();
        }
    }

    /** Reads all four tables from a UTF-8 stream and closes the stream. */
    private void readFromInputStream(InputStream inputStream) throws IOException {
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
        try {
            int amount = Integer.parseInt(bufferedReader.readLine());
            readSeparators(bufferedReader, amount);
            readRulesId(bufferedReader, amount);
            readRules(bufferedReader);
            readGrammaInfo(bufferedReader);
        } finally {
            bufferedReader.close();
        }
    }

    /** Reads the grammar-info count followed by that many lines. */
    private void readGrammaInfo(BufferedReader bufferedReader) throws IOException {
        int amount = Integer.parseInt(bufferedReader.readLine());
        grammaInfo = new String[amount];
        for (int i = 0; i < amount; i++) {
            grammaInfo[i] = bufferedReader.readLine();
        }
    }

    /**
     * Reads the rule groups: a group count, then for each group its size and
     * one {@code Heuristic} line per entry. Invoked while loading, i.e. from
     * the file/stream constructors.
     *
     * @param bufferedReader reader positioned at the rule-group count
     * @throws IOException if reading fails
     */
    protected void readRules(BufferedReader bufferedReader) throws IOException {
        int amount = Integer.parseInt(bufferedReader.readLine());
        rules = new Heuristic[amount][];
        for (int i = 0; i < amount; i++) {
            int ruleLength = Integer.parseInt(bufferedReader.readLine());
            rules[i] = new Heuristic[ruleLength];
            for (int j = 0; j < ruleLength; j++) {
                rules[i][j] = new Heuristic(bufferedReader.readLine());
            }
        }
    }

    /** Reads {@code amount} rule ids, one per line. */
    private void readRulesId(BufferedReader bufferedReader, int amount) throws IOException {
        rulesId = new short[amount];
        for (int i = 0; i < amount; i++) {
            rulesId[i] = Short.parseShort(bufferedReader.readLine());
        }
    }

    /** Reads {@code amount} separators, each stored as a length line followed by its values. */
    private void readSeparators(BufferedReader bufferedReader, int amount) throws IOException {
        separators = new int[amount][];
        for (int i = 0; i < amount; i++) {
            int wordLength = Integer.parseInt(bufferedReader.readLine());
            separators[i] = new int[wordLength];
            for (int j = 0; j < wordLength; j++) {
                separators[i][j] = Integer.parseInt(bufferedReader.readLine());
            }
        }
    }

    /**
     * Returns {@code s} with its {@code char}s in reverse order.
     *
     * @param s the word to reverse
     * @return the reversed word
     */
    protected String revertWord(String s) {
        StringBuilder reversed = new StringBuilder(s.length());
        for (int i = s.length() - 1; i >= 0; i--) {
            reversed.append(s.charAt(i));
        }
        return reversed.toString();
    }
}
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology;
import java.util.List;
/**
 * Contract for looking up morphological information about a single word.
 */
public interface Morphology {
/**
 * Returns the analysis results for the given word.
 *
 * <p>The content of each string is defined by the implementation; for example
 * {@code MorphologyImpl} returns entries of the form {@code form|grammarInfo}.
 *
 * @param s the word to analyse
 * @return the list of results produced for {@code s}
 */
List<String> getMorhInfo(String s);
}

View File

@ -0,0 +1,210 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology;
import java.io.*;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
/**
 * {@link Morphology} implementation backed by four tables that are either
 * loaded from a text file / stream or supplied directly.
 *
 * <p>File layout, one value per line (see {@link #writeToFile(String)}):
 * <ol>
 * <li>the number of separators;</li>
 * <li>for every separator: its length followed by its int values;</li>
 * <li>one rule id per separator (no count line; the separator count is reused);</li>
 * <li>the number of rule groups, then for every group its size followed by
 * one {@code Heuristic} string per line;</li>
 * <li>the number of grammar-info strings followed by the strings.</li>
 * </ol>
 */
public class MorphologyImpl implements Morphology {
    protected int[][] separators;
    protected short[] rulesId;
    protected Heuristic[][] rules;
    protected String[] grammaInfo;
    protected LetterDecoderEncoder decoderEncoder;

    /**
     * Loads the tables from the named file.
     *
     * <p>The file is read before {@code decoderEncoder} is assigned, so an
     * overriding {@link #readRules(BufferedReader)} runs while that field is
     * still unset.
     *
     * @param fileName       path of the morphology file
     * @param decoderEncoder encoder used by {@link #getMorhInfo(String)}
     * @throws IOException if the file cannot be opened or read
     */
    public MorphologyImpl(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException {
        readFromFile(fileName);
        this.decoderEncoder = decoderEncoder;
    }

    /**
     * Loads the tables from the given stream; the stream is closed afterwards.
     *
     * @param inputStream    UTF-8 encoded morphology data
     * @param decoderEncoder encoder used by {@link #getMorhInfo(String)}
     * @throws IOException if the stream cannot be read
     */
    public MorphologyImpl(InputStream inputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
        readFromInputStream(inputStream);
        this.decoderEncoder = decoderEncoder;
    }

    /**
     * Wraps already built tables. The arrays are stored as given (no copies),
     * and {@code decoderEncoder} is left unset by this constructor.
     */
    public MorphologyImpl(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) {
        this.separators = separators;
        this.rulesId = rulesId;
        this.rules = rules;
        this.grammaInfo = grammaInfo;
    }

    /** @return the internal separator table (not a copy) */
    public int[][] getSeparators() {
        return separators;
    }

    /** @return the internal rule-id table (not a copy) */
    public short[] getRulesId() {
        return rulesId;
    }

    /** @return the internal rule groups (not a copy) */
    public Heuristic[][] getRules() {
        return rules;
    }

    /** @return the internal grammar-info strings (not a copy) */
    public String[] getGrammaInfo() {
        return grammaInfo;
    }

    /**
     * Applies every heuristic of the rule group selected for {@code s}.
     *
     * <p>The word is reversed and encoded, the matching separator is located
     * with {@link #findRuleId(int[])}, and each heuristic of the associated
     * group contributes one entry of the form {@code transformedWord|grammarInfo}.
     *
     * @param s the word to analyse
     * @return one entry per heuristic of the selected rule group
     */
    public List<String> getMorhInfo(String s) {
        List<String> result = new ArrayList<String>();
        int[] ints = decoderEncoder.encodeToArray(revertWord(s));
        int ruleId = findRuleId(ints);
        for (Heuristic h : rules[rulesId[ruleId]]) {
            result.add(h.transofrmWord(s) + "|" + grammaInfo[h.getFormMorphInfo()]);
        }
        return result;
    }

    /**
     * Binary search over {@code separators}.
     *
     * <p>Assuming the table is sorted ascending under the ordering of
     * {@code compareToInts}, the result is the index of the last separator
     * that is not greater than {@code ints}. It is {@code -1} when
     * {@code ints} sorts before the first separator; callers index
     * {@code rulesId} with the result, so that case is not handled here.
     *
     * @param ints encoded word to look up
     * @return index into {@code separators} / {@code rulesId}, or {@code -1}
     */
    protected int findRuleId(int[] ints) {
        int low = 0;
        int high = separators.length - 1;
        int mid = 0;
        while (low <= high) {
            mid = (low + high) >>> 1;
            int[] midVal = separators[mid];
            int comResult = compareToInts(ints, midVal);
            if (comResult > 0) {
                low = mid + 1;
            } else if (comResult < 0) {
                high = mid - 1;
            } else {
                break;
            }
        }
        // mid is the last probed slot; step back one if the key sorts before it.
        if (compareToInts(ints, separators[mid]) >= 0) {
            return mid;
        } else {
            return mid - 1;
        }
    }

    /**
     * Lexicographic comparison of two int arrays; when one is a prefix of the
     * other, the shorter array sorts first.
     */
    private int compareToInts(int[] i1, int[] i2) {
        int minLength = Math.min(i1.length, i2.length);
        for (int i = 0; i < minLength; i++) {
            int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1);
            if (i3 != 0) {
                return i3;
            }
        }
        return i1.length - i2.length;
    }

    /**
     * Writes all tables to {@code fileName} as UTF-8 text, one value per line,
     * in the layout described on the class.
     *
     * @param fileName destination file
     * @throws IOException if the file cannot be created or written
     */
    public void writeToFile(String fileName) throws IOException {
        OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8");
        try {
            writer.write(separators.length + "\n");
            for (int[] separator : separators) {
                writer.write(separator.length + "\n");
                for (int value : separator) {
                    writer.write(value + "\n");
                }
            }
            // No count line here: the reader reuses the separator count.
            for (short id : rulesId) {
                writer.write(id + "\n");
            }
            writer.write(rules.length + "\n");
            for (Heuristic[] heuristics : rules) {
                writer.write(heuristics.length + "\n");
                for (Heuristic heuristic : heuristics) {
                    writer.write(heuristic.toString() + "\n");
                }
            }
            writer.write(grammaInfo.length + "\n");
            for (String s : grammaInfo) {
                writer.write(s + "\n");
            }
        } finally {
            // Release the file handle even when a write fails.
            writer.close();
        }
    }

    /**
     * Replaces all tables with the content of {@code fileName}.
     *
     * @param fileName morphology file to read
     * @throws IOException if the file cannot be opened or read
     */
    public void readFromFile(String fileName) throws IOException {
        FileInputStream inputStream = new FileInputStream(fileName);
        try {
            readFromInputStream(inputStream);
        } finally {
            // Guarantees the descriptor is released on every path.
            inputStream.close();
        }
    }

    /** Reads all four tables from a UTF-8 stream and closes the stream. */
    private void readFromInputStream(InputStream inputStream) throws IOException {
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
        try {
            int amount = Integer.parseInt(bufferedReader.readLine());
            readSeparators(bufferedReader, amount);
            readRulesId(bufferedReader, amount);
            readRules(bufferedReader);
            readGrammaInfo(bufferedReader);
        } finally {
            bufferedReader.close();
        }
    }

    /** Reads the grammar-info count followed by that many lines. */
    private void readGrammaInfo(BufferedReader bufferedReader) throws IOException {
        int amount = Integer.parseInt(bufferedReader.readLine());
        grammaInfo = new String[amount];
        for (int i = 0; i < amount; i++) {
            grammaInfo[i] = bufferedReader.readLine();
        }
    }

    /**
     * Reads the rule groups: a group count, then for each group its size and
     * one {@code Heuristic} line per entry. Invoked while loading, i.e. from
     * the file/stream constructors.
     *
     * @param bufferedReader reader positioned at the rule-group count
     * @throws IOException if reading fails
     */
    protected void readRules(BufferedReader bufferedReader) throws IOException {
        int amount = Integer.parseInt(bufferedReader.readLine());
        rules = new Heuristic[amount][];
        for (int i = 0; i < amount; i++) {
            int ruleLength = Integer.parseInt(bufferedReader.readLine());
            rules[i] = new Heuristic[ruleLength];
            for (int j = 0; j < ruleLength; j++) {
                rules[i][j] = new Heuristic(bufferedReader.readLine());
            }
        }
    }

    /** Reads {@code amount} rule ids, one per line. */
    private void readRulesId(BufferedReader bufferedReader, int amount) throws IOException {
        rulesId = new short[amount];
        for (int i = 0; i < amount; i++) {
            rulesId[i] = Short.parseShort(bufferedReader.readLine());
        }
    }

    /** Reads {@code amount} separators, each stored as a length line followed by its values. */
    private void readSeparators(BufferedReader bufferedReader, int amount) throws IOException {
        separators = new int[amount][];
        for (int i = 0; i < amount; i++) {
            int wordLength = Integer.parseInt(bufferedReader.readLine());
            separators[i] = new int[wordLength];
            for (int j = 0; j < wordLength; j++) {
                separators[i][j] = Integer.parseInt(bufferedReader.readLine());
            }
        }
    }

    /**
     * Returns {@code s} with its {@code char}s in reverse order.
     *
     * @param s the word to reverse
     * @return the reversed word
     */
    protected String revertWord(String s) {
        StringBuilder reversed = new StringBuilder(s.length());
        for (int i = s.length() - 1; i >= 0; i--) {
            reversed.append(s.charAt(i));
        }
        return reversed.toString();
    }
}

View File

@ -1,96 +0,0 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology;
import java.io.IOException;
import java.io.InputStream;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.*;
/**
 * {@code Morphology} that can additionally strip a known prefix before the
 * rule lookup.
 *
 * <p>Prefix rules are stored under a two-character key made of the first
 * character of the prefix and the expected last letter of the word (see
 * {@code PrefixRule.getHashString()}); a later rule with the same key replaces
 * an earlier one.
 */
public class MorphologyWithPrefix extends Morphology {
    private final Map<String, PrefixRule> prefixRuleMap = new HashMap<String, PrefixRule>();

    /**
     * Loads the morphology from a file; no prefix rules are loaded.
     *
     * @throws IOException if the file cannot be read
     */
    public MorphologyWithPrefix(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException {
        super(fileName, decoderEncoder);
    }

    /**
     * Loads the morphology from a stream; no prefix rules are loaded.
     *
     * @throws IOException if the stream cannot be read
     */
    public MorphologyWithPrefix(InputStream morphFormInputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
        super(morphFormInputStream, decoderEncoder);
    }

    /**
     * Loads the morphology and then the prefix rules; the prefix stream is
     * closed once it has been read.
     *
     * @param morphFormInputStream morphology data
     * @param prefixesInputStream  UTF-8 prefix rules in the layout read by {@code readPrefixes}
     * @param decoderEncoder       encoder used for the lookups
     * @throws IOException if either stream cannot be read
     */
    public MorphologyWithPrefix(InputStream morphFormInputStream, InputStream prefixesInputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
        super(morphFormInputStream, decoderEncoder);
        readPrefixes(prefixesInputStream);
    }

    /** Wraps already built tables; no prefix rules are loaded. */
    public MorphologyWithPrefix(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) {
        super(separators, rulesId, rules, grammaInfo);
    }

    /**
     * Reads the rule count followed by that many prefix rules and indexes
     * each rule by its hash string.
     */
    private void readPrefixes(InputStream inputStream) throws IOException {
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
        try {
            int prefixAmount = Integer.parseInt(bufferedReader.readLine());
            for (int i = 0; i < prefixAmount; i++) {
                PrefixRule prefixRule = readPrefix(bufferedReader);
                prefixRuleMap.put(prefixRule.getHashString(), prefixRule);
            }
        } finally {
            // Close the stream even when a line is missing or malformed.
            bufferedReader.close();
        }
    }

    /**
     * Reads one rule: the prefix line, a line whose first character is the
     * last letter, the number of forms, and one short form id per line.
     */
    private PrefixRule readPrefix(BufferedReader bufferedReader) throws IOException {
        PrefixRule prefixRule = new PrefixRule();
        prefixRule.setPrefix(bufferedReader.readLine());
        prefixRule.setLastLetter(bufferedReader.readLine().charAt(0));
        HashSet<Short> morph = new HashSet<Short>();
        int formAmount = Integer.parseInt(bufferedReader.readLine());
        for (int i = 0; i < formAmount; i++) {
            morph.add(Short.valueOf(bufferedReader.readLine()));
        }
        prefixRule.setForms(morph);
        return prefixRule;
    }

    /**
     * Looks up {@code s}, trying the prefix-stripped variant first.
     *
     * <p>The prefix path is used only when prefix rules are loaded, the word
     * has at least four characters, a rule exists for its first/last character
     * pair and the word starts with that rule's prefix. Heuristics whose form
     * id is listed in the rule then yield {@code createForm(form, "pr")}
     * entries. In every other case, and when the prefix path produces nothing,
     * the inherited lookup of the whole word is returned.
     *
     * @param s the word to analyse
     * @return the prefix-aware results, or the inherited results as fallback
     */
    @Override
    public List<String> getMorhInfo(String s) {
        if (prefixRuleMap.isEmpty() || s.length() < 4) {
            return super.getMorhInfo(s);
        }
        String ruleIndex = "" + s.charAt(0) + s.charAt(s.length() - 1);
        PrefixRule prefixRule = prefixRuleMap.get(ruleIndex);
        if (prefixRule == null || !s.startsWith(prefixRule.getPrefix())) {
            return super.getMorhInfo(s);
        }
        String sWithoutPrefix = s.substring(prefixRule.getPrefix().length());
        int[] ints = decoderEncoder.encodeToArray(revertWord(sWithoutPrefix));
        int ruleId = findRuleId(ints);
        List<String> result = new ArrayList<String>();
        for (Heuristic h : rules[rulesId[ruleId]]) {
            // NOTE(review): forms is a set of Short, so this lookup can only match
            // if getFormMorphInfo() yields a short/Short - confirm against Heuristic.
            if (prefixRule.getForms().contains(h.getFormMorphInfo())) {
                result.add(createForm(h.transofrmWord(sWithoutPrefix), "pr"));
            }
        }
        return result.isEmpty() ? super.getMorhInfo(s) : result;
    }
}

View File

@ -1,76 +0,0 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology;
import java.io.Serializable;
import java.util.HashSet;
public class PrefixRule implements Serializable {
private Character lastLetter;
private String prefix;
private HashSet<Short> forms;
public Character getLastLetter() {
return lastLetter;
}
public void setLastLetter(Character lastLetter) {
this.lastLetter = lastLetter;
}
public String getPrefix() {
return prefix;
}
public void setPrefix(String prefix) {
this.prefix = prefix;
}
public HashSet<Short> getForms() {
return forms;
}
public void setForms(HashSet<Short> forms) {
this.forms = forms;
}
public String getHashString() {
return "" + prefix.charAt(0) + lastLetter;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
PrefixRule that = (PrefixRule) o;
if (forms != null ? !forms.equals(that.forms) : that.forms != null) return false;
if (lastLetter != null ? !lastLetter.equals(that.lastLetter) : that.lastLetter != null) return false;
if (prefix != null ? !prefix.equals(that.prefix) : that.prefix != null) return false;
return true;
}
@Override
public int hashCode() {
int result = lastLetter != null ? lastLetter.hashCode() : 0;
result = 31 * result + (prefix != null ? prefix.hashCode() : 0);
result = 31 * result + (forms != null ? forms.hashCode() : 0);
return result;
}
}