working on prefix hypotheses

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@87 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
Alexander.A.Kuznetsov
2009-11-12 22:01:52 +00:00
parent 6246f020fd
commit 16613c543b
10 changed files with 395760 additions and 667245 deletions

View File

@ -23,7 +23,7 @@ import java.util.ArrayList;
import java.util.List;
public class LuceneMorphology extends Morphology {
public class LuceneMorphology extends MorphologyWithPrefix {
public LuceneMorphology(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException {
super(fileName, decoderEncoder);
@ -33,6 +33,10 @@ public class LuceneMorphology extends Morphology {
super(inputStream, decoderEncoder);
}
/**
 * Creates a morphology from a morph-form stream plus a separate prefixes stream.
 *
 * All three arguments are forwarded unchanged to the superclass constructor;
 * this class adds no processing of its own here.
 *
 * @param morphFormInputStream stream passed on as the morph-form source
 * @param prefixesInputStream  stream passed on as the prefix source
 * @param decoderEncoder       letter decoder/encoder passed on to the superclass
 * @throws IOException if the superclass constructor fails with an I/O error
 */
public LuceneMorphology(InputStream morphFormInputStream, InputStream prefixesInputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
super(morphFormInputStream, prefixesInputStream, decoderEncoder);
}
@Override
protected String createForm(String form, String grammaInfo) {
return form;

View File

@ -1,11 +1,25 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology;
import java.io.IOException;
import java.io.InputStream;
import java.util.Map;
import java.util.List;
import java.util.HashMap;
import java.util.ArrayList;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.*;
public class MorphologyWithPrefix extends Morphology {
@ -15,8 +29,38 @@ public class MorphologyWithPrefix extends Morphology {
super(fileName, decoderEncoder);
}
public MorphologyWithPrefix(InputStream inputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
super(inputStream, decoderEncoder);
public MorphologyWithPrefix(InputStream morphFormInputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
super(morphFormInputStream, decoderEncoder);
}
/**
 * Creates a morphology and additionally loads prefix rules.
 *
 * The morph-form stream and the decoder/encoder are handed to the superclass
 * constructor first; afterwards the prefix rules are read from
 * {@code prefixesInputStream} via {@code readPrefixes}.
 *
 * @param morphFormInputStream stream forwarded to the superclass constructor
 * @param prefixesInputStream  stream the prefix rules are read from
 * @param decoderEncoder       letter decoder/encoder forwarded to the superclass constructor
 * @throws IOException if the superclass constructor or the prefix reading fails with an I/O error
 */
public MorphologyWithPrefix(InputStream morphFormInputStream,InputStream prefixesInputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
super(morphFormInputStream, decoderEncoder);
// Prefix rules are loaded only after the superclass part has been constructed.
readPrefixes(prefixesInputStream);
}
/**
 * Reads all prefix rules from the given stream and registers them in {@code prefixRuleMap}.
 *
 * The stream is decoded as UTF-8. Its first line holds the number of rules; each rule is
 * then parsed by {@code readPrefix} and stored under the key returned by its
 * {@code getHashString()}. The reader (and with it the underlying stream) is closed in
 * every case, including when parsing fails part-way through.
 *
 * @param inputStream source of the prefix rules
 * @throws IOException           if the stream cannot be read or is empty
 * @throws NumberFormatException if the rule-count line is not a valid integer
 */
private void readPrefixes(InputStream inputStream) throws IOException {
    BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
    try {
        String countLine = bufferedReader.readLine();
        if (countLine == null) {
            // readLine() returns null at end of stream; report that explicitly
            // instead of letting parseInt fail on a null argument.
            throw new IOException("Prefix data is empty: expected the number of prefix rules");
        }
        int prefixAmount = Integer.parseInt(countLine);
        for (int i = 0; i < prefixAmount; i++) {
            PrefixRule prefixRule = readPrefix(bufferedReader);
            prefixRuleMap.put(prefixRule.getHashString(), prefixRule);
        }
    } finally {
        // Close even when a rule is malformed so the stream is never leaked.
        bufferedReader.close();
    }
}
/**
 * Parses a single prefix rule from the reader.
 *
 * The expected layout, one value per line, is: the prefix, a line whose first
 * character is the last letter, the number of forms, and then that many form ids
 * (each parsed as a {@code short}).
 *
 * @param bufferedReader reader positioned at the first line of a rule
 * @return the populated rule
 * @throws IOException           if reading fails, the data ends before the rule is complete,
 *                               or the last-letter line is empty
 * @throws NumberFormatException if the form count or a form id is not a valid number
 */
private PrefixRule readPrefix(BufferedReader bufferedReader) throws IOException {
    PrefixRule prefixRule = new PrefixRule();
    prefixRule.setPrefix(readRequiredLine(bufferedReader, "prefix"));

    String lastLetterLine = readRequiredLine(bufferedReader, "last letter");
    if (lastLetterLine.length() == 0) {
        // charAt(0) on an empty line would otherwise fail with an index error.
        throw new IOException("Malformed prefix rule: last letter line is empty");
    }
    prefixRule.setLastLetter(lastLetterLine.charAt(0));

    int formAmount = Integer.parseInt(readRequiredLine(bufferedReader, "form count"));
    HashSet<Short> morph = new HashSet<Short>();
    for (int i = 0; i < formAmount; i++) {
        morph.add(Short.valueOf(readRequiredLine(bufferedReader, "form id")));
    }
    prefixRule.setForms(morph);
    return prefixRule;
}

/**
 * Reads the next line, failing with a descriptive error when the data ends early.
 *
 * @param bufferedReader reader to take the line from
 * @param what           short description of the expected value, used in the error message
 * @return the line read, never {@code null}
 * @throws IOException if reading fails or the end of the stream has been reached
 */
private static String readRequiredLine(BufferedReader bufferedReader, String what) throws IOException {
    String line = bufferedReader.readLine();
    if (line == null) {
        throw new IOException("Unexpected end of prefix data while reading " + what);
    }
    return line;
}
public MorphologyWithPrefix(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) {
@ -25,7 +69,7 @@ public class MorphologyWithPrefix extends Morphology {
@Override
public List<String> getMorhInfo(String s) {
if (s.length() < 4) {
if (prefixRuleMap.size() == 0 || s.length() < 4) {
return super.getMorhInfo(s);
}
String ruleIndex = "" + s.charAt(0) + s.charAt(s.length() - 1);
@ -33,7 +77,7 @@ public class MorphologyWithPrefix extends Morphology {
if (prefixRule == null) {
return super.getMorhInfo(s);
}
if (s.startsWith(prefixRule.getPrefix())) {
if (!s.startsWith(prefixRule.getPrefix())) {
return super.getMorhInfo(s);
}
String sWithoutPrefix = s.substring(prefixRule.getPrefix().length());
@ -42,8 +86,8 @@ public class MorphologyWithPrefix extends Morphology {
int ruleId = findRuleId(ints);
ArrayList<String> result = new ArrayList<String>();
for (Heuristic h : rules[rulesId[ruleId]]) {
String morphInfo = grammaInfo[h.getFormMorphInfo()];
if(prefixRule.getForms().contains(morphInfo)){
//String morphInfo = grammaInfo[];
if(prefixRule.getForms().contains(h.getFormMorphInfo())){
result.add(createForm(h.transofrmWord(sWithoutPrefix),"pr"));
}
}

View File

@ -1,3 +1,18 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology;
import java.io.Serializable;
@ -7,7 +22,7 @@ import java.util.HashSet;
public class PrefixRule implements Serializable {
private Character lastLetter;
private String prefix;
private HashSet<String> forms;
private HashSet<Short> forms;
public Character getLastLetter() {
return lastLetter;
@ -25,11 +40,11 @@ public class PrefixRule implements Serializable {
this.prefix = prefix;
}
public HashSet<String> getForms() {
public HashSet<Short> getForms() {
return forms;
}
public void setForms(HashSet<String> forms) {
public void setForms(HashSet<Short> forms) {
this.forms = forms;
}