Working on prefix hypotheses
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@87 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
@ -23,7 +23,7 @@ import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class LuceneMorphology extends Morphology {
|
||||
public class LuceneMorphology extends MorphologyWithPrefix {
|
||||
|
||||
public LuceneMorphology(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException {
|
||||
super(fileName, decoderEncoder);
|
||||
@ -33,6 +33,10 @@ public class LuceneMorphology extends Morphology {
|
||||
super(inputStream, decoderEncoder);
|
||||
}
|
||||
|
||||
/**
 * Creates a morphology from two input streams by forwarding both streams and
 * the decoder/encoder unchanged to the matching superclass constructor.
 *
 * @param morphFormInputStream first stream handed to the superclass
 *                             (presumably the morphology data; confirm against the superclass)
 * @param prefixesInputStream  second stream handed to the superclass
 *                             (presumably the prefix rules; confirm against the superclass)
 * @param decoderEncoder       letter decoder/encoder handed to the superclass
 * @throws IOException propagated from the superclass constructor
 */
public LuceneMorphology(InputStream morphFormInputStream, InputStream prefixesInputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
|
||||
super(morphFormInputStream, prefixesInputStream, decoderEncoder);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String createForm(String form, String grammaInfo) {
|
||||
return form;
|
||||
|
@ -1,11 +1,25 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morphology;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Map;
|
||||
import java.util.List;
|
||||
import java.util.HashMap;
|
||||
import java.util.ArrayList;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
public class MorphologyWithPrefix extends Morphology {
|
||||
@ -15,8 +29,38 @@ public class MorphologyWithPrefix extends Morphology {
|
||||
super(fileName, decoderEncoder);
|
||||
}
|
||||
|
||||
public MorphologyWithPrefix(InputStream inputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
|
||||
super(inputStream, decoderEncoder);
|
||||
public MorphologyWithPrefix(InputStream morphFormInputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
|
||||
super(morphFormInputStream, decoderEncoder);
|
||||
}
|
||||
|
||||
/**
 * Creates a morphology with prefix support from two streams: the base
 * morphology is initialised by the superclass from
 * {@code morphFormInputStream}, then the prefix rules are loaded from
 * {@code prefixesInputStream} via {@code readPrefixes}.
 *
 * @param morphFormInputStream stream passed to the superclass constructor
 * @param prefixesInputStream  stream passed to {@code readPrefixes}
 * @param decoderEncoder       letter decoder/encoder passed to the superclass constructor
 * @throws IOException if the superclass constructor or {@code readPrefixes} fails
 */
public MorphologyWithPrefix(InputStream morphFormInputStream,InputStream prefixesInputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
|
||||
super(morphFormInputStream, decoderEncoder);
|
||||
readPrefixes(prefixesInputStream);
|
||||
}
|
||||
|
||||
private void readPrefixes(InputStream inputStream) throws IOException {
|
||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
|
||||
Integer prefixAmount = Integer.parseInt(bufferedReader.readLine());
|
||||
for(int i = 0; i < prefixAmount;i++){
|
||||
PrefixRule prefixRule = readPrefix(bufferedReader);
|
||||
prefixRuleMap.put(prefixRule.getHashString(),prefixRule);
|
||||
}
|
||||
bufferedReader.close();
|
||||
}
|
||||
|
||||
private PrefixRule readPrefix(BufferedReader bufferedReader) throws IOException {
|
||||
PrefixRule prefixRule = new PrefixRule();
|
||||
String s = bufferedReader.readLine();
|
||||
prefixRule.setPrefix(s);
|
||||
s = bufferedReader.readLine();
|
||||
prefixRule.setLastLetter(s.charAt(0));
|
||||
HashSet<Short> morph = new HashSet<Short>();
|
||||
int formAmount = Integer.valueOf(bufferedReader.readLine());
|
||||
for(int i = 0; i < formAmount; i++){
|
||||
morph.add(Short.valueOf(bufferedReader.readLine()));
|
||||
}
|
||||
prefixRule.setForms(morph);
|
||||
return prefixRule;
|
||||
}
|
||||
|
||||
public MorphologyWithPrefix(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) {
|
||||
@ -25,7 +69,7 @@ public class MorphologyWithPrefix extends Morphology {
|
||||
|
||||
@Override
|
||||
public List<String> getMorhInfo(String s) {
|
||||
if (s.length() < 4) {
|
||||
if (prefixRuleMap.size() == 0 || s.length() < 4) {
|
||||
return super.getMorhInfo(s);
|
||||
}
|
||||
String ruleIndex = "" + s.charAt(0) + s.charAt(s.length() - 1);
|
||||
@ -33,7 +77,7 @@ public class MorphologyWithPrefix extends Morphology {
|
||||
if (prefixRule == null) {
|
||||
return super.getMorhInfo(s);
|
||||
}
|
||||
if (s.startsWith(prefixRule.getPrefix())) {
|
||||
if (!s.startsWith(prefixRule.getPrefix())) {
|
||||
return super.getMorhInfo(s);
|
||||
}
|
||||
String sWithoutPrefix = s.substring(prefixRule.getPrefix().length());
|
||||
@ -42,8 +86,8 @@ public class MorphologyWithPrefix extends Morphology {
|
||||
int ruleId = findRuleId(ints);
|
||||
ArrayList<String> result = new ArrayList<String>();
|
||||
for (Heuristic h : rules[rulesId[ruleId]]) {
|
||||
String morphInfo = grammaInfo[h.getFormMorphInfo()];
|
||||
if(prefixRule.getForms().contains(morphInfo)){
|
||||
//String morphInfo = grammaInfo[];
|
||||
if(prefixRule.getForms().contains(h.getFormMorphInfo())){
|
||||
result.add(createForm(h.transofrmWord(sWithoutPrefix),"pr"));
|
||||
}
|
||||
}
|
||||
|
@ -1,3 +1,18 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morphology;
|
||||
|
||||
import java.io.Serializable;
|
||||
@ -7,7 +22,7 @@ import java.util.HashSet;
|
||||
public class PrefixRule implements Serializable {
|
||||
private Character lastLetter;
|
||||
private String prefix;
|
||||
private HashSet<String> forms;
|
||||
private HashSet<Short> forms;
|
||||
|
||||
public Character getLastLetter() {
|
||||
return lastLetter;
|
||||
@ -25,11 +40,11 @@ public class PrefixRule implements Serializable {
|
||||
this.prefix = prefix;
|
||||
}
|
||||
|
||||
public HashSet<String> getForms() {
|
||||
public HashSet<Short> getForms() {
|
||||
return forms;
|
||||
}
|
||||
|
||||
public void setForms(HashSet<String> forms) {
|
||||
public void setForms(HashSet<Short> forms) {
|
||||
this.forms = forms;
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user