working on prefix hypotheses

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@87 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
Alexander.A.Kuznetsov 2009-11-12 22:01:52 +00:00
parent 6246f020fd
commit 16613c543b
10 changed files with 395760 additions and 667245 deletions

View File

@ -1,21 +1,34 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.dictionary;
import org.apache.lucene.morphology.PrefixRule;
import java.util.*;
import java.io.IOException;
import java.io.BufferedReader;
import java.io.*;
public class PrefixesRulesBuilder extends DictonaryReader {
private GrammaReader grammaInfo;
private Map<FlexiaModel,Set<FlexiaModel>> rules = new HashMap<FlexiaModel,Set<FlexiaModel>>();
/**
 * Creates a builder for the dictionary file {@code fileName} by delegating to
 * the two-argument superclass constructor.
 *
 * NOTE(review): this constructor does not assign the {@code grammaInfo} field,
 * whereas the overload taking a file encoding does. Code in this class that
 * dereferences {@code grammaInfo} will presumably fail with a
 * NullPointerException on instances created here - confirm and either
 * initialise the field or remove this overload.
 *
 * @param fileName     dictionary file, passed to the superclass unchanged
 * @param ingnoredForm set of strings, passed to the superclass unchanged
 */
public PrefixesRulesBuilder(String fileName, Set<String> ingnoredForm) {
super(fileName, ingnoredForm);
}
public PrefixesRulesBuilder(String fileName, String fileEncoding, Set<String> ingnoredForm) {
public PrefixesRulesBuilder(String fileName, String fileEncoding, Set<String> ingnoredForm) throws IOException {
super(fileName, fileEncoding, ingnoredForm);
grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
}
@Override
@ -31,9 +44,10 @@ public class PrefixesRulesBuilder extends DictonaryReader {
PrefixRule prefixRule = new PrefixRule();
prefixRule.setPrefix(key.getPrefix());
prefixRule.setLastLetter(key.getSuffix().charAt(0));
HashSet<String> map = new HashSet<String>();
HashSet<Short> map = new HashSet<Short>();
for(FlexiaModel fm:rules.get(key)){
map.add(fm.getCode());
int gi = grammaInfo.getGrammInversIndex().get(fm.getCode());
map.add((short) gi);
}
prefixRule.setForms(map);
prefixRules.add(prefixRule);
@ -73,10 +87,10 @@ public class PrefixesRulesBuilder extends DictonaryReader {
private void testFlexia(List<FlexiaModel> models, FlexiaModel fm) {
for(FlexiaModel com:models){
if(com.getSuffix().equals(fm.getSuffix()) && com.getPrefix().length() == 0){
Set<FlexiaModel> models1 = rules.get(convert(fm));
Set<FlexiaModel> models1 = rules.get(convertForKey(fm));
if(models1 == null){
models1 = new HashSet<FlexiaModel>();
rules.put(convert(fm),models1);
rules.put(convertForKey(fm),models1);
}
models1.add(convert(com));
}
@ -85,8 +99,14 @@ public class PrefixesRulesBuilder extends DictonaryReader {
private FlexiaModel convert(FlexiaModel fm){
String suf = fm.getSuffix();
if(suf.length() == 1) System.out.println(fm);
return new FlexiaModel(fm.getCode(),""+ suf.charAt(suf.length()-1)+ (suf.length() > 1 ? suf.charAt(suf.length()-2) : ""),fm.getPrefix());
//if(suf.length() == 1) System.out.println(fm);
return new FlexiaModel(fm.getCode(),""+ suf.charAt(suf.length()-1),fm.getPrefix());
}
/**
 * Builds the model used as a key of the {@code rules} map: the code is fixed
 * to {@code "pr"}, the suffix is reduced to its last character and the prefix
 * is copied from {@code fm}.
 *
 * @param fm model whose suffix and prefix are read
 * @return a new model carrying only the key-relevant data
 */
private FlexiaModel convertForKey(FlexiaModel fm) {
    String suffix = fm.getSuffix();
    // Only the final character of the suffix takes part in the key.
    String lastLetter = String.valueOf(suffix.charAt(suffix.length() - 1));
    return new FlexiaModel("pr", lastLetter, fm.getPrefix());
}
protected void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
@ -97,4 +117,23 @@ public class PrefixesRulesBuilder extends DictonaryReader {
if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
}
/**
 * Writes the prefix rules returned by {@code getPrefixRules()} to
 * {@code fileName} as UTF-8 text.
 *
 * The first line holds the number of rules; each rule then follows in the
 * layout produced by {@code writePrefixRule}.
 *
 * @param fileName path of the file to create or overwrite
 * @throws IOException if the file cannot be opened or written
 */
public void savePrefixes(String fileName) throws IOException {
    OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8");
    try {
        List<PrefixRule> prefixRuleList = getPrefixRules();
        writer.write(prefixRuleList.size() + "\n");
        for (PrefixRule pr : prefixRuleList) {
            writePrefixRule(writer, pr);
        }
    } finally {
        // Close in finally so the file handle is released even when building
        // or writing the rules fails part-way through.
        writer.close();
    }
}
/**
 * Writes one prefix rule as newline-terminated lines: the prefix, the last
 * letter, the number of forms, then one form id per line (in the iteration
 * order of the rule's form set).
 *
 * @param writer destination; it is not closed by this method
 * @param pr     rule to serialise
 * @throws IOException if writing fails
 */
private void writePrefixRule(OutputStreamWriter writer, PrefixRule pr) throws IOException {
    // Header lines of the record.
    writer.write(pr.getPrefix() + "\n");
    writer.write(pr.getLastLetter() + "\n");

    // The form count comes first so a reader knows how many id lines follow.
    HashSet<Short> forms = pr.getForms();
    writer.write(forms.size() + "\n");
    Iterator<Short> formIds = forms.iterator();
    while (formIds.hasNext()) {
        writer.write(formIds.next() + "\n");
    }
}
}

View File

@ -24,17 +24,16 @@ import java.util.HashSet;
public class RussianPrefixesBuilder {
public static void main(String[] args) throws IOException {
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
PrefixesRulesBuilder dictonaryReader = new PrefixesRulesBuilder("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());
//RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
//StatiticsCollector statiticsCollector = new StatiticsCollector(grammaInfo, decoderEncoder);
dictonaryReader.proccess(new WordProccessor(){
PrefixesRulesBuilder dictonaryReader = new PrefixesRulesBuilder("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", "windows-1251",new HashSet<String>());
dictonaryReader.proccess(new WordProccessor() {
public void proccess(WordCard wordCard) throws IOException {
//To change body of implemented methods use File | Settings | File Templates.
}
});
//statiticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info");
dictonaryReader.savePrefixes("russian/src/main/resources/org/apache/lucene/morphology/russian/prefixes.info");
}
}

View File

@ -23,7 +23,7 @@ import java.util.ArrayList;
import java.util.List;
public class LuceneMorphology extends Morphology {
public class LuceneMorphology extends MorphologyWithPrefix {
public LuceneMorphology(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException {
super(fileName, decoderEncoder);
@ -33,6 +33,10 @@ public class LuceneMorphology extends Morphology {
super(inputStream, decoderEncoder);
}
/**
 * Creates a morphology from a morph-form stream and a prefixes stream by
 * passing all three arguments to the superclass constructor unchanged.
 *
 * @param morphFormInputStream stream with the morphology data
 * @param prefixesInputStream  stream with the prefix rules
 * @param decoderEncoder       letter decoder/encoder handed to the superclass
 * @throws IOException if the superclass constructor fails to read its input
 */
public LuceneMorphology(InputStream morphFormInputStream, InputStream prefixesInputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
super(morphFormInputStream, prefixesInputStream, decoderEncoder);
}
@Override
protected String createForm(String form, String grammaInfo) {
return form;

View File

@ -1,11 +1,25 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology;
import java.io.IOException;
import java.io.InputStream;
import java.util.Map;
import java.util.List;
import java.util.HashMap;
import java.util.ArrayList;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.*;
public class MorphologyWithPrefix extends Morphology {
@ -15,8 +29,38 @@ public class MorphologyWithPrefix extends Morphology {
super(fileName, decoderEncoder);
}
public MorphologyWithPrefix(InputStream inputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
super(inputStream, decoderEncoder);
public MorphologyWithPrefix(InputStream morphFormInputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
super(morphFormInputStream, decoderEncoder);
}
/**
 * Creates a morphology and additionally loads prefix rules.
 *
 * The morphology itself is initialised by the superclass constructor from
 * {@code morphFormInputStream}; afterwards the prefix rules are read from
 * {@code prefixesInputStream} by {@code readPrefixes}.
 *
 * @param morphFormInputStream stream handed to the superclass constructor
 * @param prefixesInputStream  stream the prefix rules are read from
 * @param decoderEncoder       letter decoder/encoder handed to the superclass
 * @throws IOException if either stream cannot be read
 */
public MorphologyWithPrefix(InputStream morphFormInputStream,InputStream prefixesInputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
super(morphFormInputStream, decoderEncoder);
// Prefix rules are loaded only after the base morphology has been set up.
readPrefixes(prefixesInputStream);
}
/**
 * Reads all prefix rules from {@code inputStream} (UTF-8 text) and registers
 * each one in {@code prefixRuleMap} under its hash string.
 *
 * The first line must hold the number of rules; that many rule records are
 * then read with {@code readPrefix}. The stream is always closed before this
 * method returns, whether reading succeeds or fails.
 *
 * @param inputStream source of the prefix rules
 * @throws IOException if reading fails or the stream has no rule-count line
 */
private void readPrefixes(InputStream inputStream) throws IOException {
    BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
    try {
        String countLine = bufferedReader.readLine();
        if (countLine == null) {
            // An empty stream would otherwise surface as an opaque NumberFormatException.
            throw new IOException("Prefix rules stream is empty: missing rule count line");
        }
        int prefixAmount = Integer.parseInt(countLine);
        for (int i = 0; i < prefixAmount; i++) {
            PrefixRule prefixRule = readPrefix(bufferedReader);
            prefixRuleMap.put(prefixRule.getHashString(), prefixRule);
        }
    } finally {
        // Release the stream even when a record is malformed or reading fails.
        bufferedReader.close();
    }
}
/**
 * Reads a single prefix rule record from {@code bufferedReader}.
 *
 * A record consists of, in order: a line with the prefix, a line whose first
 * character is the last letter, a line with the number of forms, and then
 * that many lines each holding one form id.
 *
 * @param bufferedReader reader positioned at the start of a record
 * @return the rule populated from the record
 * @throws IOException if reading from the reader fails
 */
private PrefixRule readPrefix(BufferedReader bufferedReader) throws IOException {
    PrefixRule rule = new PrefixRule();

    rule.setPrefix(bufferedReader.readLine());

    String lastLetterLine = bufferedReader.readLine();
    rule.setLastLetter(lastLetterLine.charAt(0));

    HashSet<Short> forms = new HashSet<Short>();
    int remaining = Integer.parseInt(bufferedReader.readLine());
    while (remaining > 0) {
        forms.add(Short.parseShort(bufferedReader.readLine()));
        remaining--;
    }
    rule.setForms(forms);

    return rule;
}
public MorphologyWithPrefix(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) {
@ -25,7 +69,7 @@ public class MorphologyWithPrefix extends Morphology {
@Override
public List<String> getMorhInfo(String s) {
if (s.length() < 4) {
if (prefixRuleMap.size() == 0 || s.length() < 4) {
return super.getMorhInfo(s);
}
String ruleIndex = "" + s.charAt(0) + s.charAt(s.length() - 1);
@ -33,7 +77,7 @@ public class MorphologyWithPrefix extends Morphology {
if (prefixRule == null) {
return super.getMorhInfo(s);
}
if (s.startsWith(prefixRule.getPrefix())) {
if (!s.startsWith(prefixRule.getPrefix())) {
return super.getMorhInfo(s);
}
String sWithoutPrefix = s.substring(prefixRule.getPrefix().length());
@ -42,8 +86,8 @@ public class MorphologyWithPrefix extends Morphology {
int ruleId = findRuleId(ints);
ArrayList<String> result = new ArrayList<String>();
for (Heuristic h : rules[rulesId[ruleId]]) {
String morphInfo = grammaInfo[h.getFormMorphInfo()];
if(prefixRule.getForms().contains(morphInfo)){
//String morphInfo = grammaInfo[];
if(prefixRule.getForms().contains(h.getFormMorphInfo())){
result.add(createForm(h.transofrmWord(sWithoutPrefix),"pr"));
}
}

View File

@ -1,3 +1,18 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology;
import java.io.Serializable;
@ -7,7 +22,7 @@ import java.util.HashSet;
public class PrefixRule implements Serializable {
private Character lastLetter;
private String prefix;
private HashSet<String> forms;
private HashSet<Short> forms;
public Character getLastLetter() {
return lastLetter;
@ -25,11 +40,11 @@ public class PrefixRule implements Serializable {
this.prefix = prefix;
}
public HashSet<String> getForms() {
public HashSet<Short> getForms() {
return forms;
}
public void setForms(HashSet<String> forms) {
public void setForms(HashSet<Short> forms) {
this.forms = forms;
}

View File

@ -22,6 +22,6 @@ import java.io.IOException;
public class RussianLuceneMorphology extends LuceneMorphology {
public RussianLuceneMorphology() throws IOException {
super(RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder());
super(RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"),RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/prefixes.info"), new RussianLetterDecoderEncoder());
}
}

View File

@ -0,0 +1,96 @@
11
наи
е
8
258
255
289
252
292
262
296
286
наи
и
2
263
297
наи
ю
4
250
249
283
284
по
й
5
250
251
248
247
269
по
е
3
255
252
269
наи
й
12
239
273
250
251
248
277
247
282
281
243
285
284
наи
о
6
274
253
276
287
242
240
наи
м
10
256
290
257
291
279
278
294
260
244
245
наи
х
6
259
293
261
295
264
298
наи
я
2
246
280
наи
у
4
275
254
288
241

View File

@ -33,7 +33,7 @@ public class RussianLuceneMorphTest {
@Before
public void setUp() throws IOException {
luceneMorph = new LuceneMorphology(this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder());
luceneMorph = new RussianLuceneMorphology();
}
@Test

View File

@ -1,3 +1,4 @@
наилучший хороший
еду еда ехать
тестов тест
вина вино вина