working on prefix hypotheses
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@87 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
parent
6246f020fd
commit
16613c543b
@ -1,21 +1,34 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
package org.apache.lucene.morphology.dictionary;
|
package org.apache.lucene.morphology.dictionary;
|
||||||
|
|
||||||
import org.apache.lucene.morphology.PrefixRule;
|
import org.apache.lucene.morphology.PrefixRule;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.io.IOException;
|
import java.io.*;
|
||||||
import java.io.BufferedReader;
|
|
||||||
|
|
||||||
|
|
||||||
public class PrefixesRulesBuilder extends DictonaryReader {
|
public class PrefixesRulesBuilder extends DictonaryReader {
|
||||||
|
private GrammaReader grammaInfo;
|
||||||
|
|
||||||
private Map<FlexiaModel,Set<FlexiaModel>> rules = new HashMap<FlexiaModel,Set<FlexiaModel>>();
|
private Map<FlexiaModel,Set<FlexiaModel>> rules = new HashMap<FlexiaModel,Set<FlexiaModel>>();
|
||||||
|
|
||||||
public PrefixesRulesBuilder(String fileName, Set<String> ingnoredForm) {
|
public PrefixesRulesBuilder(String fileName, String fileEncoding, Set<String> ingnoredForm) throws IOException {
|
||||||
super(fileName, ingnoredForm);
|
|
||||||
}
|
|
||||||
|
|
||||||
public PrefixesRulesBuilder(String fileName, String fileEncoding, Set<String> ingnoredForm) {
|
|
||||||
super(fileName, fileEncoding, ingnoredForm);
|
super(fileName, fileEncoding, ingnoredForm);
|
||||||
|
grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -31,9 +44,10 @@ public class PrefixesRulesBuilder extends DictonaryReader {
|
|||||||
PrefixRule prefixRule = new PrefixRule();
|
PrefixRule prefixRule = new PrefixRule();
|
||||||
prefixRule.setPrefix(key.getPrefix());
|
prefixRule.setPrefix(key.getPrefix());
|
||||||
prefixRule.setLastLetter(key.getSuffix().charAt(0));
|
prefixRule.setLastLetter(key.getSuffix().charAt(0));
|
||||||
HashSet<String> map = new HashSet<String>();
|
HashSet<Short> map = new HashSet<Short>();
|
||||||
for(FlexiaModel fm:rules.get(key)){
|
for(FlexiaModel fm:rules.get(key)){
|
||||||
map.add(fm.getCode());
|
int gi = grammaInfo.getGrammInversIndex().get(fm.getCode());
|
||||||
|
map.add((short) gi);
|
||||||
}
|
}
|
||||||
prefixRule.setForms(map);
|
prefixRule.setForms(map);
|
||||||
prefixRules.add(prefixRule);
|
prefixRules.add(prefixRule);
|
||||||
@ -73,10 +87,10 @@ public class PrefixesRulesBuilder extends DictonaryReader {
|
|||||||
private void testFlexia(List<FlexiaModel> models, FlexiaModel fm) {
|
private void testFlexia(List<FlexiaModel> models, FlexiaModel fm) {
|
||||||
for(FlexiaModel com:models){
|
for(FlexiaModel com:models){
|
||||||
if(com.getSuffix().equals(fm.getSuffix()) && com.getPrefix().length() == 0){
|
if(com.getSuffix().equals(fm.getSuffix()) && com.getPrefix().length() == 0){
|
||||||
Set<FlexiaModel> models1 = rules.get(convert(fm));
|
Set<FlexiaModel> models1 = rules.get(convertForKey(fm));
|
||||||
if(models1 == null){
|
if(models1 == null){
|
||||||
models1 = new HashSet<FlexiaModel>();
|
models1 = new HashSet<FlexiaModel>();
|
||||||
rules.put(convert(fm),models1);
|
rules.put(convertForKey(fm),models1);
|
||||||
}
|
}
|
||||||
models1.add(convert(com));
|
models1.add(convert(com));
|
||||||
}
|
}
|
||||||
@ -85,8 +99,14 @@ public class PrefixesRulesBuilder extends DictonaryReader {
|
|||||||
|
|
||||||
private FlexiaModel convert(FlexiaModel fm){
|
private FlexiaModel convert(FlexiaModel fm){
|
||||||
String suf = fm.getSuffix();
|
String suf = fm.getSuffix();
|
||||||
if(suf.length() == 1) System.out.println(fm);
|
//if(suf.length() == 1) System.out.println(fm);
|
||||||
return new FlexiaModel(fm.getCode(),""+ suf.charAt(suf.length()-1)+ (suf.length() > 1 ? suf.charAt(suf.length()-2) : ""),fm.getPrefix());
|
return new FlexiaModel(fm.getCode(),""+ suf.charAt(suf.length()-1),fm.getPrefix());
|
||||||
|
}
|
||||||
|
|
||||||
|
private FlexiaModel convertForKey(FlexiaModel fm){
|
||||||
|
String suf = fm.getSuffix();
|
||||||
|
//if(suf.length() == 1) System.out.println(fm);
|
||||||
|
return new FlexiaModel("pr",""+ suf.charAt(suf.length()-1),fm.getPrefix());
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
|
protected void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
|
||||||
@ -97,4 +117,23 @@ public class PrefixesRulesBuilder extends DictonaryReader {
|
|||||||
if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
|
if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void savePrefixes(String fileName) throws IOException {
|
||||||
|
OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8");
|
||||||
|
List<PrefixRule> prefixRuleList = getPrefixRules();
|
||||||
|
writer.write(prefixRuleList.size()+"\n");
|
||||||
|
for(PrefixRule pr: prefixRuleList){
|
||||||
|
writePrefixRule(writer, pr);
|
||||||
|
}
|
||||||
|
writer.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void writePrefixRule(OutputStreamWriter writer, PrefixRule pr) throws IOException {
|
||||||
|
writer.write(pr.getPrefix()+"\n");
|
||||||
|
writer.write(pr.getLastLetter()+"\n");
|
||||||
|
HashSet<Short> formInfo = pr.getForms();
|
||||||
|
writer.write(formInfo.size()+"\n");
|
||||||
|
for(Short s:formInfo){
|
||||||
|
writer.write(s+"\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -24,17 +24,16 @@ import java.util.HashSet;
|
|||||||
|
|
||||||
public class RussianPrefixesBuilder {
|
public class RussianPrefixesBuilder {
|
||||||
public static void main(String[] args) throws IOException {
|
public static void main(String[] args) throws IOException {
|
||||||
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
|
|
||||||
PrefixesRulesBuilder dictonaryReader = new PrefixesRulesBuilder("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());
|
|
||||||
|
|
||||||
//RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
|
PrefixesRulesBuilder dictonaryReader = new PrefixesRulesBuilder("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", "windows-1251",new HashSet<String>());
|
||||||
//StatiticsCollector statiticsCollector = new StatiticsCollector(grammaInfo, decoderEncoder);
|
|
||||||
|
|
||||||
dictonaryReader.proccess(new WordProccessor() {
|
dictonaryReader.proccess(new WordProccessor() {
|
||||||
public void proccess(WordCard wordCard) throws IOException {
|
public void proccess(WordCard wordCard) throws IOException {
|
||||||
//To change body of implemented methods use File | Settings | File Templates.
|
|
||||||
}
|
|
||||||
});
|
|
||||||
//statiticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info");
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
dictonaryReader.savePrefixes("russian/src/main/resources/org/apache/lucene/morphology/russian/prefixes.info");
|
||||||
|
}
|
||||||
}
|
}
|
@ -23,7 +23,7 @@ import java.util.ArrayList;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
|
||||||
public class LuceneMorphology extends Morphology {
|
public class LuceneMorphology extends MorphologyWithPrefix {
|
||||||
|
|
||||||
public LuceneMorphology(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException {
|
public LuceneMorphology(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException {
|
||||||
super(fileName, decoderEncoder);
|
super(fileName, decoderEncoder);
|
||||||
@ -33,6 +33,10 @@ public class LuceneMorphology extends Morphology {
|
|||||||
super(inputStream, decoderEncoder);
|
super(inputStream, decoderEncoder);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public LuceneMorphology(InputStream morphFormInputStream, InputStream prefixesInputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
|
||||||
|
super(morphFormInputStream, prefixesInputStream, decoderEncoder);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected String createForm(String form, String grammaInfo) {
|
protected String createForm(String form, String grammaInfo) {
|
||||||
return form;
|
return form;
|
||||||
|
@ -1,11 +1,25 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
package org.apache.lucene.morphology;
|
package org.apache.lucene.morphology;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.util.Map;
|
import java.io.BufferedReader;
|
||||||
import java.util.List;
|
import java.io.InputStreamReader;
|
||||||
import java.util.HashMap;
|
import java.util.*;
|
||||||
import java.util.ArrayList;
|
|
||||||
|
|
||||||
|
|
||||||
public class MorphologyWithPrefix extends Morphology {
|
public class MorphologyWithPrefix extends Morphology {
|
||||||
@ -15,8 +29,38 @@ public class MorphologyWithPrefix extends Morphology {
|
|||||||
super(fileName, decoderEncoder);
|
super(fileName, decoderEncoder);
|
||||||
}
|
}
|
||||||
|
|
||||||
public MorphologyWithPrefix(InputStream inputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
|
public MorphologyWithPrefix(InputStream morphFormInputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
|
||||||
super(inputStream, decoderEncoder);
|
super(morphFormInputStream, decoderEncoder);
|
||||||
|
}
|
||||||
|
|
||||||
|
public MorphologyWithPrefix(InputStream morphFormInputStream,InputStream prefixesInputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
|
||||||
|
super(morphFormInputStream, decoderEncoder);
|
||||||
|
readPrefixes(prefixesInputStream);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void readPrefixes(InputStream inputStream) throws IOException {
|
||||||
|
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
|
||||||
|
Integer prefixAmount = Integer.parseInt(bufferedReader.readLine());
|
||||||
|
for(int i = 0; i < prefixAmount;i++){
|
||||||
|
PrefixRule prefixRule = readPrefix(bufferedReader);
|
||||||
|
prefixRuleMap.put(prefixRule.getHashString(),prefixRule);
|
||||||
|
}
|
||||||
|
bufferedReader.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
private PrefixRule readPrefix(BufferedReader bufferedReader) throws IOException {
|
||||||
|
PrefixRule prefixRule = new PrefixRule();
|
||||||
|
String s = bufferedReader.readLine();
|
||||||
|
prefixRule.setPrefix(s);
|
||||||
|
s = bufferedReader.readLine();
|
||||||
|
prefixRule.setLastLetter(s.charAt(0));
|
||||||
|
HashSet<Short> morph = new HashSet<Short>();
|
||||||
|
int formAmount = Integer.valueOf(bufferedReader.readLine());
|
||||||
|
for(int i = 0; i < formAmount; i++){
|
||||||
|
morph.add(Short.valueOf(bufferedReader.readLine()));
|
||||||
|
}
|
||||||
|
prefixRule.setForms(morph);
|
||||||
|
return prefixRule;
|
||||||
}
|
}
|
||||||
|
|
||||||
public MorphologyWithPrefix(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) {
|
public MorphologyWithPrefix(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) {
|
||||||
@ -25,7 +69,7 @@ public class MorphologyWithPrefix extends Morphology {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<String> getMorhInfo(String s) {
|
public List<String> getMorhInfo(String s) {
|
||||||
if (s.length() < 4) {
|
if (prefixRuleMap.size() == 0 || s.length() < 4) {
|
||||||
return super.getMorhInfo(s);
|
return super.getMorhInfo(s);
|
||||||
}
|
}
|
||||||
String ruleIndex = "" + s.charAt(0) + s.charAt(s.length() - 1);
|
String ruleIndex = "" + s.charAt(0) + s.charAt(s.length() - 1);
|
||||||
@ -33,7 +77,7 @@ public class MorphologyWithPrefix extends Morphology {
|
|||||||
if (prefixRule == null) {
|
if (prefixRule == null) {
|
||||||
return super.getMorhInfo(s);
|
return super.getMorhInfo(s);
|
||||||
}
|
}
|
||||||
if (s.startsWith(prefixRule.getPrefix())) {
|
if (!s.startsWith(prefixRule.getPrefix())) {
|
||||||
return super.getMorhInfo(s);
|
return super.getMorhInfo(s);
|
||||||
}
|
}
|
||||||
String sWithoutPrefix = s.substring(prefixRule.getPrefix().length());
|
String sWithoutPrefix = s.substring(prefixRule.getPrefix().length());
|
||||||
@ -42,8 +86,8 @@ public class MorphologyWithPrefix extends Morphology {
|
|||||||
int ruleId = findRuleId(ints);
|
int ruleId = findRuleId(ints);
|
||||||
ArrayList<String> result = new ArrayList<String>();
|
ArrayList<String> result = new ArrayList<String>();
|
||||||
for (Heuristic h : rules[rulesId[ruleId]]) {
|
for (Heuristic h : rules[rulesId[ruleId]]) {
|
||||||
String morphInfo = grammaInfo[h.getFormMorphInfo()];
|
//String morphInfo = grammaInfo[];
|
||||||
if(prefixRule.getForms().contains(morphInfo)){
|
if(prefixRule.getForms().contains(h.getFormMorphInfo())){
|
||||||
result.add(createForm(h.transofrmWord(sWithoutPrefix),"pr"));
|
result.add(createForm(h.transofrmWord(sWithoutPrefix),"pr"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,3 +1,18 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
package org.apache.lucene.morphology;
|
package org.apache.lucene.morphology;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
@ -7,7 +22,7 @@ import java.util.HashSet;
|
|||||||
public class PrefixRule implements Serializable {
|
public class PrefixRule implements Serializable {
|
||||||
private Character lastLetter;
|
private Character lastLetter;
|
||||||
private String prefix;
|
private String prefix;
|
||||||
private HashSet<String> forms;
|
private HashSet<Short> forms;
|
||||||
|
|
||||||
public Character getLastLetter() {
|
public Character getLastLetter() {
|
||||||
return lastLetter;
|
return lastLetter;
|
||||||
@ -25,11 +40,11 @@ public class PrefixRule implements Serializable {
|
|||||||
this.prefix = prefix;
|
this.prefix = prefix;
|
||||||
}
|
}
|
||||||
|
|
||||||
public HashSet<String> getForms() {
|
public HashSet<Short> getForms() {
|
||||||
return forms;
|
return forms;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setForms(HashSet<String> forms) {
|
public void setForms(HashSet<Short> forms) {
|
||||||
this.forms = forms;
|
this.forms = forms;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -22,6 +22,6 @@ import java.io.IOException;
|
|||||||
public class RussianLuceneMorphology extends LuceneMorphology {
|
public class RussianLuceneMorphology extends LuceneMorphology {
|
||||||
|
|
||||||
public RussianLuceneMorphology() throws IOException {
|
public RussianLuceneMorphology() throws IOException {
|
||||||
super(RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder());
|
super(RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"),RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/prefixes.info"), new RussianLetterDecoderEncoder());
|
||||||
}
|
}
|
||||||
}
|
}
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,96 @@
|
|||||||
|
11
|
||||||
|
наи
|
||||||
|
е
|
||||||
|
8
|
||||||
|
258
|
||||||
|
255
|
||||||
|
289
|
||||||
|
252
|
||||||
|
292
|
||||||
|
262
|
||||||
|
296
|
||||||
|
286
|
||||||
|
наи
|
||||||
|
и
|
||||||
|
2
|
||||||
|
263
|
||||||
|
297
|
||||||
|
наи
|
||||||
|
ю
|
||||||
|
4
|
||||||
|
250
|
||||||
|
249
|
||||||
|
283
|
||||||
|
284
|
||||||
|
по
|
||||||
|
й
|
||||||
|
5
|
||||||
|
250
|
||||||
|
251
|
||||||
|
248
|
||||||
|
247
|
||||||
|
269
|
||||||
|
по
|
||||||
|
е
|
||||||
|
3
|
||||||
|
255
|
||||||
|
252
|
||||||
|
269
|
||||||
|
наи
|
||||||
|
й
|
||||||
|
12
|
||||||
|
239
|
||||||
|
273
|
||||||
|
250
|
||||||
|
251
|
||||||
|
248
|
||||||
|
277
|
||||||
|
247
|
||||||
|
282
|
||||||
|
281
|
||||||
|
243
|
||||||
|
285
|
||||||
|
284
|
||||||
|
наи
|
||||||
|
о
|
||||||
|
6
|
||||||
|
274
|
||||||
|
253
|
||||||
|
276
|
||||||
|
287
|
||||||
|
242
|
||||||
|
240
|
||||||
|
наи
|
||||||
|
м
|
||||||
|
10
|
||||||
|
256
|
||||||
|
290
|
||||||
|
257
|
||||||
|
291
|
||||||
|
279
|
||||||
|
278
|
||||||
|
294
|
||||||
|
260
|
||||||
|
244
|
||||||
|
245
|
||||||
|
наи
|
||||||
|
х
|
||||||
|
6
|
||||||
|
259
|
||||||
|
293
|
||||||
|
261
|
||||||
|
295
|
||||||
|
264
|
||||||
|
298
|
||||||
|
наи
|
||||||
|
я
|
||||||
|
2
|
||||||
|
246
|
||||||
|
280
|
||||||
|
наи
|
||||||
|
у
|
||||||
|
4
|
||||||
|
275
|
||||||
|
254
|
||||||
|
288
|
||||||
|
241
|
@ -33,7 +33,7 @@ public class RussianLuceneMorphTest {
|
|||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void setUp() throws IOException {
|
public void setUp() throws IOException {
|
||||||
luceneMorph = new LuceneMorphology(this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder());
|
luceneMorph = new RussianLuceneMorphology();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
наилучший хороший
|
||||||
еду еда ехать
|
еду еда ехать
|
||||||
тестов тест
|
тестов тест
|
||||||
вина вино вина
|
вина вино вина
|
||||||
|
Loading…
x
Reference in New Issue
Block a user