Roll back wrong version of morphology, add interface for morphology
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@88 d817d54c-26ab-11de-abc9-2f7d1455ff7a
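The "interface for morphology" half of this change is visible in the StatiticsCollector hunks below, where the generator stops importing and constructing Morphology directly and builds MorphologyImpl instead. A minimal sketch of that split, assuming hypothetical member names (only writeToFile and the constructor arguments appear in the diff, and whether writeToFile sits on the interface or only on the implementation is not shown here):

package org.apache.lucene.morphology;

import java.io.IOException;

// Sketch of the extracted contract; MorphologyImpl, which StatiticsCollector
// now instantiates, is assumed to implement it. The method set is illustrative.
public interface Morphology {
    // persistence hook seen in the diff: morphology.writeToFile(fileName)
    void writeToFile(String fileName) throws IOException;
}

The dictionary generator presumably keeps the concrete MorphologyImpl type because it needs the serialization side, while analysis-time callers can depend on the interface alone.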
@@ -31,9 +31,9 @@ import java.util.*;
 public class DictonaryReader {
     private String fileName;
     private String fileEncoding = "windows-1251";
-    protected List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
-    protected List<List<String>> wordPrefixes = new ArrayList<List<String>>();
-    protected Set<String> ingnoredForm = new HashSet<String>();
+    private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
+    private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
+    private Set<String> ingnoredForm = new HashSet<String>();

     public DictonaryReader(String fileName, Set<String> ingnoredForm) {
         this.fileName = fileName;
@@ -57,7 +57,7 @@ public class DictonaryReader {
     }


-    protected void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
+    private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
         String s = reader.readLine();
         int count = Integer.valueOf(s);
         for (int i = 0; i < count; i++) {
@@ -81,7 +81,7 @@ public class DictonaryReader {
     }


-    protected void sckipBlock(BufferedReader reader) throws IOException {
+    private void sckipBlock(BufferedReader reader) throws IOException {
         String s = reader.readLine();
         int count = Integer.valueOf(s);
         for (int i = 0; i < count; i++) {
@@ -90,7 +90,7 @@ public class DictonaryReader {
     }


-    protected void readPrefix(BufferedReader reader) throws IOException {
+    private void readPrefix(BufferedReader reader) throws IOException {
         String s = reader.readLine();
         int count = Integer.valueOf(s);
         for (int i = 0; i < count; i++) {
@@ -99,7 +99,7 @@ public class DictonaryReader {
         }
     }

-    protected void readFlexias(BufferedReader reader) throws IOException {
+    private void readFlexias(BufferedReader reader) throws IOException {
         String s = reader.readLine();
         int count = Integer.valueOf(s);
         for (int i = 0; i < count; i++) {
@@ -112,7 +112,7 @@ public class DictonaryReader {
         }
     }

-    protected void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
+    private void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
         String[] fl = line.split("\\*");
         // we inored all forms thats
         if (fl.length == 3) {
@@ -1,5 +1,5 @@
 /**
- * Copyright 2009 Alexander Kuznetsov
+ * Copyright 2009 Alexander Kuznetsov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -60,28 +60,6 @@ public class FlexiaModel {

     @Override
     public String toString() {
-        return prefix + " " + suffix + " " + code;
-    }
-
-    @Override
-    public boolean equals(Object o) {
-        if (this == o) return true;
-        if (o == null || getClass() != o.getClass()) return false;
-
-        FlexiaModel that = (FlexiaModel) o;
-
-        if (code != null ? !code.equals(that.code) : that.code != null) return false;
-        if (prefix != null ? !prefix.equals(that.prefix) : that.prefix != null) return false;
-        if (suffix != null ? !suffix.equals(that.suffix) : that.suffix != null) return false;
-
-        return true;
-    }
-
-    @Override
-    public int hashCode() {
-        int result = code != null ? code.hashCode() : 0;
-        result = 31 * result + (suffix != null ? suffix.hashCode() : 0);
-        result = 31 * result + (prefix != null ? prefix.hashCode() : 0);
-        return result;
+        return prefix + " " + suffix;
     }
 }
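The equals/hashCode pair removed above appears to have been needed only while FlexiaModel served as a hash key: the PrefixesRulesBuilder deleted below keeps a Map<FlexiaModel, Set<FlexiaModel>> and HashSet<FlexiaModel> values, which rely on value equality. A small illustration of why that mattered (the FlexiaModel values are made up; the constructor order code, suffix, prefix follows the convert() calls in the deleted class):

import org.apache.lucene.morphology.dictionary.FlexiaModel;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class FlexiaModelKeyDemo {
    public static void main(String[] args) {
        // Two models built from the same dictionary entry; sample values only.
        FlexiaModel a = new FlexiaModel("pr", "а", "по");
        FlexiaModel b = new FlexiaModel("pr", "а", "по");

        Map<FlexiaModel, Set<FlexiaModel>> rules = new HashMap<FlexiaModel, Set<FlexiaModel>>();
        rules.put(a, new HashSet<FlexiaModel>());

        // true with value-based equals/hashCode, false with the Object defaults;
        // with the rules builder rolled back, the overrides can go as well.
        System.out.println(rules.containsKey(b));
    }
}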
@@ -1,139 +0,0 @@
-/**
- * Copyright 2009 Alexander Kuznetsov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.morphology.dictionary;
-
-import org.apache.lucene.morphology.PrefixRule;
-
-import java.util.*;
-import java.io.*;
-
-
-public class PrefixesRulesBuilder extends DictonaryReader {
-    private GrammaReader grammaInfo;
-
-    private Map<FlexiaModel,Set<FlexiaModel>> rules = new HashMap<FlexiaModel,Set<FlexiaModel>>();
-
-    public PrefixesRulesBuilder(String fileName, String fileEncoding, Set<String> ingnoredForm) throws IOException {
-        super(fileName, fileEncoding, ingnoredForm);
-        grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
-    }
-
-    @Override
-    public void proccess(WordProccessor wordProccessor) throws IOException {
-        super.proccess(wordProccessor);
-        System.out.println(rules.size());
-        System.out.println(rules);
-    }
-
-    public List<PrefixRule> getPrefixRules(){
-        List<PrefixRule> prefixRules = new ArrayList<PrefixRule>();
-        for(FlexiaModel key:rules.keySet()){
-            PrefixRule prefixRule = new PrefixRule();
-            prefixRule.setPrefix(key.getPrefix());
-            prefixRule.setLastLetter(key.getSuffix().charAt(0));
-            HashSet<Short> map = new HashSet<Short>();
-            for(FlexiaModel fm:rules.get(key)){
-                int gi = grammaInfo.getGrammInversIndex().get(fm.getCode());
-                map.add((short) gi);
-            }
-            prefixRule.setForms(map);
-            prefixRules.add(prefixRule);
-        }
-        return prefixRules;
-    }
-
-    @Override
-    protected void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
-        sckipBlock(reader);
-    }
-
-
-
-    @Override
-    protected void readPrefix(BufferedReader reader) throws IOException {
-        sckipBlock(reader);
-    }
-
-    @Override
-    protected void readFlexias(BufferedReader reader) throws IOException {
-        super.readFlexias(reader);
-        //todo research flesias
-        for(List<FlexiaModel> fmList:wordsFlexias){
-            research(fmList);
-        }
-    }
-
-    private void research(List<FlexiaModel> models) {
-        for(FlexiaModel fm:models){
-            if(fm.getPrefix().length() > 0){
-                testFlexia(models, fm);
-            }
-        }
-    }
-
-    private void testFlexia(List<FlexiaModel> models, FlexiaModel fm) {
-        for(FlexiaModel com:models){
-            if(com.getSuffix().equals(fm.getSuffix()) && com.getPrefix().length() == 0){
-                Set<FlexiaModel> models1 = rules.get(convertForKey(fm));
-                if(models1 == null){
-                    models1 = new HashSet<FlexiaModel>();
-                    rules.put(convertForKey(fm),models1);
-                }
-                models1.add(convert(com));
-            }
-        }
-    }
-
-    private FlexiaModel convert(FlexiaModel fm){
-        String suf = fm.getSuffix();
-        //if(suf.length() == 1) System.out.println(fm);
-        return new FlexiaModel(fm.getCode(),""+ suf.charAt(suf.length()-1),fm.getPrefix());
-    }
-
-    private FlexiaModel convertForKey(FlexiaModel fm){
-        String suf = fm.getSuffix();
-        //if(suf.length() == 1) System.out.println(fm);
-        return new FlexiaModel("pr",""+ suf.charAt(suf.length()-1),fm.getPrefix());
-    }
-
-    protected void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
-        String[] fl = line.split("\\*");
-        if (fl.length == 3) {
-            flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
-        }
-        if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
-    }
-
-    public void savePrefixes(String fileName) throws IOException {
-        OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8");
-        List<PrefixRule> prefixRuleList = getPrefixRules();
-        writer.write(prefixRuleList.size()+"\n");
-        for(PrefixRule pr: prefixRuleList){
-            writePrefixRule(writer, pr);
-        }
-        writer.close();
-    }
-
-    private void writePrefixRule(OutputStreamWriter writer, PrefixRule pr) throws IOException {
-        writer.write(pr.getPrefix()+"\n");
-        writer.write(pr.getLastLetter()+"\n");
-        HashSet<Short> formInfo = pr.getForms();
-        writer.write(formInfo.size()+"\n");
-        for(Short s:formInfo){
-            writer.write(s+"\n");
-        }
-    }
-}
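For reference, the removed savePrefixes/writePrefixRule pair wrote a rule count followed by one block per rule: the prefix, the last letter, the number of forms, then one form id per line, UTF-8 encoded (the file that RussianPrefixesBuilder, also deleted below, saved as prefixes.info). A hypothetical reader for that layout, not part of this project, using only the PrefixRule setters visible in the deleted code:

import org.apache.lucene.morphology.PrefixRule;

import java.io.*;
import java.util.*;

// Counterpart sketch to savePrefixes/writePrefixRule above: reads <count>,
// then per rule <prefix>, <lastLetter>, <formCount> and that many form ids.
public class PrefixRulesReader {
    public static List<PrefixRule> readPrefixes(String fileName) throws IOException {
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(fileName), "UTF-8"));
        List<PrefixRule> rules = new ArrayList<PrefixRule>();
        int ruleCount = Integer.valueOf(reader.readLine());
        for (int i = 0; i < ruleCount; i++) {
            PrefixRule rule = new PrefixRule();
            rule.setPrefix(reader.readLine());
            rule.setLastLetter(reader.readLine().charAt(0));
            HashSet<Short> forms = new HashSet<Short>();
            int formCount = Integer.valueOf(reader.readLine());
            for (int j = 0; j < formCount; j++) {
                forms.add(Short.valueOf(reader.readLine()));
            }
            rule.setForms(forms);
            rules.add(rule);
        }
        reader.close();
        return rules;
    }
}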
@@ -19,7 +19,7 @@ package org.apache.lucene.morphology.dictionary;

 import org.apache.lucene.morphology.Heuristic;
 import org.apache.lucene.morphology.LetterDecoderEncoder;
-import org.apache.lucene.morphology.Morphology;
+import org.apache.lucene.morphology.MorphologyImpl;

 import java.io.IOException;
 import java.util.*;
@@ -119,7 +119,7 @@ public class StatiticsCollector implements WordProccessor {
                 prevSet = currentSet;
             }
         }
-        Morphology morphology = new Morphology(ints, rulesId, heuristics, grammaReader.getGrammaInfoAsArray());
+        MorphologyImpl morphology = new MorphologyImpl(ints, rulesId, heuristics, grammaReader.getGrammaInfoAsArray());
         morphology.writeToFile(fileName);
     }

@@ -1,39 +0,0 @@
-/**
- * Copyright 2009 Alexander Kuznetsov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.morphology.generator;
-
-import org.apache.lucene.morphology.dictionary.*;
-
-import java.io.IOException;
-import java.util.HashSet;
-
-
-public class RussianPrefixesBuilder {
-    public static void main(String[] args) throws IOException {
-
-        PrefixesRulesBuilder dictonaryReader = new PrefixesRulesBuilder("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", "windows-1251",new HashSet<String>());
-
-
-        dictonaryReader.proccess(new WordProccessor() {
-            public void proccess(WordCard wordCard) throws IOException {
-
-            }
-        });
-
-        dictonaryReader.savePrefixes("russian/src/main/resources/org/apache/lucene/morphology/russian/prefixes.info");
-    }
-}