working on prefixes

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@85 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
alexander.a.kuznetsov 2009-11-11 07:10:46 +00:00
parent e478d86fe0
commit 97fa8fa868
5 changed files with 158 additions and 11 deletions

View File

@ -31,9 +31,9 @@ import java.util.*;
public class DictonaryReader {
private String fileName;
private String fileEncoding = "windows-1251";
private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
private Set<String> ingnoredForm = new HashSet<String>();
protected List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
protected List<List<String>> wordPrefixes = new ArrayList<List<String>>();
protected Set<String> ingnoredForm = new HashSet<String>();
public DictonaryReader(String fileName, Set<String> ingnoredForm) {
this.fileName = fileName;
@ -57,7 +57,7 @@ public class DictonaryReader {
}
private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
protected void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
String s = reader.readLine();
int count = Integer.valueOf(s);
for (int i = 0; i < count; i++) {
@ -81,7 +81,7 @@ public class DictonaryReader {
}
private void sckipBlock(BufferedReader reader) throws IOException {
protected void sckipBlock(BufferedReader reader) throws IOException {
String s = reader.readLine();
int count = Integer.valueOf(s);
for (int i = 0; i < count; i++) {
@ -90,7 +90,7 @@ public class DictonaryReader {
}
private void readPrefix(BufferedReader reader) throws IOException {
protected void readPrefix(BufferedReader reader) throws IOException {
String s = reader.readLine();
int count = Integer.valueOf(s);
for (int i = 0; i < count; i++) {
@ -99,7 +99,7 @@ public class DictonaryReader {
}
}
private void readFlexias(BufferedReader reader) throws IOException {
protected void readFlexias(BufferedReader reader) throws IOException {
String s = reader.readLine();
int count = Integer.valueOf(s);
for (int i = 0; i < count; i++) {
@ -112,7 +112,7 @@ public class DictonaryReader {
}
}
private void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
protected void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
String[] fl = line.split("\\*");
// we inored all forms thats
if (fl.length == 3) {

View File

@ -1,5 +1,5 @@
/**
* Copyright 2009 Alexander Kuznetsov
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -60,6 +60,28 @@ public class FlexiaModel {
@Override
public String toString() {
return prefix + " " + suffix;
return prefix + " " + suffix + " " + code;
}
/**
 * Compares this model with another object field by field.
 *
 * @param o the object to compare with; may be {@code null}
 * @return {@code true} when {@code o} is this instance, or an instance of
 *         exactly the same class whose code, prefix and suffix all match
 *         (two {@code null} values count as a match)
 */
@Override
public boolean equals(Object o) {
    if (this == o) {
        return true;
    }
    if (o == null || getClass() != o.getClass()) {
        return false;
    }
    FlexiaModel other = (FlexiaModel) o;
    // Fields are checked in the order code, prefix, suffix; the first mismatch ends the comparison.
    return (code == null ? other.code == null : code.equals(other.code))
            && (prefix == null ? other.prefix == null : prefix.equals(other.prefix))
            && (suffix == null ? other.suffix == null : suffix.equals(other.suffix));
}
/**
 * Computes a hash from the code, suffix and prefix, in that order, treating
 * a {@code null} field as 0.
 *
 * @return the combined hash of the three fields
 */
@Override
public int hashCode() {
    int codeHash = code == null ? 0 : code.hashCode();
    int suffixHash = suffix == null ? 0 : suffix.hashCode();
    int prefixHash = prefix == null ? 0 : prefix.hashCode();
    // Same polynomial as ((code * 31) + suffix) * 31 + prefix.
    return 31 * (31 * codeHash + suffixHash) + prefixHash;
}
}

View File

@ -0,0 +1,82 @@
package org.apache.lucene.morphology.dictionary;
import java.util.*;
import java.io.IOException;
import java.io.BufferedReader;
/**
 * A {@link DictonaryReader} that, instead of reading words, inspects the
 * flexia lists and records which prefixed flexias share a suffix with a
 * prefix-less flexia of the same list.
 *
 * <p>The words block and the prefix block are passed to {@code sckipBlock};
 * only the flexia block is read. The collected rules are printed to standard
 * output at the end of {@link #proccess(WordProccessor)}.
 */
public class PrefixesHypotises extends DictonaryReader {
    /**
     * Maps the converted form of a prefixed flexia to the converted forms of
     * the prefix-less flexias from the same list that have the same suffix.
     */
    private Map<FlexiaModel, Set<FlexiaModel>> rules = new HashMap<FlexiaModel, Set<FlexiaModel>>();

    public PrefixesHypotises(String fileName, Set<String> ingnoredForm) {
        super(fileName, ingnoredForm);
    }

    public PrefixesHypotises(String fileName, String fileEncoding, Set<String> ingnoredForm) {
        super(fileName, fileEncoding, ingnoredForm);
    }

    /**
     * Runs the inherited processing and then prints the number of collected
     * rules followed by the rules themselves to standard output.
     *
     * @param wordProccessor passed through to the superclass unchanged
     * @throws IOException if the superclass processing fails
     */
    @Override
    public void proccess(WordProccessor wordProccessor) throws IOException {
        super.proccess(wordProccessor);
        System.out.println(rules.size());
        System.out.println(rules);
    }

    /**
     * Does not read any words: the block is handed to {@code sckipBlock} and
     * {@code wordProccessor} is never invoked.
     */
    @Override
    protected void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
        sckipBlock(reader);
    }

    /** Does not read prefixes: the block is handed to {@code sckipBlock}. */
    @Override
    protected void readPrefix(BufferedReader reader) throws IOException {
        sckipBlock(reader);
    }

    /**
     * Reads the flexia block through the superclass and then examines every
     * list in {@code wordsFlexias} for prefix rules.
     */
    @Override
    protected void readFlexias(BufferedReader reader) throws IOException {
        super.readFlexias(reader);
        //todo research flexias
        for (List<FlexiaModel> fmList : wordsFlexias) {
            research(fmList);
        }
    }

    /** Tests every flexia of the list that has a non-empty prefix. */
    private void research(List<FlexiaModel> models) {
        for (FlexiaModel fm : models) {
            if (fm.getPrefix().length() > 0) {
                testFlexia(models, fm);
            }
        }
    }

    /**
     * Records a rule for each flexia in {@code models} that has an empty
     * prefix and the same suffix as {@code fm}.
     *
     * @param models the flexia list {@code fm} belongs to
     * @param fm     a flexia with a non-empty prefix
     */
    private void testFlexia(List<FlexiaModel> models, FlexiaModel fm) {
        for (FlexiaModel com : models) {
            if (com.getSuffix().equals(fm.getSuffix()) && com.getPrefix().length() == 0) {
                // Convert the key once per match instead of once for the lookup and again for the insert.
                FlexiaModel key = convert(fm);
                Set<FlexiaModel> models1 = rules.get(key);
                if (models1 == null) {
                    models1 = new HashSet<FlexiaModel>();
                    rules.put(key, models1);
                }
                models1.add(convert(com));
            }
        }
    }

    /**
     * Builds the reduced model used in {@code rules}: same code and prefix,
     * but the suffix is replaced by its last character followed by its
     * second-to-last character (fewer characters when the suffix is shorter).
     *
     * <p>An empty suffix yields an empty reduced suffix; previously it caused
     * a {@code StringIndexOutOfBoundsException}.
     */
    private FlexiaModel convert(FlexiaModel fm) {
        String suf = fm.getSuffix();
        int length = suf.length();
        // Diagnostic output for single-character suffixes, kept from the original implementation.
        if (length == 1) System.out.println(fm);
        StringBuilder tail = new StringBuilder(2);
        if (length > 0) {
            tail.append(suf.charAt(length - 1));
        }
        if (length > 1) {
            tail.append(suf.charAt(length - 2));
        }
        return new FlexiaModel(fm.getCode(), tail.toString(), fm.getPrefix());
    }

    /**
     * Parses one {@code *}-separated flexia line and appends the resulting
     * model to the list. Lines with three fields supply the third field
     * (lower-cased) as the last constructor argument; lines with two fields
     * use an empty string there. Lines with any other field count are ignored.
     *
     * @param flexiaModelArrayList list receiving the parsed model
     * @param line                 raw line from the flexia block
     */
    @Override
    protected void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
        String[] fl = line.split("\\*");
        if (fl.length == 3) {
            flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
        }
        if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
    }
}

View File

@ -0,0 +1,41 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.generator;
import org.apache.lucene.morphology.dictionary.*;
import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder;
import java.io.IOException;
import java.util.HashSet;
/**
 * Command-line entry point that runs {@link PrefixesHypotises} over the
 * Russian morphology source dictionary with a word processor that does nothing.
 */
public class RussianPrefixesBuilder {

    /**
     * Constructs the grammar reader and the prefix-hypothesis reader from
     * their hard-coded paths and processes the dictionary.
     *
     * @param args ignored
     * @throws IOException propagated from the readers
     */
    public static void main(String[] args) throws IOException {
        // The grammar reader is still constructed, although its result is not used here;
        // NOTE(review): whether the constructor has needed side effects is not visible from this file.
        new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");

        PrefixesHypotises hypotheses =
                new PrefixesHypotises("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());

        // Word cards are not needed for prefix research, so the callback is a no-op.
        WordProccessor ignoreWords = new WordProccessor() {
            public void proccess(WordCard wordCard) throws IOException {
            }
        };
        hypotheses.proccess(ignoreWords);
    }
}

View File

@ -16,4 +16,6 @@
тосклив тоскливый
лучший хороший
на на
тест тест тесто
тест тест тесто
спам спам
спама спам