Adding a test for all words; refactors the tests and dictionary reading

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@99 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
Alexander.A.Kuznetsov
2010-10-08 11:35:13 +00:00
parent e8399999c3
commit 76e68a11e0
26 changed files with 730 additions and 48 deletions

View File

@ -34,16 +34,19 @@ public class DictionaryReader {
private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
private Set<String> ignoredForm = new HashSet<String>();
private List<WordFilter> filters = new ArrayList<WordFilter>();
public DictionaryReader(String fileName, Set<String> ignoredForm) {
public DictionaryReader(String fileName, Set<String> ignoredForm, List<WordFilter> filters) {
this.fileName = fileName;
this.ignoredForm = ignoredForm;
this.filters = filters;
}
public DictionaryReader(String fileName, String fileEncoding, Set<String> ignoredForm) {
public DictionaryReader(String fileName, String fileEncoding, Set<String> ignoredForm, List<WordFilter> filters) {
this.fileName = fileName;
this.fileEncoding = fileEncoding;
this.ignoredForm = ignoredForm;
this.filters = filters;
}
@ -60,30 +63,46 @@ public class DictionaryReader {
private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
String s = reader.readLine();
int count = Integer.valueOf(s);
int actual = 0;
for (int i = 0; i < count; i++) {
s = reader.readLine();
if (i % 10000 == 0) System.out.println("Proccess " + i + " wordBase of " + count);
String[] wd = s.split(" ");
String wordBase = wd[0].toLowerCase();
if (wordBase.startsWith("-")) continue;
wordBase = "#".equals(wordBase) ? "" : wordBase;
List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1]));
FlexiaModel flexiaModel = models.get(0);
if (models.size() > 0 && !ignoredForm.contains(flexiaModel.getCode())) {
WordCard card = new WordCard(flexiaModel.create(wordBase), wordBase, flexiaModel.getSuffix());
for (FlexiaModel fm : models) {
card.addFlexia(fm);
}
// if(card.getBase().equals("face") || card.getBase().equals("fac")){
// System.out.println(models);
// System.out.println(card);
wordProccessor.process(card);
//}
WordCard card = buildForm(s);
for (WordFilter wf : filters) {
if (card == null) break;
card = wf.transform(card);
}
if (card == null) {
continue;
}
wordProccessor.process(card);
actual++;
}
System.out.println("Finished word processing actual words " + actual);
}
/**
 * Builds a {@link WordCard} from one word line of the dictionary.
 * <p>
 * The line is split on single spaces: the first token is the word base
 * (lower-cased; "#" stands for an empty base) and the second token is the
 * index of the flexia model list inside {@code wordsFlexias}.
 *
 * @param s raw dictionary line
 * @return a card carrying every flexia model of the selected list, or
 *         {@code null} when the line has to be skipped: the base starts
 *         with "-", the selected model list is empty, or the code of its
 *         first model is contained in {@code ignoredForm}
 */
private WordCard buildForm(String s) {
    String[] wd = s.split(" ");
    String wordBase = wd[0].toLowerCase();
    if (wordBase.startsWith("-")) return null;
    wordBase = "#".equals(wordBase) ? "" : wordBase;
    List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1]));
    // The emptiness check must come before get(0): reading the first element
    // of an empty list throws IndexOutOfBoundsException instead of skipping the word.
    if (models.isEmpty()) {
        return null;
    }
    FlexiaModel flexiaModel = models.get(0);
    if (ignoredForm.contains(flexiaModel.getCode())) {
        return null;
    }
    WordCard card = new WordCard(flexiaModel.create(wordBase), wordBase, flexiaModel.getSuffix());
    for (FlexiaModel fm : models) {
        card.addFlexia(fm);
    }
    return card;
}
@ -122,7 +141,7 @@ public class DictionaryReader {
String[] fl = line.split("\\*");
// we ignore all forms that split into three parts
if (fl.length == 3) {
System.out.println(line);
//System.out.println(line);
// flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), cleanString(fl[2].toLowerCase())));
}
if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));

View File

@ -66,4 +66,26 @@ public class FlexiaModel {
", prefix='" + prefix + '\'' +
'}';
}
/**
 * Compares this model with another object.
 * <p>
 * Two models are equal when they have exactly the same class and their
 * code, prefix and suffix are pairwise equal; a {@code null} field only
 * matches another {@code null} field.
 *
 * @param o object to compare with, may be {@code null}
 * @return {@code true} if {@code o} is an equal model
 */
@Override
public boolean equals(Object o) {
    if (o == this) {
        return true;
    }
    if (o == null || o.getClass() != getClass()) {
        return false;
    }
    FlexiaModel other = (FlexiaModel) o;
    // Fields are compared in the order code, prefix, suffix; evaluation stops at the first mismatch.
    return (code == null ? other.code == null : code.equals(other.code))
            && (prefix == null ? other.prefix == null : prefix.equals(other.prefix))
            && (suffix == null ? other.suffix == null : suffix.equals(other.suffix));
}
/**
 * Computes a hash from code, suffix and prefix (in that order) with the
 * multiplier 31; a {@code null} field contributes 0.
 *
 * @return hash code built from the three fields
 */
@Override
public int hashCode() {
    int codeHash = (code == null) ? 0 : code.hashCode();
    int suffixHash = (suffix == null) ? 0 : suffix.hashCode();
    int prefixHash = (prefix == null) ? 0 : prefix.hashCode();
    return 31 * (31 * codeHash + suffixHash) + prefixHash;
}
}

View File

@ -42,12 +42,8 @@ public class StatisticsCollector implements WordProccessor {
public void process(WordCard wordCard) throws IOException {
cleanWordCard(wordCard);
String normalStringMorph = wordCard.getWordsForms().get(0).getCode();
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
if (word.contains("-")) return;
if (!decoderEncoder.checkString(word)) return;
for (FlexiaModel fm : wordCard.getWordsForms()) {
if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) continue;
Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph);
String form = revertWord(fm.create(wordCard.getBase()));
Set<Heuristic> suffixHeuristics = inverseIndex.get(form);
@ -138,8 +134,8 @@ public class StatisticsCollector implements WordProccessor {
Integer length = getCommonLength(form, normalForm);
Integer actualSuffixLengh = form.length() - length;
String actualNormalSuffix = normalForm.substring(length);
Integer integer = grammaReader.getGrammInversIndex().get(fm.getCode().substring(0, 2));
Integer nf = grammaReader.getGrammInversIndex().get(normalSuffixForm.substring(0, 2));
Integer integer = grammaReader.getGrammInversIndex().get(fm.getCode());
Integer nf = grammaReader.getGrammInversIndex().get(normalSuffixForm);
return new Heuristic((byte) actualSuffixLengh.intValue(), actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue());
}

View File

@ -38,6 +38,10 @@ public class WordCard {
wordsForms.add(flexiaModel);
}
/**
 * Removes the given flexia model from this card's word forms by delegating
 * to {@code wordsForms.remove(flexiaModel)}.
 * NOTE(review): presumably {@code wordsForms} is a {@code java.util.List}, in
 * which case only the first element equal to the argument is removed - confirm.
 *
 * @param flexiaModel the model to remove
 */
public void removeFlexia(FlexiaModel flexiaModel) {
wordsForms.remove(flexiaModel);
}
public String getCanonicalForm() {
return canonicalForm;
}

View File

@ -0,0 +1,50 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.dictionary;
import org.apache.lucene.morphology.LetterDecoderEncoder;
import java.util.LinkedList;
import java.util.List;
/**
 * {@link WordFilter} that drops words and word forms the encoder cannot handle.
 * <p>
 * A card is rejected entirely (the filter returns {@code null}) when its base
 * joined with its canonical suffix contains "-" or fails
 * {@code decoderEncoder.checkString}. Otherwise every flexia model whose
 * generated form fails the same check or contains "-" is removed from the card.
 */
public class WordCleaner implements WordFilter {

    private final LetterDecoderEncoder decoderEncoder;

    /**
     * @param decoderEncoder encoder used to validate the generated strings
     */
    public WordCleaner(LetterDecoderEncoder decoderEncoder) {
        this.decoderEncoder = decoderEncoder;
    }

    /**
     * Validates the card and strips the forms that do not pass validation.
     *
     * @param wordCard card to check; it is modified in place
     * @return the same card with the invalid forms removed, or {@code null}
     *         when the word itself is not acceptable
     */
    public WordCard transform(WordCard wordCard) {
        String canonical = wordCard.getBase() + wordCard.getCanonicalSuffix();
        if (canonical.contains("-") || !decoderEncoder.checkString(canonical)) {
            return null;
        }

        // Collect first and remove afterwards, so the form list is not
        // modified while it is being iterated.
        List<FlexiaModel> rejected = new LinkedList<FlexiaModel>();
        for (FlexiaModel candidate : wordCard.getWordsForms()) {
            if (isUnusable(candidate.create(wordCard.getBase()))) {
                rejected.add(candidate);
            }
        }
        for (FlexiaModel candidate : rejected) {
            wordCard.removeFlexia(candidate);
        }
        return wordCard;
    }

    /** Tells whether a generated form fails the encoder check or contains a hyphen. */
    private boolean isUnusable(String form) {
        return !decoderEncoder.checkString(form) || form.contains("-");
    }
}

View File

@ -0,0 +1,24 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.dictionary;
/**
 * A processing step applied to a {@link WordCard}.
 */
public interface WordFilter {

    /**
     * Transforms the given card.
     *
     * @param wordCard card to transform
     * @return the card to continue with, or {@code null} to signal that the
     *         word should be dropped
     */
    WordCard transform(WordCard wordCard);
}

View File

@ -0,0 +1,49 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.dictionary;
import org.apache.lucene.morphology.LetterDecoderEncoder;
import java.util.List;
/**
 * {@link WordFilter} that normalises every string of a card through
 * {@code decoderEncoder.cleanString}: the base, the canonical form, the
 * canonical suffix and the suffix and prefix of each flexia model. The code
 * of each flexia model is shortened to its first two characters.
 * <p>
 * The card and its flexia models are modified in place.
 */
public class WordStringCleaner implements WordFilter {

    private final LetterDecoderEncoder decoderEncoder;

    /**
     * @param decoderEncoder encoder whose {@code cleanString} is applied to every string
     */
    public WordStringCleaner(LetterDecoderEncoder decoderEncoder) {
        this.decoderEncoder = decoderEncoder;
    }

    /**
     * Cleans all strings of the card and truncates the model codes.
     *
     * @param wordCard card to clean
     * @return the same card instance after cleaning
     */
    public WordCard transform(WordCard wordCard) {
        String cleanedBase = decoderEncoder.cleanString(wordCard.getBase());
        wordCard.setBase(cleanedBase);

        String cleanedCanonicalForm = decoderEncoder.cleanString(wordCard.getCanonicalForm());
        wordCard.setCanonicalForm(cleanedCanonicalForm);

        String cleanedCanonicalSuffix = decoderEncoder.cleanString(wordCard.getCanonicalSuffix());
        wordCard.setCanonicalSuffix(cleanedCanonicalSuffix);

        for (FlexiaModel form : wordCard.getWordsForms()) {
            form.setSuffix(decoderEncoder.cleanString(form.getSuffix()));
            form.setPrefix(decoderEncoder.cleanString(form.getPrefix()));
            // keep only the first two characters of the code
            String fullCode = form.getCode();
            form.setCode(fullCode.substring(0, 2));
        }
        return wordCard;
    }
}

View File

@ -16,22 +16,24 @@
package org.apache.lucene.morphology.generator;
import org.apache.lucene.morphology.dictionary.DictionaryReader;
import org.apache.lucene.morphology.dictionary.GrammaReader;
import org.apache.lucene.morphology.dictionary.StatisticsCollector;
import org.apache.lucene.morphology.english.EnglishLetterDecoderEncoder;
import org.apache.lucene.morphology.EnglishLetterDecoderEncoder;
import org.apache.lucene.morphology.dictionary.*;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
public class EnglishHeuristicBuilder {
public static void main(String[] args) throws IOException {
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/egramtab.tab");
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>());
EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>(), filters);
StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
dictionaryReader.proccess(statisticsCollector);
statisticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info");

View File

@ -16,21 +16,23 @@
package org.apache.lucene.morphology.generator;
import org.apache.lucene.morphology.dictionary.DictionaryReader;
import org.apache.lucene.morphology.dictionary.GrammaReader;
import org.apache.lucene.morphology.dictionary.StatisticsCollector;
import org.apache.lucene.morphology.dictionary.*;
import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
public class RussianHeuristicBuilder {
public static void main(String[] args) throws IOException {
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());
RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>(), filters);
StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
dictionaryReader.proccess(statisticsCollector);
statisticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info");