adding test for all words, refactors test and dictionary reading
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@99 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
@ -34,16 +34,19 @@ public class DictionaryReader {
|
||||
private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
|
||||
private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
|
||||
private Set<String> ignoredForm = new HashSet<String>();
|
||||
private List<WordFilter> filters = new ArrayList<WordFilter>();
|
||||
|
||||
public DictionaryReader(String fileName, Set<String> ignoredForm) {
|
||||
public DictionaryReader(String fileName, Set<String> ignoredForm, List<WordFilter> filters) {
|
||||
this.fileName = fileName;
|
||||
this.ignoredForm = ignoredForm;
|
||||
this.filters = filters;
|
||||
}
|
||||
|
||||
public DictionaryReader(String fileName, String fileEncoding, Set<String> ignoredForm) {
|
||||
public DictionaryReader(String fileName, String fileEncoding, Set<String> ignoredForm, List<WordFilter> filters) {
|
||||
this.fileName = fileName;
|
||||
this.fileEncoding = fileEncoding;
|
||||
this.ignoredForm = ignoredForm;
|
||||
this.filters = filters;
|
||||
}
|
||||
|
||||
|
||||
@ -60,30 +63,46 @@ public class DictionaryReader {
|
||||
private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
|
||||
String s = reader.readLine();
|
||||
int count = Integer.valueOf(s);
|
||||
int actual = 0;
|
||||
for (int i = 0; i < count; i++) {
|
||||
s = reader.readLine();
|
||||
if (i % 10000 == 0) System.out.println("Proccess " + i + " wordBase of " + count);
|
||||
|
||||
String[] wd = s.split(" ");
|
||||
String wordBase = wd[0].toLowerCase();
|
||||
if (wordBase.startsWith("-")) continue;
|
||||
wordBase = "#".equals(wordBase) ? "" : wordBase;
|
||||
List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1]));
|
||||
FlexiaModel flexiaModel = models.get(0);
|
||||
if (models.size() > 0 && !ignoredForm.contains(flexiaModel.getCode())) {
|
||||
|
||||
WordCard card = new WordCard(flexiaModel.create(wordBase), wordBase, flexiaModel.getSuffix());
|
||||
for (FlexiaModel fm : models) {
|
||||
card.addFlexia(fm);
|
||||
}
|
||||
// if(card.getBase().equals("face") || card.getBase().equals("fac")){
|
||||
// System.out.println(models);
|
||||
// System.out.println(card);
|
||||
wordProccessor.process(card);
|
||||
//}
|
||||
WordCard card = buildForm(s);
|
||||
|
||||
for (WordFilter wf : filters) {
|
||||
if (card == null) break;
|
||||
card = wf.transform(card);
|
||||
}
|
||||
|
||||
if (card == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
wordProccessor.process(card);
|
||||
actual++;
|
||||
|
||||
}
|
||||
System.out.println("Finished word processing actual words " + actual);
|
||||
}
|
||||
|
||||
private WordCard buildForm(String s) {
|
||||
String[] wd = s.split(" ");
|
||||
String wordBase = wd[0].toLowerCase();
|
||||
if (wordBase.startsWith("-")) return null;
|
||||
wordBase = "#".equals(wordBase) ? "" : wordBase;
|
||||
List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1]));
|
||||
FlexiaModel flexiaModel = models.get(0);
|
||||
if (models.size() == 0 || ignoredForm.contains(flexiaModel.getCode())) {
|
||||
return null;
|
||||
}
|
||||
|
||||
WordCard card = new WordCard(flexiaModel.create(wordBase), wordBase, flexiaModel.getSuffix());
|
||||
|
||||
for (FlexiaModel fm : models) {
|
||||
card.addFlexia(fm);
|
||||
}
|
||||
return card;
|
||||
}
|
||||
|
||||
|
||||
@ -122,7 +141,7 @@ public class DictionaryReader {
|
||||
String[] fl = line.split("\\*");
|
||||
// we inored all forms thats
|
||||
if (fl.length == 3) {
|
||||
System.out.println(line);
|
||||
//System.out.println(line);
|
||||
// flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), cleanString(fl[2].toLowerCase())));
|
||||
}
|
||||
if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
|
||||
|
@ -66,4 +66,26 @@ public class FlexiaModel {
|
||||
", prefix='" + prefix + '\'' +
|
||||
'}';
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
|
||||
FlexiaModel that = (FlexiaModel) o;
|
||||
|
||||
if (code != null ? !code.equals(that.code) : that.code != null) return false;
|
||||
if (prefix != null ? !prefix.equals(that.prefix) : that.prefix != null) return false;
|
||||
if (suffix != null ? !suffix.equals(that.suffix) : that.suffix != null) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int result = code != null ? code.hashCode() : 0;
|
||||
result = 31 * result + (suffix != null ? suffix.hashCode() : 0);
|
||||
result = 31 * result + (prefix != null ? prefix.hashCode() : 0);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
@ -42,12 +42,8 @@ public class StatisticsCollector implements WordProccessor {
|
||||
public void process(WordCard wordCard) throws IOException {
|
||||
cleanWordCard(wordCard);
|
||||
String normalStringMorph = wordCard.getWordsForms().get(0).getCode();
|
||||
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
|
||||
if (word.contains("-")) return;
|
||||
if (!decoderEncoder.checkString(word)) return;
|
||||
|
||||
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
||||
if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) continue;
|
||||
Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph);
|
||||
String form = revertWord(fm.create(wordCard.getBase()));
|
||||
Set<Heuristic> suffixHeuristics = inverseIndex.get(form);
|
||||
@ -138,8 +134,8 @@ public class StatisticsCollector implements WordProccessor {
|
||||
Integer length = getCommonLength(form, normalForm);
|
||||
Integer actualSuffixLengh = form.length() - length;
|
||||
String actualNormalSuffix = normalForm.substring(length);
|
||||
Integer integer = grammaReader.getGrammInversIndex().get(fm.getCode().substring(0, 2));
|
||||
Integer nf = grammaReader.getGrammInversIndex().get(normalSuffixForm.substring(0, 2));
|
||||
Integer integer = grammaReader.getGrammInversIndex().get(fm.getCode());
|
||||
Integer nf = grammaReader.getGrammInversIndex().get(normalSuffixForm);
|
||||
return new Heuristic((byte) actualSuffixLengh.intValue(), actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue());
|
||||
}
|
||||
|
||||
|
@ -38,6 +38,10 @@ public class WordCard {
|
||||
wordsForms.add(flexiaModel);
|
||||
}
|
||||
|
||||
public void removeFlexia(FlexiaModel flexiaModel) {
|
||||
wordsForms.remove(flexiaModel);
|
||||
}
|
||||
|
||||
public String getCanonicalForm() {
|
||||
return canonicalForm;
|
||||
}
|
||||
|
@ -0,0 +1,50 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morphology.dictionary;
|
||||
|
||||
import org.apache.lucene.morphology.LetterDecoderEncoder;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class WordCleaner implements WordFilter {
|
||||
|
||||
private LetterDecoderEncoder decoderEncoder;
|
||||
|
||||
public WordCleaner(LetterDecoderEncoder decoderEncoder) {
|
||||
this.decoderEncoder = decoderEncoder;
|
||||
}
|
||||
|
||||
public WordCard transform(WordCard wordCard) {
|
||||
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
|
||||
|
||||
if (word.contains("-")) return null;
|
||||
if (!decoderEncoder.checkString(word)) return null;
|
||||
|
||||
List<FlexiaModel> flexiaModelsToRemove = new LinkedList<FlexiaModel>();
|
||||
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
||||
if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) {
|
||||
flexiaModelsToRemove.add(fm);
|
||||
}
|
||||
}
|
||||
for (FlexiaModel fm : flexiaModelsToRemove) {
|
||||
wordCard.removeFlexia(fm);
|
||||
}
|
||||
|
||||
return wordCard;
|
||||
}
|
||||
}
|
@ -0,0 +1,24 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.morphology.dictionary;
|
||||
|
||||
|
||||
public interface WordFilter {
|
||||
|
||||
public WordCard transform(WordCard wordCard);
|
||||
|
||||
}
|
@ -0,0 +1,49 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morphology.dictionary;
|
||||
|
||||
import org.apache.lucene.morphology.LetterDecoderEncoder;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class WordStringCleaner implements WordFilter {
|
||||
|
||||
private LetterDecoderEncoder decoderEncoder;
|
||||
|
||||
public WordStringCleaner(LetterDecoderEncoder decoderEncoder) {
|
||||
this.decoderEncoder = decoderEncoder;
|
||||
}
|
||||
|
||||
public WordCard transform(WordCard wordCard) {
|
||||
wordCard.setBase(cleanString(wordCard.getBase()));
|
||||
wordCard.setCanonicalForm(cleanString(wordCard.getCanonicalForm()));
|
||||
wordCard.setCanonicalSuffix(cleanString(wordCard.getCanonicalSuffix()));
|
||||
List<FlexiaModel> models = wordCard.getWordsForms();
|
||||
for (FlexiaModel m : models) {
|
||||
m.setSuffix(cleanString(m.getSuffix()));
|
||||
m.setPrefix(cleanString(m.getPrefix()));
|
||||
//made correct code
|
||||
m.setCode(m.getCode().substring(0, 2));
|
||||
}
|
||||
return wordCard;
|
||||
}
|
||||
|
||||
|
||||
private String cleanString(String s) {
|
||||
return decoderEncoder.cleanString(s);
|
||||
}
|
||||
}
|
@ -16,22 +16,24 @@
|
||||
|
||||
package org.apache.lucene.morphology.generator;
|
||||
|
||||
import org.apache.lucene.morphology.dictionary.DictionaryReader;
|
||||
import org.apache.lucene.morphology.dictionary.GrammaReader;
|
||||
import org.apache.lucene.morphology.dictionary.StatisticsCollector;
|
||||
import org.apache.lucene.morphology.english.EnglishLetterDecoderEncoder;
|
||||
import org.apache.lucene.morphology.EnglishLetterDecoderEncoder;
|
||||
import org.apache.lucene.morphology.dictionary.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class EnglishHeuristicBuilder {
|
||||
public static void main(String[] args) throws IOException {
|
||||
|
||||
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/egramtab.tab");
|
||||
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>());
|
||||
|
||||
EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
|
||||
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
|
||||
|
||||
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>(), filters);
|
||||
|
||||
StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
|
||||
dictionaryReader.proccess(statisticsCollector);
|
||||
statisticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info");
|
||||
|
@ -16,21 +16,23 @@
|
||||
|
||||
package org.apache.lucene.morphology.generator;
|
||||
|
||||
import org.apache.lucene.morphology.dictionary.DictionaryReader;
|
||||
import org.apache.lucene.morphology.dictionary.GrammaReader;
|
||||
import org.apache.lucene.morphology.dictionary.StatisticsCollector;
|
||||
import org.apache.lucene.morphology.dictionary.*;
|
||||
import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class RussianHeuristicBuilder {
|
||||
public static void main(String[] args) throws IOException {
|
||||
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
|
||||
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());
|
||||
|
||||
RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
|
||||
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
|
||||
|
||||
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>(), filters);
|
||||
|
||||
StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
|
||||
dictionaryReader.proccess(statisticsCollector);
|
||||
statisticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info");
|
||||
|
Reference in New Issue
Block a user