Add a test covering all words; refactor the tests and the dictionary reading
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@99 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
parent
e8399999c3
commit
76e68a11e0
@ -1,5 +1,6 @@
|
||||
<?xml version="1.0"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<parent>
|
||||
<artifactId>morphology</artifactId>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
@ -26,4 +27,6 @@
|
||||
<version>0.9-SNAPSHOT</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
|
||||
</project>
|
||||
|
@ -34,16 +34,19 @@ public class DictionaryReader {
|
||||
private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
|
||||
private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
|
||||
private Set<String> ignoredForm = new HashSet<String>();
|
||||
private List<WordFilter> filters = new ArrayList<WordFilter>();
|
||||
|
||||
public DictionaryReader(String fileName, Set<String> ignoredForm) {
|
||||
public DictionaryReader(String fileName, Set<String> ignoredForm, List<WordFilter> filters) {
|
||||
this.fileName = fileName;
|
||||
this.ignoredForm = ignoredForm;
|
||||
this.filters = filters;
|
||||
}
|
||||
|
||||
public DictionaryReader(String fileName, String fileEncoding, Set<String> ignoredForm) {
|
||||
public DictionaryReader(String fileName, String fileEncoding, Set<String> ignoredForm, List<WordFilter> filters) {
|
||||
this.fileName = fileName;
|
||||
this.fileEncoding = fileEncoding;
|
||||
this.ignoredForm = ignoredForm;
|
||||
this.filters = filters;
|
||||
}
|
||||
|
||||
|
||||
@ -60,30 +63,46 @@ public class DictionaryReader {
|
||||
private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
|
||||
String s = reader.readLine();
|
||||
int count = Integer.valueOf(s);
|
||||
int actual = 0;
|
||||
for (int i = 0; i < count; i++) {
|
||||
s = reader.readLine();
|
||||
if (i % 10000 == 0) System.out.println("Proccess " + i + " wordBase of " + count);
|
||||
|
||||
WordCard card = buildForm(s);
|
||||
|
||||
for (WordFilter wf : filters) {
|
||||
if (card == null) break;
|
||||
card = wf.transform(card);
|
||||
}
|
||||
|
||||
if (card == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
wordProccessor.process(card);
|
||||
actual++;
|
||||
|
||||
}
|
||||
System.out.println("Finished word processing actual words " + actual);
|
||||
}
|
||||
|
||||
private WordCard buildForm(String s) {
|
||||
String[] wd = s.split(" ");
|
||||
String wordBase = wd[0].toLowerCase();
|
||||
if (wordBase.startsWith("-")) continue;
|
||||
if (wordBase.startsWith("-")) return null;
|
||||
wordBase = "#".equals(wordBase) ? "" : wordBase;
|
||||
List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1]));
|
||||
FlexiaModel flexiaModel = models.get(0);
|
||||
if (models.size() > 0 && !ignoredForm.contains(flexiaModel.getCode())) {
|
||||
if (models.size() == 0 || ignoredForm.contains(flexiaModel.getCode())) {
|
||||
return null;
|
||||
}
|
||||
|
||||
WordCard card = new WordCard(flexiaModel.create(wordBase), wordBase, flexiaModel.getSuffix());
|
||||
|
||||
for (FlexiaModel fm : models) {
|
||||
card.addFlexia(fm);
|
||||
}
|
||||
// if(card.getBase().equals("face") || card.getBase().equals("fac")){
|
||||
// System.out.println(models);
|
||||
// System.out.println(card);
|
||||
wordProccessor.process(card);
|
||||
//}
|
||||
|
||||
}
|
||||
}
|
||||
return card;
|
||||
}
|
||||
|
||||
|
||||
@ -122,7 +141,7 @@ public class DictionaryReader {
|
||||
String[] fl = line.split("\\*");
|
||||
// we inored all forms thats
|
||||
if (fl.length == 3) {
|
||||
System.out.println(line);
|
||||
//System.out.println(line);
|
||||
// flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), cleanString(fl[2].toLowerCase())));
|
||||
}
|
||||
if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
|
||||
|
@ -66,4 +66,26 @@ public class FlexiaModel {
|
||||
", prefix='" + prefix + '\'' +
|
||||
'}';
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
|
||||
FlexiaModel that = (FlexiaModel) o;
|
||||
|
||||
if (code != null ? !code.equals(that.code) : that.code != null) return false;
|
||||
if (prefix != null ? !prefix.equals(that.prefix) : that.prefix != null) return false;
|
||||
if (suffix != null ? !suffix.equals(that.suffix) : that.suffix != null) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int result = code != null ? code.hashCode() : 0;
|
||||
result = 31 * result + (suffix != null ? suffix.hashCode() : 0);
|
||||
result = 31 * result + (prefix != null ? prefix.hashCode() : 0);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
@ -42,12 +42,8 @@ public class StatisticsCollector implements WordProccessor {
|
||||
public void process(WordCard wordCard) throws IOException {
|
||||
cleanWordCard(wordCard);
|
||||
String normalStringMorph = wordCard.getWordsForms().get(0).getCode();
|
||||
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
|
||||
if (word.contains("-")) return;
|
||||
if (!decoderEncoder.checkString(word)) return;
|
||||
|
||||
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
||||
if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) continue;
|
||||
Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph);
|
||||
String form = revertWord(fm.create(wordCard.getBase()));
|
||||
Set<Heuristic> suffixHeuristics = inverseIndex.get(form);
|
||||
@ -138,8 +134,8 @@ public class StatisticsCollector implements WordProccessor {
|
||||
Integer length = getCommonLength(form, normalForm);
|
||||
Integer actualSuffixLengh = form.length() - length;
|
||||
String actualNormalSuffix = normalForm.substring(length);
|
||||
Integer integer = grammaReader.getGrammInversIndex().get(fm.getCode().substring(0, 2));
|
||||
Integer nf = grammaReader.getGrammInversIndex().get(normalSuffixForm.substring(0, 2));
|
||||
Integer integer = grammaReader.getGrammInversIndex().get(fm.getCode());
|
||||
Integer nf = grammaReader.getGrammInversIndex().get(normalSuffixForm);
|
||||
return new Heuristic((byte) actualSuffixLengh.intValue(), actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue());
|
||||
}
|
||||
|
||||
|
@ -38,6 +38,10 @@ public class WordCard {
|
||||
wordsForms.add(flexiaModel);
|
||||
}
|
||||
|
||||
public void removeFlexia(FlexiaModel flexiaModel) {
|
||||
wordsForms.remove(flexiaModel);
|
||||
}
|
||||
|
||||
public String getCanonicalForm() {
|
||||
return canonicalForm;
|
||||
}
|
||||
|
@ -0,0 +1,50 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morphology.dictionary;
|
||||
|
||||
import org.apache.lucene.morphology.LetterDecoderEncoder;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class WordCleaner implements WordFilter {
|
||||
|
||||
private LetterDecoderEncoder decoderEncoder;
|
||||
|
||||
public WordCleaner(LetterDecoderEncoder decoderEncoder) {
|
||||
this.decoderEncoder = decoderEncoder;
|
||||
}
|
||||
|
||||
public WordCard transform(WordCard wordCard) {
|
||||
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
|
||||
|
||||
if (word.contains("-")) return null;
|
||||
if (!decoderEncoder.checkString(word)) return null;
|
||||
|
||||
List<FlexiaModel> flexiaModelsToRemove = new LinkedList<FlexiaModel>();
|
||||
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
||||
if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) {
|
||||
flexiaModelsToRemove.add(fm);
|
||||
}
|
||||
}
|
||||
for (FlexiaModel fm : flexiaModelsToRemove) {
|
||||
wordCard.removeFlexia(fm);
|
||||
}
|
||||
|
||||
return wordCard;
|
||||
}
|
||||
}
|
@ -0,0 +1,24 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.morphology.dictionary;
|
||||
|
||||
|
||||
public interface WordFilter {
|
||||
|
||||
public WordCard transform(WordCard wordCard);
|
||||
|
||||
}
|
@ -0,0 +1,49 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morphology.dictionary;
|
||||
|
||||
import org.apache.lucene.morphology.LetterDecoderEncoder;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class WordStringCleaner implements WordFilter {
|
||||
|
||||
private LetterDecoderEncoder decoderEncoder;
|
||||
|
||||
public WordStringCleaner(LetterDecoderEncoder decoderEncoder) {
|
||||
this.decoderEncoder = decoderEncoder;
|
||||
}
|
||||
|
||||
public WordCard transform(WordCard wordCard) {
|
||||
wordCard.setBase(cleanString(wordCard.getBase()));
|
||||
wordCard.setCanonicalForm(cleanString(wordCard.getCanonicalForm()));
|
||||
wordCard.setCanonicalSuffix(cleanString(wordCard.getCanonicalSuffix()));
|
||||
List<FlexiaModel> models = wordCard.getWordsForms();
|
||||
for (FlexiaModel m : models) {
|
||||
m.setSuffix(cleanString(m.getSuffix()));
|
||||
m.setPrefix(cleanString(m.getPrefix()));
|
||||
//made correct code
|
||||
m.setCode(m.getCode().substring(0, 2));
|
||||
}
|
||||
return wordCard;
|
||||
}
|
||||
|
||||
|
||||
private String cleanString(String s) {
|
||||
return decoderEncoder.cleanString(s);
|
||||
}
|
||||
}
|
@ -16,22 +16,24 @@
|
||||
|
||||
package org.apache.lucene.morphology.generator;
|
||||
|
||||
import org.apache.lucene.morphology.dictionary.DictionaryReader;
|
||||
import org.apache.lucene.morphology.dictionary.GrammaReader;
|
||||
import org.apache.lucene.morphology.dictionary.StatisticsCollector;
|
||||
import org.apache.lucene.morphology.english.EnglishLetterDecoderEncoder;
|
||||
import org.apache.lucene.morphology.EnglishLetterDecoderEncoder;
|
||||
import org.apache.lucene.morphology.dictionary.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class EnglishHeuristicBuilder {
|
||||
public static void main(String[] args) throws IOException {
|
||||
|
||||
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/egramtab.tab");
|
||||
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>());
|
||||
|
||||
EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
|
||||
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
|
||||
|
||||
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>(), filters);
|
||||
|
||||
StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
|
||||
dictionaryReader.proccess(statisticsCollector);
|
||||
statisticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info");
|
||||
|
@ -16,21 +16,23 @@
|
||||
|
||||
package org.apache.lucene.morphology.generator;
|
||||
|
||||
import org.apache.lucene.morphology.dictionary.DictionaryReader;
|
||||
import org.apache.lucene.morphology.dictionary.GrammaReader;
|
||||
import org.apache.lucene.morphology.dictionary.StatisticsCollector;
|
||||
import org.apache.lucene.morphology.dictionary.*;
|
||||
import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class RussianHeuristicBuilder {
|
||||
public static void main(String[] args) throws IOException {
|
||||
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
|
||||
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());
|
||||
|
||||
RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
|
||||
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
|
||||
|
||||
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>(), filters);
|
||||
|
||||
StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
|
||||
dictionaryReader.proccess(statisticsCollector);
|
||||
statisticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info");
|
||||
|
@ -0,0 +1,144 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene;
|
||||
|
||||
import org.apache.lucene.morphology.*;
|
||||
import org.apache.lucene.morphology.dictionary.*;
|
||||
import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder;
|
||||
import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
|
||||
import org.apache.lucene.morphology.russian.RussianMorphology;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
import static org.hamcrest.Matchers.hasItem;
|
||||
import static org.junit.Assert.assertThat;
|
||||
|
||||
|
||||
public class TestAllWords {
|
||||
|
||||
String prefix = "";
|
||||
|
||||
@Before
|
||||
public void setUp() {
|
||||
System.out.println(System.getProperty("user.dir"));
|
||||
prefix = System.getProperty("user.dir").endsWith("dictionary-reader") ? "../" : "";
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldEnglishMorphologyIncludeAllWordsFormsWithMorphInfo() throws IOException {
|
||||
final Morphology morphology = new EnglishMorphology();
|
||||
LetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
|
||||
String pathToGramma = prefix + "dictonary/Dicts/Morph/egramtab.tab";
|
||||
String pathToDict = prefix + "dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd";
|
||||
|
||||
testFullGramma(morphology, decoderEncoder, pathToGramma, pathToDict);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldRussianMorphologyIncludeAllWordsFormsWithMorphInfo() throws IOException {
|
||||
final Morphology morphology = new RussianMorphology();
|
||||
LetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
|
||||
String pathToGramma = prefix + "dictonary/Dicts/Morph/rgramtab.tab";
|
||||
String pathToDict = prefix + "dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd";
|
||||
|
||||
testFullGramma(morphology, decoderEncoder, pathToGramma, pathToDict);
|
||||
}
|
||||
|
||||
private void testFullGramma(final Morphology morphology, LetterDecoderEncoder decoderEncoder, String pathToGramma, String pathToDict) throws IOException {
|
||||
GrammaReader grammaInfo = new GrammaReader(pathToGramma);
|
||||
final List<String> morphInfo = grammaInfo.getGrammaInfo();
|
||||
final Map<String, Integer> inversIndex = grammaInfo.getGrammInversIndex();
|
||||
|
||||
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
|
||||
|
||||
|
||||
DictionaryReader dictionaryReader = new DictionaryReader(pathToDict, new HashSet<String>(), filters);
|
||||
|
||||
final AtomicLong wordCount = new AtomicLong(0);
|
||||
Long startTime = System.currentTimeMillis();
|
||||
|
||||
dictionaryReader.proccess(new WordProccessor() {
|
||||
public void process(WordCard wordCard) throws IOException {
|
||||
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
|
||||
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
||||
String wordForm = wordCard.getBase() + fm.getSuffix();
|
||||
String morph = morphInfo.get(inversIndex.get(fm.getCode()));
|
||||
assertThat(morphology.getMorphInfo(wordForm), hasItem(word + "|" + morph));
|
||||
assertThat(morphology.getNormalForms(wordForm), hasItem(word));
|
||||
wordCount.set(2L + wordCount.get());
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
long time = System.currentTimeMillis() - startTime;
|
||||
System.out.println("Done " + wordCount.get() + " in " + time + " ms. " + wordCount.get() / (time / 1000L) + " word per second");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldEnglishLuceneMorphologyIncludeAllWords() throws IOException {
|
||||
final LuceneMorphology morphology = new EnglishLuceneMorphology();
|
||||
|
||||
LetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
|
||||
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
|
||||
String pathToDic = prefix + "dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd";
|
||||
|
||||
testAllWordForLucene(morphology, filters, pathToDic);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldIncludeAllWordsRussianInLuceneMorophology() throws IOException {
|
||||
final LuceneMorphology morphology = new RussianLuceneMorphology();
|
||||
|
||||
LetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
|
||||
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
|
||||
|
||||
String pathToDic = prefix + "dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd";
|
||||
|
||||
testAllWordForLucene(morphology, filters, pathToDic);
|
||||
|
||||
}
|
||||
|
||||
private void testAllWordForLucene(final LuceneMorphology morphology, List<WordFilter> filters, String pathToDic) throws IOException {
|
||||
final AtomicLong wordCount = new AtomicLong(0);
|
||||
Long startTime = System.currentTimeMillis();
|
||||
|
||||
DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet<String>(), filters);
|
||||
dictionaryReader.proccess(new WordProccessor() {
|
||||
public void process(WordCard wordCard) throws IOException {
|
||||
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
|
||||
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
||||
String wordForm = wordCard.getBase() + fm.getSuffix();
|
||||
assertThat(morphology.getNormalForms(wordForm), hasItem(word));
|
||||
wordCount.set(1L + wordCount.get());
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
long time = System.currentTimeMillis() - startTime;
|
||||
System.out.println("Done " + wordCount.get() + " in " + time + " ms. " + wordCount.get() / (time / 1000L) + " word per second");
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -0,0 +1,77 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morphology;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.morphology.russian.RussianAnalyzer;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
|
||||
import static org.hamcrest.Matchers.equalTo;
|
||||
import static org.junit.Assert.assertThat;
|
||||
|
||||
|
||||
public class AnalayzersTest {
|
||||
|
||||
@Test
|
||||
public void englishAnalyzerShouldGiveCorrectWords() throws IOException {
|
||||
Analyzer morphlogyAnalyzer = new EnglishAnalyzer();
|
||||
String answerPath = "/english/englsih-analayzer-answer.txt";
|
||||
String testPath = "/english/englsih-analayzer-data.txt";
|
||||
|
||||
testAnalayzer(morphlogyAnalyzer, answerPath, testPath);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shoudGiveCorretWords() throws IOException {
|
||||
Analyzer morphlogyAnalyzer = new RussianAnalyzer();
|
||||
String answerPath = "/russian/russian-analayzer-answer.txt";
|
||||
String testPath = "/russian/russian-analayzer-data.txt";
|
||||
|
||||
testAnalayzer(morphlogyAnalyzer, answerPath, testPath);
|
||||
}
|
||||
|
||||
private void testAnalayzer(Analyzer morphlogyAnalyzer, String answerPath, String testPath) throws IOException {
|
||||
InputStream stream = this.getClass().getResourceAsStream(answerPath);
|
||||
BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||
String[] strings = breader.readLine().replaceAll(" +", " ").trim().split(" ");
|
||||
HashSet<String> answer = new HashSet<String>(Arrays.asList(strings));
|
||||
stream.close();
|
||||
|
||||
stream = this.getClass().getResourceAsStream(testPath);
|
||||
|
||||
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
|
||||
|
||||
TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader);
|
||||
HashSet<String> result = new HashSet<String>();
|
||||
while (tokenStream.incrementToken()) {
|
||||
TermAttribute attribute1 = tokenStream.getAttribute(TermAttribute.class);
|
||||
result.add(attribute1.term());
|
||||
}
|
||||
|
||||
stream.close();
|
||||
|
||||
assertThat(result, equalTo(answer));
|
||||
}
|
||||
}
|
@ -0,0 +1,62 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morphology;
|
||||
|
||||
import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import static org.hamcrest.CoreMatchers.equalTo;
|
||||
import static org.junit.Assert.assertThat;
|
||||
|
||||
|
||||
public class LuceneMorphTest {
|
||||
|
||||
@Test
|
||||
public void englishMorphologyShouldGetCorrectNormalForm() throws IOException {
|
||||
LuceneMorphology luceneMorph = new EnglishLuceneMorphology();
|
||||
String pathToTestData = "/english/english-morphology-test.txt";
|
||||
testMorphology(luceneMorph, pathToTestData);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void russianMorphologyShouldGetCorrectNormalForm() throws IOException {
|
||||
LuceneMorphology luceneMorph = new RussianLuceneMorphology();
|
||||
String pathToTestData = "/russian/russian-morphology-test.txt";
|
||||
testMorphology(luceneMorph, pathToTestData);
|
||||
}
|
||||
|
||||
private void testMorphology(LuceneMorphology luceneMorph, String pathToTestData) throws IOException {
|
||||
InputStream stream = this.getClass().getResourceAsStream(pathToTestData);
|
||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||
String s = bufferedReader.readLine();
|
||||
while (s != null) {
|
||||
String[] qa = s.trim().split(" ");
|
||||
Set<String> result = new HashSet<String>();
|
||||
result.addAll(Arrays.asList(qa).subList(1, qa.length));
|
||||
Set<String> stringList = new HashSet<String>(luceneMorph.getNormalForms(qa[0]));
|
||||
assertThat(stringList, equalTo(result));
|
||||
s = bufferedReader.readLine();
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,8 @@
|
||||
purchases purchase
|
||||
existing exist
|
||||
was be
|
||||
men man
|
||||
bore bore bear
|
||||
grown grow grown
|
||||
came come
|
||||
md md
|
@ -0,0 +1 @@
|
||||
following follow the instruction exactly will be help ensure the best well good result
|
@ -0,0 +1 @@
|
||||
Following the instructions exactly will help ensure the best results
|
@ -0,0 +1 @@
|
||||
в результат крушение погибнуть командир отряд специальный назначение пря при переть гувд ростовский область полковник милиция михаил перов и предприниматель
|
@ -0,0 +1 @@
|
||||
В результате крушения погибли командир отряда специального назначения при ГУВД Ростовской области полковник милиции Михаил Перов и предприниматель
|
@ -0,0 +1,19 @@
|
||||
еду еда ехать
|
||||
тестов тест
|
||||
вина вино вина
|
||||
вино вино
|
||||
ехать ехать
|
||||
ананасов ананас ананасовый
|
||||
сухой сухой
|
||||
дураков дурак
|
||||
пушка пушка пушок
|
||||
пушок пушок
|
||||
пушек пушка
|
||||
козлов козлов козловый козел
|
||||
жуков жуков жук
|
||||
красив красить красивый
|
||||
красивая красивый
|
||||
тосклив тоскливый
|
||||
лучший хороший
|
||||
на на
|
||||
тест тест тесто
|
@ -1,5 +1,6 @@
|
||||
<?xml version="1.0"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<parent>
|
||||
<artifactId>morphology</artifactId>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
@ -12,6 +13,7 @@
|
||||
<version>0.9-SNAPSHOT</version>
|
||||
<url>http://maven.apache.org</url>
|
||||
<dependencies>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<artifactId>morph</artifactId>
|
||||
|
@ -0,0 +1,29 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morphology;
|
||||
|
||||
import org.apache.lucene.morphology.analyzer.MorphologyAnalyzer;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
|
||||
public class EnglishAnalyzer extends MorphologyAnalyzer {
|
||||
|
||||
public EnglishAnalyzer() throws IOException {
|
||||
super(new EnglishLuceneMorphology());
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,111 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morphology;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
|
||||
public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder {
|
||||
public static final int ENGLISH_SMALL_LETTER_OFFSET = 96;
|
||||
static public int SUFFIX_LENGTH = 6;
|
||||
public static final int DASH_CHAR = 45;
|
||||
public static final int DASH_CODE = 27;
|
||||
|
||||
public Integer encode(String string) {
|
||||
if (string.length() > 6) throw new SuffixToLongException("Suffix length should not be greater then " + 12);
|
||||
int result = 0;
|
||||
for (int i = 0; i < string.length(); i++) {
|
||||
int c = 0 + string.charAt(i) - ENGLISH_SMALL_LETTER_OFFSET;
|
||||
if (c == 45 - ENGLISH_SMALL_LETTER_OFFSET) {
|
||||
c = DASH_CODE;
|
||||
}
|
||||
if (c < 0 || c > 27)
|
||||
throw new WrongCharaterException("Symblo " + string.charAt(i) + " is not small cirillic letter");
|
||||
result = result * 28 + c;
|
||||
}
|
||||
for (int i = string.length(); i < 6; i++) {
|
||||
result *= 28;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public int[] encodeToArray(String s) {
|
||||
|
||||
ArrayList<Integer> integers = new ArrayList<Integer>();
|
||||
while (s.length() > 6) {
|
||||
integers.add(encode(s.substring(0, 6)));
|
||||
s = s.substring(6);
|
||||
}
|
||||
integers.add(encode(s));
|
||||
int[] ints = new int[integers.size()];
|
||||
int pos = 0;
|
||||
for (Integer i : integers) {
|
||||
ints[pos] = i;
|
||||
pos++;
|
||||
}
|
||||
return ints;
|
||||
}
|
||||
|
||||
public String decodeArray(int[] array) {
|
||||
String result = "";
|
||||
for (int i : array) {
|
||||
result += decode(i);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
public String decode(Integer suffixN) {
|
||||
String result = "";
|
||||
while (suffixN > 27) {
|
||||
int c = suffixN % 28 + ENGLISH_SMALL_LETTER_OFFSET;
|
||||
if (c == ENGLISH_SMALL_LETTER_OFFSET) {
|
||||
suffixN /= 28;
|
||||
continue;
|
||||
}
|
||||
if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR;
|
||||
result = (char) c + result;
|
||||
suffixN /= 28;
|
||||
}
|
||||
long c = suffixN + ENGLISH_SMALL_LETTER_OFFSET;
|
||||
if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR;
|
||||
result = (char) c + result;
|
||||
return result;
|
||||
}
|
||||
|
||||
public boolean checkCharacter(char c) {
|
||||
int code = 0 + c;
|
||||
if (code == 45) return true;
|
||||
code -= ENGLISH_SMALL_LETTER_OFFSET;
|
||||
if (code > 0 && code < 27) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
public boolean checkString(String word) {
|
||||
for (int i = 0; i < word.length(); i++) {
|
||||
if (!checkCharacter(word.charAt(i))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public String cleanString(String s) {
|
||||
return s;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,26 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morphology;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
|
||||
public class EnglishLuceneMorphology extends LuceneMorphology {
|
||||
|
||||
public EnglishLuceneMorphology() throws IOException {
|
||||
super(EnglishLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder());
|
||||
}
|
||||
}
|
@ -0,0 +1,26 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morphology;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
|
||||
public class EnglishMorphology extends MorphologyImpl {
|
||||
|
||||
public EnglishMorphology() throws IOException {
|
||||
super(EnglishLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder());
|
||||
}
|
||||
}
|
@ -1,5 +1,6 @@
|
||||
<?xml version="1.0"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<parent>
|
||||
<artifactId>morphology</artifactId>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
@ -13,6 +14,7 @@
|
||||
<url>http://maven.apache.org</url>
|
||||
<dependencies>
|
||||
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<artifactId>morph</artifactId>
|
||||
|
@ -17,8 +17,6 @@ package org.apache.lucene.morphology.russian;
|
||||
|
||||
import org.apache.lucene.morphology.SuffixToLongException;
|
||||
import org.apache.lucene.morphology.WrongCharaterException;
|
||||
import static org.hamcrest.core.IsEqual.equalTo;
|
||||
import static org.junit.Assert.assertThat;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
@ -27,6 +25,9 @@ import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
|
||||
import static org.hamcrest.core.IsEqual.equalTo;
|
||||
import static org.junit.Assert.assertThat;
|
||||
|
||||
public class RussianLetterDecoderEncoderTest {
|
||||
private RussianLetterDecoderEncoder decoderEncoder;
|
||||
|
||||
@ -37,7 +38,7 @@ public class RussianLetterDecoderEncoderTest {
|
||||
|
||||
|
||||
@Test
|
||||
public void testShouldPreserStringComporision() throws IOException {
|
||||
public void testShouldPreserverStringComporision() throws IOException {
|
||||
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-monotonic.txt");
|
||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||
String s = bufferedReader.readLine();
|
||||
@ -52,22 +53,22 @@ public class RussianLetterDecoderEncoderTest {
|
||||
|
||||
|
||||
@Test
|
||||
public void testShouldCorretDecodeEncode() throws IOException {
|
||||
public void testShouldCorrectDecodeEncode() throws IOException {
|
||||
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data.txt");
|
||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||
String s = bufferedReader.readLine();
|
||||
while (s != null) {
|
||||
String[] qa = s.trim().split(" ");
|
||||
if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) {
|
||||
Integer ecodedSuffix = decoderEncoder.encode(qa[0]);
|
||||
assertThat(decoderEncoder.decode(ecodedSuffix), equalTo(qa[1]));
|
||||
Integer encodedSuffix = decoderEncoder.encode(qa[0]);
|
||||
assertThat(decoderEncoder.decode(encodedSuffix), equalTo(qa[1]));
|
||||
}
|
||||
s = bufferedReader.readLine();
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testShouldCorretDecodeEncodeStringToArray() throws IOException {
|
||||
public void testShouldCorrectDecodeEncodeStringToArray() throws IOException {
|
||||
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data-for-array.txt");
|
||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||
String s = bufferedReader.readLine();
|
||||
@ -85,7 +86,7 @@ public class RussianLetterDecoderEncoderTest {
|
||||
}
|
||||
|
||||
@Test(expected = WrongCharaterException.class)
|
||||
public void shouldThrownExeptionIfSuffixContainWrongCharater() {
|
||||
public void shouldThrownExceptionIfSuffixContainWrongCharater() {
|
||||
decoderEncoder.encode("1");
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user