Add a test covering all dictionary words; refactor tests and dictionary reading

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@99 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
Alexander.A.Kuznetsov 2010-10-08 11:35:13 +00:00
parent e8399999c3
commit 76e68a11e0
26 changed files with 730 additions and 48 deletions

View File

@ -1,5 +1,6 @@
<?xml version="1.0"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<parent>
<artifactId>morphology</artifactId>
<groupId>org.apache.lucene.morphology</groupId>
@ -26,4 +27,6 @@
<version>0.9-SNAPSHOT</version>
</dependency>
</dependencies>
</project>

View File

@ -34,16 +34,19 @@ public class DictionaryReader {
private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
private Set<String> ignoredForm = new HashSet<String>();
private List<WordFilter> filters = new ArrayList<WordFilter>();
public DictionaryReader(String fileName, Set<String> ignoredForm) {
public DictionaryReader(String fileName, Set<String> ignoredForm, List<WordFilter> filters) {
this.fileName = fileName;
this.ignoredForm = ignoredForm;
this.filters = filters;
}
public DictionaryReader(String fileName, String fileEncoding, Set<String> ignoredForm) {
public DictionaryReader(String fileName, String fileEncoding, Set<String> ignoredForm, List<WordFilter> filters) {
this.fileName = fileName;
this.fileEncoding = fileEncoding;
this.ignoredForm = ignoredForm;
this.filters = filters;
}
@ -60,30 +63,46 @@ public class DictionaryReader {
// Reads the word section of the dictionary file: the first line holds the
// number of word records, and each following line describes one word base.
// Every card is passed through the configured filter chain before processing.
private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
String s = reader.readLine();
int count = Integer.valueOf(s);
// Number of cards that survived the filter chain and were actually processed.
int actual = 0;
for (int i = 0; i < count; i++) {
s = reader.readLine();
// Progress report every 10000 records.
if (i % 10000 == 0) System.out.println("Proccess " + i + " wordBase of " + count);
WordCard card = buildForm(s);
// Each filter may transform the card or reject it by returning null.
for (WordFilter wf : filters) {
if (card == null) break;
card = wf.transform(card);
}
if (card == null) {
continue;
}
wordProccessor.process(card);
actual++;
}
System.out.println("Finished word processing actual words " + actual);
}
private WordCard buildForm(String s) {
String[] wd = s.split(" ");
String wordBase = wd[0].toLowerCase();
if (wordBase.startsWith("-")) continue;
if (wordBase.startsWith("-")) return null;
wordBase = "#".equals(wordBase) ? "" : wordBase;
List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1]));
FlexiaModel flexiaModel = models.get(0);
if (models.size() > 0 && !ignoredForm.contains(flexiaModel.getCode())) {
if (models.size() == 0 || ignoredForm.contains(flexiaModel.getCode())) {
return null;
}
WordCard card = new WordCard(flexiaModel.create(wordBase), wordBase, flexiaModel.getSuffix());
for (FlexiaModel fm : models) {
card.addFlexia(fm);
}
// if(card.getBase().equals("face") || card.getBase().equals("fac")){
// System.out.println(models);
// System.out.println(card);
wordProccessor.process(card);
//}
}
}
return card;
}
@ -122,7 +141,7 @@ public class DictionaryReader {
String[] fl = line.split("\\*");
// we ignore all forms that split into three parts
if (fl.length == 3) {
System.out.println(line);
//System.out.println(line);
// flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), cleanString(fl[2].toLowerCase())));
}
if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));

View File

@ -66,4 +66,26 @@ public class FlexiaModel {
", prefix='" + prefix + '\'' +
'}';
}
@Override
public boolean equals(Object o) {
    // Identity and exact-class checks first; two models are equal when all
    // three string parts (code, prefix, suffix) match, treating null as a value.
    if (this == o) return true;
    if (o == null || getClass() != o.getClass()) return false;
    FlexiaModel other = (FlexiaModel) o;
    boolean sameCode = (code == null) ? other.code == null : code.equals(other.code);
    boolean samePrefix = (prefix == null) ? other.prefix == null : prefix.equals(other.prefix);
    boolean sameSuffix = (suffix == null) ? other.suffix == null : suffix.equals(other.suffix);
    return sameCode && samePrefix && sameSuffix;
}
@Override
public int hashCode() {
    // Mixes the same fields equals() compares, in the order code, suffix, prefix.
    int hash = (code == null) ? 0 : code.hashCode();
    hash = 31 * hash + ((suffix == null) ? 0 : suffix.hashCode());
    hash = 31 * hash + ((prefix == null) ? 0 : prefix.hashCode());
    return hash;
}
}

View File

@ -42,12 +42,8 @@ public class StatisticsCollector implements WordProccessor {
public void process(WordCard wordCard) throws IOException {
cleanWordCard(wordCard);
String normalStringMorph = wordCard.getWordsForms().get(0).getCode();
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
if (word.contains("-")) return;
if (!decoderEncoder.checkString(word)) return;
for (FlexiaModel fm : wordCard.getWordsForms()) {
if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) continue;
Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph);
String form = revertWord(fm.create(wordCard.getBase()));
Set<Heuristic> suffixHeuristics = inverseIndex.get(form);
@ -138,8 +134,8 @@ public class StatisticsCollector implements WordProccessor {
Integer length = getCommonLength(form, normalForm);
Integer actualSuffixLengh = form.length() - length;
String actualNormalSuffix = normalForm.substring(length);
Integer integer = grammaReader.getGrammInversIndex().get(fm.getCode().substring(0, 2));
Integer nf = grammaReader.getGrammInversIndex().get(normalSuffixForm.substring(0, 2));
Integer integer = grammaReader.getGrammInversIndex().get(fm.getCode());
Integer nf = grammaReader.getGrammInversIndex().get(normalSuffixForm);
return new Heuristic((byte) actualSuffixLengh.intValue(), actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue());
}

View File

@ -38,6 +38,10 @@ public class WordCard {
wordsForms.add(flexiaModel);
}
/** Removes the given flexia form from this card's list of word forms. */
public void removeFlexia(FlexiaModel flexiaModel) {
wordsForms.remove(flexiaModel);
}
public String getCanonicalForm() {
return canonicalForm;
}

View File

@ -0,0 +1,50 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.dictionary;
import org.apache.lucene.morphology.LetterDecoderEncoder;
import java.util.LinkedList;
import java.util.List;
/**
 * {@link WordFilter} that drops words the target encoder cannot represent.
 * The whole card is rejected (null returned) when the canonical form contains
 * a dash or fails the encoder's character check; otherwise only the offending
 * individual forms are removed from the card.
 */
public class WordCleaner implements WordFilter {

    private LetterDecoderEncoder decoderEncoder;

    public WordCleaner(LetterDecoderEncoder decoderEncoder) {
        this.decoderEncoder = decoderEncoder;
    }

    /**
     * @param wordCard card to clean; never null
     * @return the card with unencodable forms removed, or null when the
     *         canonical word itself cannot be encoded
     */
    public WordCard transform(WordCard wordCard) {
        String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
        if (word.contains("-")) return null;
        if (!decoderEncoder.checkString(word)) return null;

        // Collect first, remove afterwards: removing while iterating over
        // getWordsForms() would throw ConcurrentModificationException.
        List<FlexiaModel> flexiaModelsToRemove = new LinkedList<FlexiaModel>();
        for (FlexiaModel fm : wordCard.getWordsForms()) {
            // Build the full form once instead of twice per iteration.
            String form = fm.create(wordCard.getBase());
            if (!decoderEncoder.checkString(form) || form.contains("-")) {
                flexiaModelsToRemove.add(fm);
            }
        }
        for (FlexiaModel fm : flexiaModelsToRemove) {
            wordCard.removeFlexia(fm);
        }
        return wordCard;
    }
}

View File

@ -0,0 +1,24 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.dictionary;
/**
 * A transformation step applied to every {@link WordCard} read from the
 * dictionary. Filters are chained; each may modify the card or reject it.
 */
public interface WordFilter {
    /**
     * Transforms the given word card.
     *
     * @param wordCard card to transform; never null
     * @return the (possibly modified) card, or null when the word should be
     *         dropped from further processing
     */
    WordCard transform(WordCard wordCard);
}

View File

@ -0,0 +1,49 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.dictionary;
import org.apache.lucene.morphology.LetterDecoderEncoder;
import java.util.List;
/**
 * {@link WordFilter} that normalises every string stored in a
 * {@link WordCard} through the supplied {@link LetterDecoderEncoder} and
 * trims each form's morphology code down to its two-character prefix.
 */
public class WordStringCleaner implements WordFilter {

    private LetterDecoderEncoder decoderEncoder;

    public WordStringCleaner(LetterDecoderEncoder decoderEncoder) {
        this.decoderEncoder = decoderEncoder;
    }

    public WordCard transform(WordCard wordCard) {
        // Normalise the card-level strings first.
        wordCard.setBase(cleanString(wordCard.getBase()));
        wordCard.setCanonicalForm(cleanString(wordCard.getCanonicalForm()));
        wordCard.setCanonicalSuffix(cleanString(wordCard.getCanonicalSuffix()));
        // Then every individual form: suffix, prefix and the shortened code.
        for (FlexiaModel form : wordCard.getWordsForms()) {
            form.setSuffix(cleanString(form.getSuffix()));
            form.setPrefix(cleanString(form.getPrefix()));
            // made correct code — keep only the first two characters
            // (assumes every code is at least two characters long; unchanged behavior).
            form.setCode(form.getCode().substring(0, 2));
        }
        return wordCard;
    }

    private String cleanString(String s) {
        return decoderEncoder.cleanString(s);
    }
}

View File

@ -16,22 +16,24 @@
package org.apache.lucene.morphology.generator;
import org.apache.lucene.morphology.dictionary.DictionaryReader;
import org.apache.lucene.morphology.dictionary.GrammaReader;
import org.apache.lucene.morphology.dictionary.StatisticsCollector;
import org.apache.lucene.morphology.english.EnglishLetterDecoderEncoder;
import org.apache.lucene.morphology.EnglishLetterDecoderEncoder;
import org.apache.lucene.morphology.dictionary.*;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
public class EnglishHeuristicBuilder {
public static void main(String[] args) throws IOException {
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/egramtab.tab");
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>());
EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>(), filters);
StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
dictionaryReader.proccess(statisticsCollector);
statisticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info");

View File

@ -16,21 +16,23 @@
package org.apache.lucene.morphology.generator;
import org.apache.lucene.morphology.dictionary.DictionaryReader;
import org.apache.lucene.morphology.dictionary.GrammaReader;
import org.apache.lucene.morphology.dictionary.StatisticsCollector;
import org.apache.lucene.morphology.dictionary.*;
import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
public class RussianHeuristicBuilder {
public static void main(String[] args) throws IOException {
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());
RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>(), filters);
StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
dictionaryReader.proccess(statisticsCollector);
statisticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info");

View File

@ -0,0 +1,144 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene;
import org.apache.lucene.morphology.*;
import org.apache.lucene.morphology.dictionary.*;
import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder;
import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
import org.apache.lucene.morphology.russian.RussianMorphology;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import static org.hamcrest.Matchers.hasItem;
import static org.junit.Assert.assertThat;
public class TestAllWords {
String prefix = "";
@Before
public void setUp() {
System.out.println(System.getProperty("user.dir"));
prefix = System.getProperty("user.dir").endsWith("dictionary-reader") ? "../" : "";
}
@Test
public void shouldEnglishMorphologyIncludeAllWordsFormsWithMorphInfo() throws IOException {
final Morphology morphology = new EnglishMorphology();
LetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
String pathToGramma = prefix + "dictonary/Dicts/Morph/egramtab.tab";
String pathToDict = prefix + "dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd";
testFullGramma(morphology, decoderEncoder, pathToGramma, pathToDict);
}
@Test
public void shouldRussianMorphologyIncludeAllWordsFormsWithMorphInfo() throws IOException {
final Morphology morphology = new RussianMorphology();
LetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
String pathToGramma = prefix + "dictonary/Dicts/Morph/rgramtab.tab";
String pathToDict = prefix + "dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd";
testFullGramma(morphology, decoderEncoder, pathToGramma, pathToDict);
}
private void testFullGramma(final Morphology morphology, LetterDecoderEncoder decoderEncoder, String pathToGramma, String pathToDict) throws IOException {
GrammaReader grammaInfo = new GrammaReader(pathToGramma);
final List<String> morphInfo = grammaInfo.getGrammaInfo();
final Map<String, Integer> inversIndex = grammaInfo.getGrammInversIndex();
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
DictionaryReader dictionaryReader = new DictionaryReader(pathToDict, new HashSet<String>(), filters);
final AtomicLong wordCount = new AtomicLong(0);
Long startTime = System.currentTimeMillis();
dictionaryReader.proccess(new WordProccessor() {
public void process(WordCard wordCard) throws IOException {
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
for (FlexiaModel fm : wordCard.getWordsForms()) {
String wordForm = wordCard.getBase() + fm.getSuffix();
String morph = morphInfo.get(inversIndex.get(fm.getCode()));
assertThat(morphology.getMorphInfo(wordForm), hasItem(word + "|" + morph));
assertThat(morphology.getNormalForms(wordForm), hasItem(word));
wordCount.set(2L + wordCount.get());
}
}
});
long time = System.currentTimeMillis() - startTime;
System.out.println("Done " + wordCount.get() + " in " + time + " ms. " + wordCount.get() / (time / 1000L) + " word per second");
}
@Test
public void shouldEnglishLuceneMorphologyIncludeAllWords() throws IOException {
final LuceneMorphology morphology = new EnglishLuceneMorphology();
LetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
String pathToDic = prefix + "dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd";
testAllWordForLucene(morphology, filters, pathToDic);
}
@Test
public void shouldIncludeAllWordsRussianInLuceneMorophology() throws IOException {
final LuceneMorphology morphology = new RussianLuceneMorphology();
LetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
String pathToDic = prefix + "dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd";
testAllWordForLucene(morphology, filters, pathToDic);
}
private void testAllWordForLucene(final LuceneMorphology morphology, List<WordFilter> filters, String pathToDic) throws IOException {
final AtomicLong wordCount = new AtomicLong(0);
Long startTime = System.currentTimeMillis();
DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet<String>(), filters);
dictionaryReader.proccess(new WordProccessor() {
public void process(WordCard wordCard) throws IOException {
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
for (FlexiaModel fm : wordCard.getWordsForms()) {
String wordForm = wordCard.getBase() + fm.getSuffix();
assertThat(morphology.getNormalForms(wordForm), hasItem(word));
wordCount.set(1L + wordCount.get());
}
}
});
long time = System.currentTimeMillis() - startTime;
System.out.println("Done " + wordCount.get() + " in " + time + " ms. " + wordCount.get() / (time / 1000L) + " word per second");
}
}

View File

@ -0,0 +1,77 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.morphology.russian.RussianAnalyzer;
import org.junit.Test;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.HashSet;
import static org.hamcrest.Matchers.equalTo;
import static org.junit.Assert.assertThat;
/**
 * Verifies that the morphology analyzers produce exactly the expected token
 * sets for the sample texts stored in the test resources.
 */
public class AnalayzersTest {

    @Test
    public void englishAnalyzerShouldGiveCorrectWords() throws IOException {
        Analyzer morphlogyAnalyzer = new EnglishAnalyzer();
        String answerPath = "/english/englsih-analayzer-answer.txt";
        String testPath = "/english/englsih-analayzer-data.txt";
        testAnalayzer(morphlogyAnalyzer, answerPath, testPath);
    }

    @Test
    public void shoudGiveCorretWords() throws IOException {
        Analyzer morphlogyAnalyzer = new RussianAnalyzer();
        String answerPath = "/russian/russian-analayzer-answer.txt";
        String testPath = "/russian/russian-analayzer-data.txt";
        testAnalayzer(morphlogyAnalyzer, answerPath, testPath);
    }

    /**
     * Tokenizes the data file with the given analyzer and compares the
     * resulting token set with the expected set from the answer file.
     */
    private void testAnalayzer(Analyzer morphlogyAnalyzer, String answerPath, String testPath) throws IOException {
        HashSet<String> answer = readExpectedTokens(answerPath);

        InputStream stream = this.getClass().getResourceAsStream(testPath);
        HashSet<String> result = new HashSet<String>();
        try {
            InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
            TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader);
            while (tokenStream.incrementToken()) {
                TermAttribute attribute1 = tokenStream.getAttribute(TermAttribute.class);
                result.add(attribute1.term());
            }
        } finally {
            // The original leaked this stream when an exception was thrown.
            stream.close();
        }
        assertThat(result, equalTo(answer));
    }

    /** Reads the answer file: one line of space-separated expected tokens. */
    private HashSet<String> readExpectedTokens(String answerPath) throws IOException {
        InputStream stream = this.getClass().getResourceAsStream(answerPath);
        try {
            BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
            String[] strings = breader.readLine().replaceAll(" +", " ").trim().split(" ");
            return new HashSet<String>(Arrays.asList(strings));
        } finally {
            stream.close();
        }
    }
}

View File

@ -0,0 +1,62 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology;
import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
import org.junit.Test;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import static org.hamcrest.CoreMatchers.equalTo;
import static org.junit.Assert.assertThat;
/**
 * Verifies normal-form lookups of LuceneMorphology implementations against
 * the expected answers stored in test resource files.
 */
public class LuceneMorphTest {

    @Test
    public void englishMorphologyShouldGetCorrectNormalForm() throws IOException {
        LuceneMorphology luceneMorph = new EnglishLuceneMorphology();
        String pathToTestData = "/english/english-morphology-test.txt";
        testMorphology(luceneMorph, pathToTestData);
    }

    @Test
    public void russianMorphologyShouldGetCorrectNormalForm() throws IOException {
        LuceneMorphology luceneMorph = new RussianLuceneMorphology();
        String pathToTestData = "/russian/russian-morphology-test.txt";
        testMorphology(luceneMorph, pathToTestData);
    }

    /**
     * Each line of the test file is "word form1 form2 ...": the first token is
     * the query, the remaining tokens are the complete expected set of normal forms.
     */
    private void testMorphology(LuceneMorphology luceneMorph, String pathToTestData) throws IOException {
        InputStream stream = this.getClass().getResourceAsStream(pathToTestData);
        try {
            BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
            String s = bufferedReader.readLine();
            while (s != null) {
                String[] qa = s.trim().split(" ");
                Set<String> result = new HashSet<String>();
                result.addAll(Arrays.asList(qa).subList(1, qa.length));
                Set<String> stringList = new HashSet<String>(luceneMorph.getNormalForms(qa[0]));
                assertThat(stringList, equalTo(result));
                s = bufferedReader.readLine();
            }
        } finally {
            // The original never closed this stream.
            stream.close();
        }
    }
}

View File

@ -0,0 +1,8 @@
purchases purchase
existing exist
was be
men man
bore bore bear
grown grow grown
came come
md md

View File

@ -0,0 +1 @@
following follow the instruction exactly will be help ensure the best well good result

View File

@ -0,0 +1 @@
Following the instructions exactly will help ensure the best results

View File

@ -0,0 +1 @@
в результат крушение погибнуть командир отряд специальный назначение пря при переть гувд ростовский область полковник милиция михаил перов и предприниматель

View File

@ -0,0 +1 @@
В результате крушения погибли командир отряда специального назначения при ГУВД Ростовской области полковник милиции Михаил Перов и предприниматель

View File

@ -0,0 +1,19 @@
еду еда ехать
тестов тест
вина вино вина
вино вино
ехать ехать
ананасов ананас ананасовый
сухой сухой
дураков дурак
пушка пушка пушок
пушок пушок
пушек пушка
козлов козлов козловый козел
жуков жуков жук
красив красить красивый
красивая красивый
тосклив тоскливый
лучший хороший
на на
тест тест тесто

View File

@ -1,5 +1,6 @@
<?xml version="1.0"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<parent>
<artifactId>morphology</artifactId>
<groupId>org.apache.lucene.morphology</groupId>
@ -12,6 +13,7 @@
<version>0.9-SNAPSHOT</version>
<url>http://maven.apache.org</url>
<dependencies>
<dependency>
<groupId>org.apache.lucene.morphology</groupId>
<artifactId>morph</artifactId>

View File

@ -0,0 +1,29 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology;
import org.apache.lucene.morphology.analyzer.MorphologyAnalyzer;
import java.io.IOException;
/**
 * Convenience {@link MorphologyAnalyzer} preconfigured with the bundled
 * English morphology data.
 */
public class EnglishAnalyzer extends MorphologyAnalyzer {
/** @throws IOException if the bundled morphology resource cannot be read */
public EnglishAnalyzer() throws IOException {
super(new EnglishLuceneMorphology());
}
}

View File

@ -0,0 +1,111 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology;
import java.util.ArrayList;
/**
 * Packs lowercase Latin words (plus dash) into ints, base 28:
 * codes 1..26 are 'a'..'z', 27 is '-', 0 is the padding terminator.
 * Up to {@code SUFFIX_LENGTH} characters fit into one int; longer strings are
 * split into an int array by {@link #encodeToArray(String)}.
 */
public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder {
    /** 'a' - 1: maps small Latin letters onto codes 1..26. */
    public static final int ENGLISH_SMALL_LETTER_OFFSET = 96;
    /** Maximum number of characters packed into a single int. */
    static public int SUFFIX_LENGTH = 6;
    /** Character code of '-'. */
    public static final int DASH_CHAR = 45;
    /** Packed digit representing '-'. */
    public static final int DASH_CODE = 27;
    /** Radix of the packed representation: 26 letters + dash + terminator. */
    private static final int RADIX = 28;

    /**
     * Encodes up to SUFFIX_LENGTH characters into one int.
     *
     * @throws SuffixToLongException  if the string is longer than SUFFIX_LENGTH
     * @throws WrongCharaterException if a character is not a-z or '-'
     */
    public Integer encode(String string) {
        if (string.length() > SUFFIX_LENGTH)
            // Original message claimed a limit of 12; the check is SUFFIX_LENGTH (6).
            throw new SuffixToLongException("Suffix length should not be greater then " + SUFFIX_LENGTH);
        int result = 0;
        for (int i = 0; i < string.length(); i++) {
            int c = string.charAt(i) - ENGLISH_SMALL_LETTER_OFFSET;
            if (c == DASH_CHAR - ENGLISH_SMALL_LETTER_OFFSET) {
                c = DASH_CODE;
            }
            // c == 0 ('`') would collide with the padding terminator, so it is
            // rejected too (checkCharacter() already treats it as invalid).
            if (c <= 0 || c > DASH_CODE)
                throw new WrongCharaterException("Symbol " + string.charAt(i) + " is not a small Latin letter");
            result = result * RADIX + c;
        }
        // Pad short strings with zero digits so decode() can stop at them.
        for (int i = string.length(); i < SUFFIX_LENGTH; i++) {
            result *= RADIX;
        }
        return result;
    }

    /** Splits a string of any length into SUFFIX_LENGTH-sized encoded chunks. */
    public int[] encodeToArray(String s) {
        ArrayList<Integer> integers = new ArrayList<Integer>();
        while (s.length() > SUFFIX_LENGTH) {
            integers.add(encode(s.substring(0, SUFFIX_LENGTH)));
            s = s.substring(SUFFIX_LENGTH);
        }
        integers.add(encode(s));
        int[] ints = new int[integers.size()];
        int pos = 0;
        for (Integer i : integers) {
            ints[pos] = i;
            pos++;
        }
        return ints;
    }

    /** Inverse of {@link #encodeToArray(String)}. */
    public String decodeArray(int[] array) {
        String result = "";
        for (int i : array) {
            result += decode(i);
        }
        return result;
    }

    /** Inverse of {@link #encode(String)}: unpacks base-28 digits back to chars. */
    public String decode(Integer suffixN) {
        String result = "";
        while (suffixN > DASH_CODE) {
            int c = suffixN % RADIX + ENGLISH_SMALL_LETTER_OFFSET;
            if (c == ENGLISH_SMALL_LETTER_OFFSET) {
                // Zero digit is padding produced by encode(); skip it.
                suffixN /= RADIX;
                continue;
            }
            if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR;
            result = (char) c + result;
            suffixN /= RADIX;
        }
        long c = suffixN + ENGLISH_SMALL_LETTER_OFFSET;
        if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR;
        result = (char) c + result;
        return result;
    }

    /** @return true for 'a'..'z' and '-'. */
    public boolean checkCharacter(char c) {
        if (c == DASH_CHAR) return true;
        int code = c - ENGLISH_SMALL_LETTER_OFFSET;
        return code > 0 && code < 27;
    }

    /** @return true when every character of the word is encodable. */
    public boolean checkString(String word) {
        for (int i = 0; i < word.length(); i++) {
            if (!checkCharacter(word.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /** English words need no normalisation; returns the input unchanged. */
    public String cleanString(String s) {
        return s;
    }
}

View File

@ -0,0 +1,26 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology;
import java.io.IOException;
/**
 * {@link LuceneMorphology} preloaded with the bundled English morph.info data.
 */
public class EnglishLuceneMorphology extends LuceneMorphology {
/** @throws IOException if the bundled morphology resource cannot be read */
public EnglishLuceneMorphology() throws IOException {
super(EnglishLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder());
}
}

View File

@ -0,0 +1,26 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology;
import java.io.IOException;
/**
 * {@link MorphologyImpl} preloaded with the bundled English morph.info data.
 */
public class EnglishMorphology extends MorphologyImpl {
    /** @throws IOException if the bundled morphology resource cannot be read */
    public EnglishMorphology() throws IOException {
        // Use this class (not EnglishLuceneMorphology) to locate the resource,
        // keeping the lookup consistent with the class doing the loading.
        super(EnglishMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder());
    }
}

View File

@ -1,5 +1,6 @@
<?xml version="1.0"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<parent>
<artifactId>morphology</artifactId>
<groupId>org.apache.lucene.morphology</groupId>
@ -13,6 +14,7 @@
<url>http://maven.apache.org</url>
<dependencies>
<dependency>
<groupId>org.apache.lucene.morphology</groupId>
<artifactId>morph</artifactId>

View File

@ -17,8 +17,6 @@ package org.apache.lucene.morphology.russian;
import org.apache.lucene.morphology.SuffixToLongException;
import org.apache.lucene.morphology.WrongCharaterException;
import static org.hamcrest.core.IsEqual.equalTo;
import static org.junit.Assert.assertThat;
import org.junit.Before;
import org.junit.Test;
@ -27,6 +25,9 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import static org.hamcrest.core.IsEqual.equalTo;
import static org.junit.Assert.assertThat;
public class RussianLetterDecoderEncoderTest {
private RussianLetterDecoderEncoder decoderEncoder;
@ -37,7 +38,7 @@ public class RussianLetterDecoderEncoderTest {
@Test
public void testShouldPreserStringComporision() throws IOException {
public void testShouldPreserverStringComporision() throws IOException {
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-monotonic.txt");
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
String s = bufferedReader.readLine();
@ -52,22 +53,22 @@ public class RussianLetterDecoderEncoderTest {
@Test
public void testShouldCorretDecodeEncode() throws IOException {
public void testShouldCorrectDecodeEncode() throws IOException {
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data.txt");
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
String s = bufferedReader.readLine();
while (s != null) {
String[] qa = s.trim().split(" ");
if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) {
Integer ecodedSuffix = decoderEncoder.encode(qa[0]);
assertThat(decoderEncoder.decode(ecodedSuffix), equalTo(qa[1]));
Integer encodedSuffix = decoderEncoder.encode(qa[0]);
assertThat(decoderEncoder.decode(encodedSuffix), equalTo(qa[1]));
}
s = bufferedReader.readLine();
}
}
@Test
public void testShouldCorretDecodeEncodeStringToArray() throws IOException {
public void testShouldCorrectDecodeEncodeStringToArray() throws IOException {
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data-for-array.txt");
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
String s = bufferedReader.readLine();
@ -85,7 +86,7 @@ public class RussianLetterDecoderEncoderTest {
}
@Test(expected = WrongCharaterException.class)
public void shouldThrownExeptionIfSuffixContainWrongCharater() {
public void shouldThrownExceptionIfSuffixContainWrongCharater() {
decoderEncoder.encode("1");
}
}