Adding support for the comparative degree of adjectives. Now it is treated as a separate word form.
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@101 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
parent
3de894404c
commit
ba5272acb8
DictionaryReader.java

@@ -21,7 +21,10 @@ import java.io.BufferedReader;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
 
 
 /**
@@ -32,14 +35,11 @@ public class DictionaryReader {
     private String fileName;
     private String fileEncoding = "windows-1251";
     private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
-    private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
     private Set<String> ignoredForm = new HashSet<String>();
-    private List<WordFilter> filters = new ArrayList<WordFilter>();
 
-    public DictionaryReader(String fileName, Set<String> ignoredForm, List<WordFilter> filters) {
+    public DictionaryReader(String fileName, Set<String> ignoredForm) {
         this.fileName = fileName;
         this.ignoredForm = ignoredForm;
-        this.filters = filters;
     }
 
 
@@ -63,11 +63,6 @@ public class DictionaryReader {
 
             WordCard card = buildForm(s);
 
-            for (WordFilter wf : filters) {
-                if (card == null) break;
-                card = wf.transform(card);
-            }
-
             if (card == null) {
                 continue;
             }
@@ -112,8 +107,7 @@ public class DictionaryReader {
         String s = reader.readLine();
         int count = Integer.valueOf(s);
         for (int i = 0; i < count; i++) {
-            s = reader.readLine();
-            wordPrefixes.add(Arrays.asList(s.toLowerCase().split(",")));
+            reader.readLine();
         }
     }
 
@@ -135,7 +129,7 @@ public class DictionaryReader {
             // we inored all forms thats
             if (fl.length == 3) {
                 //System.out.println(line);
-                // flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), cleanString(fl[2].toLowerCase())));
+                flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
             }
             if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
         }
RemoveFlexiaWithPrefixes.java (new file)

@@ -0,0 +1,44 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.morphology.dictionary;
+
+import java.util.Arrays;
+import java.util.LinkedList;
+import java.util.List;
+
+
+public class RemoveFlexiaWithPrefixes extends WordFilter {
+
+    public RemoveFlexiaWithPrefixes(WordProcessor wordProcessor) {
+        super(wordProcessor);
+    }
+
+    @Override
+    public List<WordCard> transform(WordCard wordCard) {
+
+        List<FlexiaModel> flexiaModelsToRemove = new LinkedList<FlexiaModel>();
+        for (FlexiaModel fm : wordCard.getWordsForms()) {
+            if (fm.getPrefix().length() > 0) {
+                flexiaModelsToRemove.add(fm);
+            }
+        }
+        for (FlexiaModel fm : flexiaModelsToRemove) {
+            wordCard.removeFlexia(fm);
+        }
+
+        return new LinkedList<WordCard>(Arrays.asList(wordCard));
+    }
+}
RussianAdvSplitterFilter.java (new file)

@@ -0,0 +1,61 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.morphology.dictionary;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.LinkedList;
+import java.util.List;
+
+
+public class RussianAdvSplitterFilter extends WordFilter {
+    private String code;
+
+    public RussianAdvSplitterFilter(WordProcessor wordProcessor) throws IOException {
+        super(wordProcessor);
+        code = new BufferedReader(new InputStreamReader(this.getClass().getResourceAsStream("/russian-adv-main-code.txt"), "windows-1251")).readLine();
+    }
+
+    @Override
+    public List<WordCard> transform(WordCard wordCard) {
+        LinkedList<WordCard> result = new LinkedList<WordCard>();
+        result.add(wordCard);
+
+        String baseWord = "";
+        String canonicalForm = "";
+        String canonicalSuffix = "";
+        List<FlexiaModel> flexiaModels = new LinkedList<FlexiaModel>();
+        for (FlexiaModel flexiaModel : wordCard.getWordsForms()) {
+            if (flexiaModel.getPrefix().length() > 0) {
+                flexiaModels.add(new FlexiaModel(flexiaModel.getCode(), flexiaModel.getSuffix(), ""));
+            }
+            if (flexiaModel.getPrefix().length() > 0 && flexiaModel.getCode().equals(code)) {
+                baseWord = flexiaModel.getPrefix() + wordCard.getBase();
+                canonicalForm = flexiaModel.getCode();
+                canonicalSuffix = flexiaModel.getSuffix();
+            }
+        }
+
+        if (baseWord.length() > 0) {
+            WordCard wc = new WordCard(canonicalForm, baseWord, canonicalSuffix);
+            wc.setWordsForms(flexiaModels);
+            result.add(wc);
+        }
+
+        return result;
+    }
+}
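RussianAdvSplitterFilter keeps every incoming card and, when a card carries prefixed word forms whose code matches the one read from russian-adv-main-code.txt, emits an additional card whose base absorbs the prefix. A rough, self-contained sketch of feeding one hand-built card through it follows; it is illustrative only: the word, suffixes and the ancode "2b" are invented, the FlexiaModel argument order (code, suffix, prefix) and the WordCard constructor are used the same way this commit's own code uses them, and the no-op downstream processor exists only to satisfy the constructor.

package org.apache.lucene.morphology.dictionary;

import java.io.IOException;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;

// Illustrative demo (not part of this commit): one hand-built card goes through the filter.
public class RussianAdvSplitterDemo {
    public static void main(String[] args) throws IOException {
        WordProcessor noop = new WordProcessor() {
            public void process(WordCard wordCard) throws IOException { /* ignore */ }
        };
        RussianAdvSplitterFilter splitter = new RussianAdvSplitterFilter(noop);

        // Base card for "умный": base "умн", canonical suffix "ый"; "1a"/"2b" are made-up codes.
        WordCard adjective = new WordCard("1a", "умн", "ый");
        adjective.setWordsForms(new LinkedList<FlexiaModel>(Arrays.asList(
                new FlexiaModel("1a", "ый", ""),    // plain form, no prefix
                new FlexiaModel("2b", "ее", "по")   // prefixed comparative: "поумнее"
        )));

        List<WordCard> out = splitter.transform(adjective);
        // "out" always contains the original card; if a prefixed flexia's code matches the
        // code read from russian-adv-main-code.txt, a second card is added whose base is
        // prefix + base ("поумн") and whose forms keep the suffix but drop the prefix.
        System.out.println(out.size() + " card(s) produced");
    }
}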
WordCleaner.java

@@ -17,23 +17,26 @@ package org.apache.lucene.morphology.dictionary;
 
 import org.apache.lucene.morphology.LetterDecoderEncoder;
 
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.LinkedList;
 import java.util.List;
 
 
-public class WordCleaner implements WordFilter {
+public class WordCleaner extends WordFilter {
 
     private LetterDecoderEncoder decoderEncoder;
 
-    public WordCleaner(LetterDecoderEncoder decoderEncoder) {
+    public WordCleaner(LetterDecoderEncoder decoderEncoder, WordProcessor wordProcessor) {
+        super(wordProcessor);
         this.decoderEncoder = decoderEncoder;
     }
 
-    public WordCard transform(WordCard wordCard) {
+    public List<WordCard> transform(WordCard wordCard) {
         String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
 
-        if (word.contains("-")) return null;
-        if (!decoderEncoder.checkString(word)) return null;
+        if (word.contains("-")) return Collections.emptyList();
+        if (!decoderEncoder.checkString(word)) return Collections.emptyList();
 
         List<FlexiaModel> flexiaModelsToRemove = new LinkedList<FlexiaModel>();
         for (FlexiaModel fm : wordCard.getWordsForms()) {
@@ -45,6 +48,6 @@ public class WordCleaner implements WordFilter {
             wordCard.removeFlexia(fm);
         }
 
-        return wordCard;
+        return new LinkedList<WordCard>(Arrays.asList(wordCard));
     }
 }
WordFilter.java

@@ -16,9 +16,22 @@
 
 package org.apache.lucene.morphology.dictionary;
 
+import java.io.IOException;
+import java.util.List;
 
-public interface WordFilter {
 
-    public WordCard transform(WordCard wordCard);
+abstract public class WordFilter implements WordProcessor {
+
+    private WordProcessor wordProcessor;
+
+    public WordFilter(WordProcessor wordProcessor) {
+        this.wordProcessor = wordProcessor;
+    }
+
+    abstract public List<WordCard> transform(WordCard wordCard);
+
+    public void process(WordCard wordCard) throws IOException {
+        for (WordCard wc : transform(wordCard)) {
+            wordProcessor.process(wc);
+        }
+    }
 }
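WordFilter is no longer a one-card-in, one-card-out interface: it is now an abstract decorator around a downstream WordProcessor, its transform may return zero, one, or several cards, and process forwards each result down the chain. As a rough illustration of the new API (not part of this commit; the class name ShortWordFilter and the minLength parameter are made up), a custom filter would look like this:

package org.apache.lucene.morphology.dictionary;

import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;

// Hypothetical filter: drops word cards whose base is shorter than minLength and
// passes every other card down the chain unchanged.
public class ShortWordFilter extends WordFilter {

    private int minLength;

    public ShortWordFilter(WordProcessor wordProcessor, int minLength) {
        super(wordProcessor);
        this.minLength = minLength;
    }

    @Override
    public List<WordCard> transform(WordCard wordCard) {
        if (wordCard.getBase().length() < minLength) {
            return Collections.emptyList();  // card is filtered out of the stream
        }
        return new LinkedList<WordCard>(Arrays.asList(wordCard));  // pass through
    }
}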
WordStringCleaner.java

@@ -17,18 +17,21 @@ package org.apache.lucene.morphology.dictionary;
 
 import org.apache.lucene.morphology.LetterDecoderEncoder;
 
+import java.util.Arrays;
+import java.util.LinkedList;
 import java.util.List;
 
 
-public class WordStringCleaner implements WordFilter {
+public class WordStringCleaner extends WordFilter {
 
     private LetterDecoderEncoder decoderEncoder;
 
-    public WordStringCleaner(LetterDecoderEncoder decoderEncoder) {
+    public WordStringCleaner(LetterDecoderEncoder decoderEncoder, WordProcessor wordProcessor) {
+        super(wordProcessor);
         this.decoderEncoder = decoderEncoder;
     }
 
-    public WordCard transform(WordCard wordCard) {
+    public List<WordCard> transform(WordCard wordCard) {
         wordCard.setBase(cleanString(wordCard.getBase()));
         wordCard.setCanonicalForm(cleanString(wordCard.getCanonicalForm()));
         wordCard.setCanonicalSuffix(cleanString(wordCard.getCanonicalSuffix()));
@@ -39,7 +42,7 @@ public class WordStringCleaner implements WordFilter {
             //made correct code
             m.setCode(m.getCode().substring(0, 2));
         }
-        return wordCard;
+        return new LinkedList<WordCard>(Arrays.asList(wordCard));
     }
 
 
EnglishHeuristicBuilder.java

@@ -20,9 +20,7 @@ import org.apache.lucene.morphology.EnglishLetterDecoderEncoder;
 import org.apache.lucene.morphology.dictionary.*;
 
 import java.io.IOException;
-import java.util.Arrays;
 import java.util.HashSet;
-import java.util.List;
 
 
 public class EnglishHeuristicBuilder {
@@ -30,12 +28,14 @@ public class EnglishHeuristicBuilder {
 
         GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/egramtab.tab");
         EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
-        List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
 
-        DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>(), filters);
+        DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>());
 
         StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder);
-        dictionaryReader.process(statisticsCollector);
+        WordCleaner wordCleaner = new WordCleaner(decoderEncoder, statisticsCollector);
+        WordStringCleaner wordStringCleaner = new WordStringCleaner(decoderEncoder, wordCleaner);
+        RemoveFlexiaWithPrefixes removeFlexiaWithPrefixes = new RemoveFlexiaWithPrefixes(wordStringCleaner);
+        dictionaryReader.process(removeFlexiaWithPrefixes);
         statisticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info");
 
     }
RussianHeuristicBuilder.java

@@ -20,21 +20,22 @@ import org.apache.lucene.morphology.dictionary.*;
 import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder;
 
 import java.io.IOException;
-import java.util.Arrays;
 import java.util.HashSet;
-import java.util.List;
 
 
 public class RussianHeuristicBuilder {
     public static void main(String[] args) throws IOException {
         GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/rgramtab.tab");
         RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
-        List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
 
-        DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>(), filters);
+        DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());
 
         StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder);
-        dictionaryReader.process(statisticsCollector);
+        WordCleaner wordCleaner = new WordCleaner(decoderEncoder, statisticsCollector);
+        WordStringCleaner wordStringCleaner = new WordStringCleaner(decoderEncoder, wordCleaner);
+        RemoveFlexiaWithPrefixes removeFlexiaWithPrefixes = new RemoveFlexiaWithPrefixes(wordStringCleaner);
+        RussianAdvSplitterFilter russianAdvSplitterFilter = new RussianAdvSplitterFilter(removeFlexiaWithPrefixes);
+        dictionaryReader.process(russianAdvSplitterFilter);
         statisticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info");
 
     }
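Since each filter now wraps the next WordProcessor instead of being collected in a list, the builder wires the chain from the final consumer outwards; a card therefore travels RussianAdvSplitterFilter -> RemoveFlexiaWithPrefixes -> WordStringCleaner -> WordCleaner -> StatisticsCollector. The sketch below is only an equivalent nested restatement of the wiring above, reusing the builder's own local variables:

// Equivalent nested form of the wiring in RussianHeuristicBuilder.main (illustrative restatement).
WordProcessor chain =
        new RussianAdvSplitterFilter(
                new RemoveFlexiaWithPrefixes(
                        new WordStringCleaner(decoderEncoder,
                                new WordCleaner(decoderEncoder, statisticsCollector))));
dictionaryReader.process(chain);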
russian-adv-main-code.txt (new file)

@@ -0,0 +1 @@
+鞨
TestAllWords.java

@@ -24,7 +24,6 @@ import org.junit.Before;
 import org.junit.Test;
 
 import java.io.IOException;
-import java.util.Arrays;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
@@ -71,15 +70,12 @@ public class TestAllWords {
         final List<String> morphInfo = grammarInfo.getGrammarInfo();
         final Map<String, Integer> inversIndex = grammarInfo.getGrammarInverseIndex();
 
-        List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
-
-        DictionaryReader dictionaryReader = new DictionaryReader(pathToDict, new HashSet<String>(), filters);
-
+        DictionaryReader dictionaryReader = new DictionaryReader(pathToDict, new HashSet<String>());
 
         final AtomicLong wordCount = new AtomicLong(0);
         Long startTime = System.currentTimeMillis();
 
-        dictionaryReader.process(new WordProcessor() {
+        WordProcessor wordProcessor = new WordProcessor() {
             public void process(WordCard wordCard) throws IOException {
                 String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
                 for (FlexiaModel fm : wordCard.getWordsForms()) {
@@ -90,7 +86,12 @@ public class TestAllWords {
                     wordCount.set(2L + wordCount.get());
                 }
             }
-        });
+        };
 
+        WordCleaner wordCleaner = new WordCleaner(decoderEncoder, wordProcessor);
+        WordStringCleaner wordStringCleaner = new WordStringCleaner(decoderEncoder, wordCleaner);
+        RemoveFlexiaWithPrefixes removeFlexiaWithPrefixes = new RemoveFlexiaWithPrefixes(wordStringCleaner);
+        dictionaryReader.process(removeFlexiaWithPrefixes);
+
         long time = System.currentTimeMillis() - startTime;
         System.out.println("Done " + wordCount.get() + " in " + time + " ms. " + wordCount.get() / (time / 1000L) + " word per second");
@@ -101,10 +102,9 @@ public class TestAllWords {
         final LuceneMorphology morphology = new EnglishLuceneMorphology();
 
         LetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
-        List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
         String pathToDic = prefix + "dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd";
 
-        testAllWordForLucene(morphology, filters, pathToDic);
+        testAllWordForLucene(morphology, decoderEncoder, pathToDic);
     }
 
     @Test
@@ -112,20 +112,19 @@ public class TestAllWords {
         final LuceneMorphology morphology = new RussianLuceneMorphology();
 
         LetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
-        List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
 
         String pathToDic = prefix + "dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd";
 
-        testAllWordForLucene(morphology, filters, pathToDic);
+        testAllWordForLucene(morphology, decoderEncoder, pathToDic);
 
     }
 
-    private void testAllWordForLucene(final LuceneMorphology morphology, List<WordFilter> filters, String pathToDic) throws IOException {
+    private void testAllWordForLucene(final LuceneMorphology morphology, LetterDecoderEncoder decoderEncoder, String pathToDic) throws IOException {
         final AtomicLong wordCount = new AtomicLong(0);
         Long startTime = System.currentTimeMillis();
 
-        DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet<String>(), filters);
-        dictionaryReader.process(new WordProcessor() {
+        DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet<String>());
+        WordProcessor wordProcessor = new WordProcessor() {
             public void process(WordCard wordCard) throws IOException {
                 String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
                 for (FlexiaModel fm : wordCard.getWordsForms()) {
@@ -134,7 +133,12 @@ public class TestAllWords {
                     wordCount.set(1L + wordCount.get());
                 }
             }
-        });
+        };
 
+        WordCleaner wordCleaner = new WordCleaner(decoderEncoder, wordProcessor);
+        WordStringCleaner wordStringCleaner = new WordStringCleaner(decoderEncoder, wordCleaner);
+        RemoveFlexiaWithPrefixes removeFlexiaWithPrefixes = new RemoveFlexiaWithPrefixes(wordStringCleaner);
+        dictionaryReader.process(removeFlexiaWithPrefixes);
+
        long time = System.currentTimeMillis() - startTime;
        System.out.println("Done " + wordCount.get() + " in " + time + " ms. " + wordCount.get() / (time / 1000L) + " word per second");
@@ -17,3 +17,4 @@
 лучший хороший
 на на
 тест тест тесто
+наибольшую наибольший