fixing some spelling errors
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@100 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
parent
76e68a11e0
commit
3de894404c
@ -42,25 +42,18 @@ public class DictionaryReader {
|
|||||||
this.filters = filters;
|
this.filters = filters;
|
||||||
}
|
}
|
||||||
|
|
||||||
public DictionaryReader(String fileName, String fileEncoding, Set<String> ignoredForm, List<WordFilter> filters) {
|
|
||||||
this.fileName = fileName;
|
|
||||||
this.fileEncoding = fileEncoding;
|
|
||||||
this.ignoredForm = ignoredForm;
|
|
||||||
this.filters = filters;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
public void process(WordProcessor wordProcessor) throws IOException {
|
||||||
public void proccess(WordProccessor wordProccessor) throws IOException {
|
|
||||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), fileEncoding));
|
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), fileEncoding));
|
||||||
readFlexias(bufferedReader);
|
readFlexias(bufferedReader);
|
||||||
sckipBlock(bufferedReader);
|
skipBlock(bufferedReader);
|
||||||
sckipBlock(bufferedReader);
|
skipBlock(bufferedReader);
|
||||||
readPrefix(bufferedReader);
|
readPrefix(bufferedReader);
|
||||||
readWords(bufferedReader, wordProccessor);
|
readWords(bufferedReader, wordProcessor);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
|
private void readWords(BufferedReader reader, WordProcessor wordProcessor) throws IOException {
|
||||||
String s = reader.readLine();
|
String s = reader.readLine();
|
||||||
int count = Integer.valueOf(s);
|
int count = Integer.valueOf(s);
|
||||||
int actual = 0;
|
int actual = 0;
|
||||||
@ -79,7 +72,7 @@ public class DictionaryReader {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
wordProccessor.process(card);
|
wordProcessor.process(card);
|
||||||
actual++;
|
actual++;
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -106,11 +99,11 @@ public class DictionaryReader {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void sckipBlock(BufferedReader reader) throws IOException {
|
private void skipBlock(BufferedReader reader) throws IOException {
|
||||||
String s = reader.readLine();
|
String s = reader.readLine();
|
||||||
int count = Integer.valueOf(s);
|
int count = Integer.valueOf(s);
|
||||||
for (int i = 0; i < count; i++) {
|
for (int i = 0; i < count; i++) {
|
||||||
s = reader.readLine();
|
reader.readLine();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -25,19 +25,19 @@ import java.util.HashMap;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
//todo spleet this class on two.
|
|
||||||
public class GrammaReader {
|
public class GrammarReader {
|
||||||
private String fileName;
|
private String fileName;
|
||||||
private String fileEncoding = "windows-1251";
|
private String fileEncoding = "windows-1251";
|
||||||
private List<String> grammaInfo = new ArrayList<String>();
|
private List<String> grammarInfo = new ArrayList<String>();
|
||||||
private Map<String, Integer> inverseIndex = new HashMap<String, Integer>();
|
private Map<String, Integer> inverseIndex = new HashMap<String, Integer>();
|
||||||
|
|
||||||
public GrammaReader(String fileName) throws IOException {
|
public GrammarReader(String fileName) throws IOException {
|
||||||
this.fileName = fileName;
|
this.fileName = fileName;
|
||||||
setUp();
|
setUp();
|
||||||
}
|
}
|
||||||
|
|
||||||
public GrammaReader(String fileName, String fileEncoding) throws IOException {
|
public GrammarReader(String fileName, String fileEncoding) throws IOException {
|
||||||
this.fileName = fileName;
|
this.fileName = fileName;
|
||||||
this.fileEncoding = fileEncoding;
|
this.fileEncoding = fileEncoding;
|
||||||
setUp();
|
setUp();
|
||||||
@ -50,27 +50,23 @@ public class GrammaReader {
|
|||||||
line = line.trim();
|
line = line.trim();
|
||||||
if (!line.startsWith("//") && line.length() > 0) {
|
if (!line.startsWith("//") && line.length() > 0) {
|
||||||
String[] strings = line.split(" ", 2);
|
String[] strings = line.split(" ", 2);
|
||||||
Integer i = grammaInfo.size();
|
Integer i = grammarInfo.size();
|
||||||
inverseIndex.put(strings[0], i);
|
inverseIndex.put(strings[0], i);
|
||||||
grammaInfo.add(i, strings[1]);
|
grammarInfo.add(i, strings[1]);
|
||||||
}
|
}
|
||||||
line = bufferedReader.readLine();
|
line = bufferedReader.readLine();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<String> getGrammaInfo() {
|
public List<String> getGrammarInfo() {
|
||||||
return grammaInfo;
|
return grammarInfo;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String[] getGrammaInfoAsArray() {
|
public String[] getGrammarInfoAsArray() {
|
||||||
return grammaInfo.toArray(new String[grammaInfo.size()]);
|
return grammarInfo.toArray(new String[grammarInfo.size()]);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Map<String, Integer> getGrammInversIndex() {
|
public Map<String, Integer> getGrammarInverseIndex() {
|
||||||
return inverseIndex;
|
return inverseIndex;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setInverseIndex(Map<String, Integer> inverseIndex) {
|
|
||||||
this.inverseIndex = inverseIndex;
|
|
||||||
}
|
|
||||||
}
|
}
|
@ -26,16 +26,16 @@ import java.util.*;
|
|||||||
|
|
||||||
|
|
||||||
//todo refactor this class
|
//todo refactor this class
|
||||||
public class StatisticsCollector implements WordProccessor {
|
public class StatisticsCollector implements WordProcessor {
|
||||||
private TreeMap<String, Set<Heuristic>> inverseIndex = new TreeMap<String, Set<Heuristic>>();
|
private TreeMap<String, Set<Heuristic>> inverseIndex = new TreeMap<String, Set<Heuristic>>();
|
||||||
private Map<Set<Heuristic>, Integer> ruleInverseIndex = new HashMap<Set<Heuristic>, Integer>();
|
private Map<Set<Heuristic>, Integer> ruleInverseIndex = new HashMap<Set<Heuristic>, Integer>();
|
||||||
private List<Set<Heuristic>> rules = new ArrayList<Set<Heuristic>>();
|
private List<Set<Heuristic>> rules = new ArrayList<Set<Heuristic>>();
|
||||||
private GrammaReader grammaReader;
|
private GrammarReader grammarReader;
|
||||||
private LetterDecoderEncoder decoderEncoder;
|
private LetterDecoderEncoder decoderEncoder;
|
||||||
|
|
||||||
|
|
||||||
public StatisticsCollector(GrammaReader grammaReader, LetterDecoderEncoder decoderEncoder) {
|
public StatisticsCollector(GrammarReader grammarReader, LetterDecoderEncoder decoderEncoder) {
|
||||||
this.grammaReader = grammaReader;
|
this.grammarReader = grammarReader;
|
||||||
this.decoderEncoder = decoderEncoder;
|
this.decoderEncoder = decoderEncoder;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -115,7 +115,7 @@ public class StatisticsCollector implements WordProccessor {
|
|||||||
prevSet = currentSet;
|
prevSet = currentSet;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
MorphologyImpl morphology = new MorphologyImpl(ints, rulesId, heuristics, grammaReader.getGrammaInfoAsArray());
|
MorphologyImpl morphology = new MorphologyImpl(ints, rulesId, heuristics, grammarReader.getGrammarInfoAsArray());
|
||||||
morphology.writeToFile(fileName);
|
morphology.writeToFile(fileName);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -134,8 +134,8 @@ public class StatisticsCollector implements WordProccessor {
|
|||||||
Integer length = getCommonLength(form, normalForm);
|
Integer length = getCommonLength(form, normalForm);
|
||||||
Integer actualSuffixLengh = form.length() - length;
|
Integer actualSuffixLengh = form.length() - length;
|
||||||
String actualNormalSuffix = normalForm.substring(length);
|
String actualNormalSuffix = normalForm.substring(length);
|
||||||
Integer integer = grammaReader.getGrammInversIndex().get(fm.getCode());
|
Integer integer = grammarReader.getGrammarInverseIndex().get(fm.getCode());
|
||||||
Integer nf = grammaReader.getGrammInversIndex().get(normalSuffixForm);
|
Integer nf = grammarReader.getGrammarInverseIndex().get(normalSuffixForm);
|
||||||
return new Heuristic((byte) actualSuffixLengh.intValue(), actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue());
|
return new Heuristic((byte) actualSuffixLengh.intValue(), actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -20,9 +20,8 @@ import java.io.IOException;
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Interface that allows getting information from
|
* Interface that allows getting information from
|
||||||
* {@org.apache.lucene.russian.morphology.dictonary.DirtonaryReader}.
|
|
||||||
*/
|
*/
|
||||||
public interface WordProccessor {
|
public interface WordProcessor {
|
||||||
|
|
||||||
public void process(WordCard wordCard) throws IOException;
|
public void process(WordCard wordCard) throws IOException;
|
||||||
}
|
}
|
@ -28,14 +28,14 @@ import java.util.List;
|
|||||||
public class EnglishHeuristicBuilder {
|
public class EnglishHeuristicBuilder {
|
||||||
public static void main(String[] args) throws IOException {
|
public static void main(String[] args) throws IOException {
|
||||||
|
|
||||||
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/egramtab.tab");
|
GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/egramtab.tab");
|
||||||
EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
|
EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
|
||||||
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
|
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
|
||||||
|
|
||||||
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>(), filters);
|
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>(), filters);
|
||||||
|
|
||||||
StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
|
StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder);
|
||||||
dictionaryReader.proccess(statisticsCollector);
|
dictionaryReader.process(statisticsCollector);
|
||||||
statisticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info");
|
statisticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -27,14 +27,14 @@ import java.util.List;
|
|||||||
|
|
||||||
public class RussianHeuristicBuilder {
|
public class RussianHeuristicBuilder {
|
||||||
public static void main(String[] args) throws IOException {
|
public static void main(String[] args) throws IOException {
|
||||||
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
|
GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/rgramtab.tab");
|
||||||
RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
|
RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
|
||||||
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
|
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
|
||||||
|
|
||||||
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>(), filters);
|
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>(), filters);
|
||||||
|
|
||||||
StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
|
StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder);
|
||||||
dictionaryReader.proccess(statisticsCollector);
|
dictionaryReader.process(statisticsCollector);
|
||||||
statisticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info");
|
statisticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -67,9 +67,9 @@ public class TestAllWords {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private void testFullGramma(final Morphology morphology, LetterDecoderEncoder decoderEncoder, String pathToGramma, String pathToDict) throws IOException {
|
private void testFullGramma(final Morphology morphology, LetterDecoderEncoder decoderEncoder, String pathToGramma, String pathToDict) throws IOException {
|
||||||
GrammaReader grammaInfo = new GrammaReader(pathToGramma);
|
GrammarReader grammarInfo = new GrammarReader(pathToGramma);
|
||||||
final List<String> morphInfo = grammaInfo.getGrammaInfo();
|
final List<String> morphInfo = grammarInfo.getGrammarInfo();
|
||||||
final Map<String, Integer> inversIndex = grammaInfo.getGrammInversIndex();
|
final Map<String, Integer> inversIndex = grammarInfo.getGrammarInverseIndex();
|
||||||
|
|
||||||
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
|
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
|
||||||
|
|
||||||
@ -79,7 +79,7 @@ public class TestAllWords {
|
|||||||
final AtomicLong wordCount = new AtomicLong(0);
|
final AtomicLong wordCount = new AtomicLong(0);
|
||||||
Long startTime = System.currentTimeMillis();
|
Long startTime = System.currentTimeMillis();
|
||||||
|
|
||||||
dictionaryReader.proccess(new WordProccessor() {
|
dictionaryReader.process(new WordProcessor() {
|
||||||
public void process(WordCard wordCard) throws IOException {
|
public void process(WordCard wordCard) throws IOException {
|
||||||
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
|
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
|
||||||
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
||||||
@ -125,7 +125,7 @@ public class TestAllWords {
|
|||||||
Long startTime = System.currentTimeMillis();
|
Long startTime = System.currentTimeMillis();
|
||||||
|
|
||||||
DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet<String>(), filters);
|
DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet<String>(), filters);
|
||||||
dictionaryReader.proccess(new WordProccessor() {
|
dictionaryReader.process(new WordProcessor() {
|
||||||
public void process(WordCard wordCard) throws IOException {
|
public void process(WordCard wordCard) throws IOException {
|
||||||
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
|
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
|
||||||
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
||||||
|
@ -32,13 +32,13 @@ import static org.hamcrest.Matchers.equalTo;
|
|||||||
import static org.junit.Assert.assertThat;
|
import static org.junit.Assert.assertThat;
|
||||||
|
|
||||||
|
|
||||||
public class AnalayzersTest {
|
public class AnalyzersTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void englishAnalyzerShouldGiveCorrectWords() throws IOException {
|
public void englishAnalyzerShouldGiveCorrectWords() throws IOException {
|
||||||
Analyzer morphlogyAnalyzer = new EnglishAnalyzer();
|
Analyzer morphlogyAnalyzer = new EnglishAnalyzer();
|
||||||
String answerPath = "/english/englsih-analayzer-answer.txt";
|
String answerPath = "/english/english-analyzer-answer.txt";
|
||||||
String testPath = "/english/englsih-analayzer-data.txt";
|
String testPath = "/english/english-analyzer-data.txt";
|
||||||
|
|
||||||
testAnalayzer(morphlogyAnalyzer, answerPath, testPath);
|
testAnalayzer(morphlogyAnalyzer, answerPath, testPath);
|
||||||
}
|
}
|
||||||
@ -46,8 +46,8 @@ public class AnalayzersTest {
|
|||||||
@Test
|
@Test
|
||||||
public void shoudGiveCorretWords() throws IOException {
|
public void shoudGiveCorretWords() throws IOException {
|
||||||
Analyzer morphlogyAnalyzer = new RussianAnalyzer();
|
Analyzer morphlogyAnalyzer = new RussianAnalyzer();
|
||||||
String answerPath = "/russian/russian-analayzer-answer.txt";
|
String answerPath = "/russian/russian-analyzer-answer.txt";
|
||||||
String testPath = "/russian/russian-analayzer-data.txt";
|
String testPath = "/russian/russian-analyzer-data.txt";
|
||||||
|
|
||||||
testAnalayzer(morphlogyAnalyzer, answerPath, testPath);
|
testAnalayzer(morphlogyAnalyzer, answerPath, testPath);
|
||||||
}
|
}
|
@ -1,58 +0,0 @@
|
|||||||
/**
|
|
||||||
* Copyright 2009 Alexander Kuznetsov
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.morphology.english;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
|
||||||
import static org.hamcrest.Matchers.equalTo;
|
|
||||||
import static org.junit.Assert.assertThat;
|
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStream;
|
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.HashSet;
|
|
||||||
|
|
||||||
|
|
||||||
public class EnglishAnalayzerTest {
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void shouldGiveCorrectWords() throws IOException {
|
|
||||||
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/englsih-analayzer-answer.txt");
|
|
||||||
BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
|
||||||
String[] strings = breader.readLine().replaceAll(" +", " ").trim().split(" ");
|
|
||||||
HashSet<String> answer = new HashSet<String>(Arrays.asList(strings));
|
|
||||||
stream.close();
|
|
||||||
|
|
||||||
EnglishAnalyzer morphlogyAnalyzer = new EnglishAnalyzer();
|
|
||||||
stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/englsih-analayzer-data.txt");
|
|
||||||
|
|
||||||
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
|
|
||||||
|
|
||||||
TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader);
|
|
||||||
HashSet<String> result = new HashSet<String>();
|
|
||||||
while (tokenStream.incrementToken()) {
|
|
||||||
TermAttribute attribute1 = tokenStream.getAttribute(TermAttribute.class);
|
|
||||||
result.add(attribute1.term());
|
|
||||||
}
|
|
||||||
|
|
||||||
stream.close();
|
|
||||||
|
|
||||||
assertThat(result, equalTo(answer));
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,55 +0,0 @@
|
|||||||
/**
|
|
||||||
* Copyright 2009 Alexander Kuznetsov
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.morphology.english;
|
|
||||||
|
|
||||||
import org.apache.lucene.morphology.LuceneMorphology;
|
|
||||||
import static org.hamcrest.Matchers.equalTo;
|
|
||||||
import static org.junit.Assert.assertThat;
|
|
||||||
import org.junit.Before;
|
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStream;
|
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
public class EnglishLuceneMorphTest {
|
|
||||||
private LuceneMorphology luceneMorph;
|
|
||||||
|
|
||||||
@Before
|
|
||||||
public void setUp() throws IOException {
|
|
||||||
luceneMorph = new LuceneMorphology(this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void shoudGetCorrentMorphInfo() throws IOException {
|
|
||||||
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/english-morphology-test.txt");
|
|
||||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
|
||||||
String s = bufferedReader.readLine();
|
|
||||||
while (s != null) {
|
|
||||||
String[] qa = s.trim().split(" ");
|
|
||||||
Set<String> result = new HashSet<String>();
|
|
||||||
for (int i = 1; i < qa.length; i++) {
|
|
||||||
result.add(qa[i]);
|
|
||||||
}
|
|
||||||
Set<String> stringList = new HashSet<String>(luceneMorph.getNormalForms(qa[0]));
|
|
||||||
assertThat(stringList, equalTo(result));
|
|
||||||
s = bufferedReader.readLine();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,8 +0,0 @@
|
|||||||
purchases purchase
|
|
||||||
existing exist
|
|
||||||
was be
|
|
||||||
men man
|
|
||||||
bore bore bear
|
|
||||||
grown grow grown
|
|
||||||
came come
|
|
||||||
md md
|
|
@ -1 +0,0 @@
|
|||||||
following follow the instruction exactly will be help ensure the best well good result
|
|
@ -1 +0,0 @@
|
|||||||
Following the instructions exactly will help ensure the best results
|
|
@ -18,7 +18,6 @@ package org.apache.lucene.morphology;
|
|||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
|
||||||
@ -26,7 +25,7 @@ public class MorphologyImpl implements Morphology {
|
|||||||
protected int[][] separators;
|
protected int[][] separators;
|
||||||
protected short[] rulesId;
|
protected short[] rulesId;
|
||||||
protected Heuristic[][] rules;
|
protected Heuristic[][] rules;
|
||||||
protected String[] grammaInfo;
|
protected String[] grammarInfo;
|
||||||
protected LetterDecoderEncoder decoderEncoder;
|
protected LetterDecoderEncoder decoderEncoder;
|
||||||
|
|
||||||
|
|
||||||
@ -40,27 +39,11 @@ public class MorphologyImpl implements Morphology {
|
|||||||
this.decoderEncoder = decoderEncoder;
|
this.decoderEncoder = decoderEncoder;
|
||||||
}
|
}
|
||||||
|
|
||||||
public MorphologyImpl(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) {
|
public MorphologyImpl(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammarInfo) {
|
||||||
this.separators = separators;
|
this.separators = separators;
|
||||||
this.rulesId = rulesId;
|
this.rulesId = rulesId;
|
||||||
this.rules = rules;
|
this.rules = rules;
|
||||||
this.grammaInfo = grammaInfo;
|
this.grammarInfo = grammarInfo;
|
||||||
}
|
|
||||||
|
|
||||||
public int[][] getSeparators() {
|
|
||||||
return separators;
|
|
||||||
}
|
|
||||||
|
|
||||||
public short[] getRulesId() {
|
|
||||||
return rulesId;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Heuristic[][] getRules() {
|
|
||||||
return rules;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String[] getGrammaInfo() {
|
|
||||||
return grammaInfo;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<String> getNormalForms(String s) {
|
public List<String> getNormalForms(String s) {
|
||||||
@ -78,7 +61,7 @@ public class MorphologyImpl implements Morphology {
|
|||||||
int[] ints = decoderEncoder.encodeToArray(revertWord(s));
|
int[] ints = decoderEncoder.encodeToArray(revertWord(s));
|
||||||
int ruleId = findRuleId(ints);
|
int ruleId = findRuleId(ints);
|
||||||
for (Heuristic h : rules[rulesId[ruleId]]) {
|
for (Heuristic h : rules[rulesId[ruleId]]) {
|
||||||
result.add(h.transformWord(s) + "|" + grammaInfo[h.getFormMorphInfo()]);
|
result.add(h.transformWord(s) + "|" + grammarInfo[h.getFormMorphInfo()]);
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
@ -135,8 +118,8 @@ public class MorphologyImpl implements Morphology {
|
|||||||
writer.write(heuristic.toString() + "\n");
|
writer.write(heuristic.toString() + "\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
writer.write(grammaInfo.length + "\n");
|
writer.write(grammarInfo.length + "\n");
|
||||||
for (String s : grammaInfo) {
|
for (String s : grammarInfo) {
|
||||||
writer.write(s + "\n");
|
writer.write(s + "\n");
|
||||||
}
|
}
|
||||||
writer.close();
|
writer.close();
|
||||||
@ -166,9 +149,9 @@ public class MorphologyImpl implements Morphology {
|
|||||||
Integer amount;
|
Integer amount;
|
||||||
s = bufferedReader.readLine();
|
s = bufferedReader.readLine();
|
||||||
amount = Integer.valueOf(s);
|
amount = Integer.valueOf(s);
|
||||||
grammaInfo = new String[amount];
|
grammarInfo = new String[amount];
|
||||||
for (int i = 0; i < amount; i++) {
|
for (int i = 0; i < amount; i++) {
|
||||||
grammaInfo[i] = bufferedReader.readLine();
|
grammarInfo[i] = bufferedReader.readLine();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -48,6 +48,6 @@ public class MorphologyAnalyzer extends Analyzer {
|
|||||||
TokenStream result = new StandardTokenizer(Version.LUCENE_30, reader);
|
TokenStream result = new StandardTokenizer(Version.LUCENE_30, reader);
|
||||||
result = new StandardFilter(result);
|
result = new StandardFilter(result);
|
||||||
result = new LowerCaseFilter(result);
|
result = new LowerCaseFilter(result);
|
||||||
return new MorphlogyFilter(result, luceneMorph);
|
return new MorphologyFilter(result, luceneMorph);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -25,12 +25,12 @@ import java.io.IOException;
|
|||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
|
||||||
public class MorphlogyFilter extends TokenFilter {
|
public class MorphologyFilter extends TokenFilter {
|
||||||
private LuceneMorphology luceneMorph;
|
private LuceneMorphology luceneMorph;
|
||||||
private Iterator<String> iterator;
|
private Iterator<String> iterator;
|
||||||
private TermAttribute termAtt;
|
private TermAttribute termAtt;
|
||||||
|
|
||||||
public MorphlogyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) {
|
public MorphologyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) {
|
||||||
super(tokenStream);
|
super(tokenStream);
|
||||||
this.luceneMorph = luceneMorph;
|
this.luceneMorph = luceneMorph;
|
||||||
termAtt = addAttribute(TermAttribute.class);
|
termAtt = addAttribute(TermAttribute.class);
|
@ -1,59 +0,0 @@
|
|||||||
/**
|
|
||||||
* Copyright 2009 Alexander Kuznetsov
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.morphology.russian;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
|
||||||
import static org.hamcrest.Matchers.equalTo;
|
|
||||||
import static org.junit.Assert.assertThat;
|
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStream;
|
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.HashSet;
|
|
||||||
|
|
||||||
|
|
||||||
public class RussianAnalayzerTest {
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void shoudGiveCorretWords() throws IOException {
|
|
||||||
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/russian-analayzer-answer.txt");
|
|
||||||
BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
|
||||||
String[] strings = breader.readLine().replaceAll(" +", " ").trim().split(" ");
|
|
||||||
HashSet<String> answer = new HashSet<String>(Arrays.asList(strings));
|
|
||||||
stream.close();
|
|
||||||
|
|
||||||
RussianAnalyzer morphlogyAnalyzer = new RussianAnalyzer();
|
|
||||||
stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/russian-analayzer-data.txt");
|
|
||||||
|
|
||||||
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
|
|
||||||
|
|
||||||
TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader);
|
|
||||||
HashSet<String> result = new HashSet<String>();
|
|
||||||
while (tokenStream.incrementToken()) {
|
|
||||||
TermAttribute attribute1 = tokenStream.getAttribute(TermAttribute.class);
|
|
||||||
result.add(attribute1.term());
|
|
||||||
}
|
|
||||||
|
|
||||||
stream.close();
|
|
||||||
|
|
||||||
assertThat(result, equalTo(answer));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,55 +0,0 @@
|
|||||||
/**
|
|
||||||
* Copyright 2009 Alexander Kuznetsov
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.morphology.russian;
|
|
||||||
|
|
||||||
import org.apache.lucene.morphology.LuceneMorphology;
|
|
||||||
import static org.hamcrest.Matchers.equalTo;
|
|
||||||
import static org.junit.Assert.assertThat;
|
|
||||||
import org.junit.Before;
|
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStream;
|
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
public class RussianLuceneMorphTest {
|
|
||||||
private LuceneMorphology luceneMorph;
|
|
||||||
|
|
||||||
@Before
|
|
||||||
public void setUp() throws IOException {
|
|
||||||
luceneMorph = new LuceneMorphology(this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void shoudGetCorrentMorphInfo() throws IOException {
|
|
||||||
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/russian-morphology-test.txt");
|
|
||||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
|
||||||
String s = bufferedReader.readLine();
|
|
||||||
while (s != null) {
|
|
||||||
String[] qa = s.trim().split(" ");
|
|
||||||
Set<String> result = new HashSet<String>();
|
|
||||||
for (int i = 1; i < qa.length; i++) {
|
|
||||||
result.add(qa[i]);
|
|
||||||
}
|
|
||||||
Set<String> stringList = new HashSet<String>(luceneMorph.getNormalForms(qa[0]));
|
|
||||||
assertThat(stringList, equalTo(result));
|
|
||||||
s = bufferedReader.readLine();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
@ -1 +0,0 @@
|
|||||||
в результат крушение погибнуть командир отряд специальный назначение пря при переть гувд ростовский область полковник милиция михаил перов и предприниматель
|
|
@ -1 +0,0 @@
|
|||||||
В результате крушения погибли командир отряда специального назначения при ГУВД Ростовской области полковник милиции Михаил Перов и предприниматель
|
|
@ -1,19 +0,0 @@
|
|||||||
еду еда ехать
|
|
||||||
тестов тест
|
|
||||||
вина вино вина
|
|
||||||
вино вино
|
|
||||||
ехать ехать
|
|
||||||
ананасов ананас ананасовый
|
|
||||||
сухой сухой
|
|
||||||
дураков дурак
|
|
||||||
пушка пушка пушок
|
|
||||||
пушок пушок
|
|
||||||
пушек пушка
|
|
||||||
козлов козлов козловый козел
|
|
||||||
жуков жуков жук
|
|
||||||
красив красить красивый
|
|
||||||
красивая красивый
|
|
||||||
тосклив тоскливый
|
|
||||||
лучший хороший
|
|
||||||
на на
|
|
||||||
тест тест тесто
|
|
Loading…
x
Reference in New Issue
Block a user