fixing some spelling errors

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@100 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
Alexander.A.Kuznetsov 2010-10-08 12:18:18 +00:00
parent 76e68a11e0
commit 3de894404c
25 changed files with 55 additions and 342 deletions

View File

@ -42,25 +42,18 @@ public class DictionaryReader {
this.filters = filters;
}
public DictionaryReader(String fileName, String fileEncoding, Set<String> ignoredForm, List<WordFilter> filters) {
this.fileName = fileName;
this.fileEncoding = fileEncoding;
this.ignoredForm = ignoredForm;
this.filters = filters;
}
public void proccess(WordProccessor wordProccessor) throws IOException {
public void process(WordProcessor wordProcessor) throws IOException {
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), fileEncoding));
readFlexias(bufferedReader);
sckipBlock(bufferedReader);
sckipBlock(bufferedReader);
skipBlock(bufferedReader);
skipBlock(bufferedReader);
readPrefix(bufferedReader);
readWords(bufferedReader, wordProccessor);
readWords(bufferedReader, wordProcessor);
}
private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
private void readWords(BufferedReader reader, WordProcessor wordProcessor) throws IOException {
String s = reader.readLine();
int count = Integer.valueOf(s);
int actual = 0;
@ -79,7 +72,7 @@ public class DictionaryReader {
continue;
}
wordProccessor.process(card);
wordProcessor.process(card);
actual++;
}
@ -106,11 +99,11 @@ public class DictionaryReader {
}
private void sckipBlock(BufferedReader reader) throws IOException {
private void skipBlock(BufferedReader reader) throws IOException {
String s = reader.readLine();
int count = Integer.valueOf(s);
for (int i = 0; i < count; i++) {
s = reader.readLine();
reader.readLine();
}
}

View File

@ -25,19 +25,19 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
//todo split this class in two.
public class GrammaReader {
public class GrammarReader {
private String fileName;
private String fileEncoding = "windows-1251";
private List<String> grammaInfo = new ArrayList<String>();
private List<String> grammarInfo = new ArrayList<String>();
private Map<String, Integer> inverseIndex = new HashMap<String, Integer>();
public GrammaReader(String fileName) throws IOException {
public GrammarReader(String fileName) throws IOException {
this.fileName = fileName;
setUp();
}
public GrammaReader(String fileName, String fileEncoding) throws IOException {
public GrammarReader(String fileName, String fileEncoding) throws IOException {
this.fileName = fileName;
this.fileEncoding = fileEncoding;
setUp();
@ -50,27 +50,23 @@ public class GrammaReader {
line = line.trim();
if (!line.startsWith("//") && line.length() > 0) {
String[] strings = line.split(" ", 2);
Integer i = grammaInfo.size();
Integer i = grammarInfo.size();
inverseIndex.put(strings[0], i);
grammaInfo.add(i, strings[1]);
grammarInfo.add(i, strings[1]);
}
line = bufferedReader.readLine();
}
}
public List<String> getGrammaInfo() {
return grammaInfo;
public List<String> getGrammarInfo() {
return grammarInfo;
}
public String[] getGrammaInfoAsArray() {
return grammaInfo.toArray(new String[grammaInfo.size()]);
public String[] getGrammarInfoAsArray() {
return grammarInfo.toArray(new String[grammarInfo.size()]);
}
public Map<String, Integer> getGrammInversIndex() {
public Map<String, Integer> getGrammarInverseIndex() {
return inverseIndex;
}
public void setInverseIndex(Map<String, Integer> inverseIndex) {
this.inverseIndex = inverseIndex;
}
}

View File

@ -26,16 +26,16 @@ import java.util.*;
//todo refactor this class
public class StatisticsCollector implements WordProccessor {
public class StatisticsCollector implements WordProcessor {
private TreeMap<String, Set<Heuristic>> inverseIndex = new TreeMap<String, Set<Heuristic>>();
private Map<Set<Heuristic>, Integer> ruleInverseIndex = new HashMap<Set<Heuristic>, Integer>();
private List<Set<Heuristic>> rules = new ArrayList<Set<Heuristic>>();
private GrammaReader grammaReader;
private GrammarReader grammarReader;
private LetterDecoderEncoder decoderEncoder;
public StatisticsCollector(GrammaReader grammaReader, LetterDecoderEncoder decoderEncoder) {
this.grammaReader = grammaReader;
public StatisticsCollector(GrammarReader grammarReader, LetterDecoderEncoder decoderEncoder) {
this.grammarReader = grammarReader;
this.decoderEncoder = decoderEncoder;
}
@ -115,7 +115,7 @@ public class StatisticsCollector implements WordProccessor {
prevSet = currentSet;
}
}
MorphologyImpl morphology = new MorphologyImpl(ints, rulesId, heuristics, grammaReader.getGrammaInfoAsArray());
MorphologyImpl morphology = new MorphologyImpl(ints, rulesId, heuristics, grammarReader.getGrammarInfoAsArray());
morphology.writeToFile(fileName);
}
@ -134,8 +134,8 @@ public class StatisticsCollector implements WordProccessor {
Integer length = getCommonLength(form, normalForm);
Integer actualSuffixLengh = form.length() - length;
String actualNormalSuffix = normalForm.substring(length);
Integer integer = grammaReader.getGrammInversIndex().get(fm.getCode());
Integer nf = grammaReader.getGrammInversIndex().get(normalSuffixForm);
Integer integer = grammarReader.getGrammarInverseIndex().get(fm.getCode());
Integer nf = grammarReader.getGrammarInverseIndex().get(normalSuffixForm);
return new Heuristic((byte) actualSuffixLengh.intValue(), actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue());
}

View File

@ -20,9 +20,8 @@ import java.io.IOException;
/**
* Interface that allows receiving information from
* {@link org.apache.lucene.russian.morphology.dictonary.DictionaryReader}.
*/
public interface WordProccessor {
public interface WordProcessor {
public void process(WordCard wordCard) throws IOException;
}

View File

@ -28,14 +28,14 @@ import java.util.List;
public class EnglishHeuristicBuilder {
public static void main(String[] args) throws IOException {
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/egramtab.tab");
GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/egramtab.tab");
EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>(), filters);
StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
dictionaryReader.proccess(statisticsCollector);
StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder);
dictionaryReader.process(statisticsCollector);
statisticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info");
}

View File

@ -27,14 +27,14 @@ import java.util.List;
public class RussianHeuristicBuilder {
public static void main(String[] args) throws IOException {
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/rgramtab.tab");
RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>(), filters);
StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
dictionaryReader.proccess(statisticsCollector);
StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder);
dictionaryReader.process(statisticsCollector);
statisticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info");
}

View File

@ -67,9 +67,9 @@ public class TestAllWords {
}
private void testFullGramma(final Morphology morphology, LetterDecoderEncoder decoderEncoder, String pathToGramma, String pathToDict) throws IOException {
GrammaReader grammaInfo = new GrammaReader(pathToGramma);
final List<String> morphInfo = grammaInfo.getGrammaInfo();
final Map<String, Integer> inversIndex = grammaInfo.getGrammInversIndex();
GrammarReader grammarInfo = new GrammarReader(pathToGramma);
final List<String> morphInfo = grammarInfo.getGrammarInfo();
final Map<String, Integer> inversIndex = grammarInfo.getGrammarInverseIndex();
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
@ -79,7 +79,7 @@ public class TestAllWords {
final AtomicLong wordCount = new AtomicLong(0);
Long startTime = System.currentTimeMillis();
dictionaryReader.proccess(new WordProccessor() {
dictionaryReader.process(new WordProcessor() {
public void process(WordCard wordCard) throws IOException {
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
for (FlexiaModel fm : wordCard.getWordsForms()) {
@ -125,7 +125,7 @@ public class TestAllWords {
Long startTime = System.currentTimeMillis();
DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet<String>(), filters);
dictionaryReader.proccess(new WordProccessor() {
dictionaryReader.process(new WordProcessor() {
public void process(WordCard wordCard) throws IOException {
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
for (FlexiaModel fm : wordCard.getWordsForms()) {

View File

@ -32,13 +32,13 @@ import static org.hamcrest.Matchers.equalTo;
import static org.junit.Assert.assertThat;
public class AnalayzersTest {
public class AnalyzersTest {
@Test
public void englishAnalyzerShouldGiveCorrectWords() throws IOException {
Analyzer morphlogyAnalyzer = new EnglishAnalyzer();
String answerPath = "/english/englsih-analayzer-answer.txt";
String testPath = "/english/englsih-analayzer-data.txt";
String answerPath = "/english/english-analyzer-answer.txt";
String testPath = "/english/english-analyzer-data.txt";
testAnalayzer(morphlogyAnalyzer, answerPath, testPath);
}
@ -46,8 +46,8 @@ public class AnalayzersTest {
@Test
public void shoudGiveCorretWords() throws IOException {
Analyzer morphlogyAnalyzer = new RussianAnalyzer();
String answerPath = "/russian/russian-analayzer-answer.txt";
String testPath = "/russian/russian-analayzer-data.txt";
String answerPath = "/russian/russian-analyzer-answer.txt";
String testPath = "/russian/russian-analyzer-data.txt";
testAnalayzer(morphlogyAnalyzer, answerPath, testPath);
}

View File

@ -1,58 +0,0 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.english;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import static org.hamcrest.Matchers.equalTo;
import static org.junit.Assert.assertThat;
import org.junit.Test;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.HashSet;
/**
 * Runs {@link EnglishAnalyzer} over a sample text stored on the classpath and compares the
 * set of produced terms with the expected set stored in a second classpath resource.
 */
public class EnglishAnalayzerTest {

    /** Resource whose first line holds the expected terms, separated by spaces. */
    private static final String ANSWER_RESOURCE = "/org/apache/lucene/morphology/english/englsih-analayzer-answer.txt";

    /** Resource holding the text that is fed to the analyzer. */
    private static final String DATA_RESOURCE = "/org/apache/lucene/morphology/english/englsih-analayzer-data.txt";

    /**
     * Asserts that the terms produced for the sample text equal the expected terms.
     *
     * @throws IOException if a resource is missing or cannot be read
     */
    @Test
    public void shouldGiveCorrectWords() throws IOException {
        HashSet<String> answer = readExpectedWords(ANSWER_RESOURCE);
        HashSet<String> result = analyze(new EnglishAnalyzer(), DATA_RESOURCE);
        assertThat(result, equalTo(answer));
    }

    /**
     * Reads the first line of the given resource and splits it into a set of words.
     *
     * @param path absolute classpath location of the answer file
     * @return the distinct words found on the first line
     * @throws IOException if the resource is missing or cannot be read
     */
    private HashSet<String> readExpectedWords(String path) throws IOException {
        InputStream stream = openResource(path);
        try {
            BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
            // Collapse runs of spaces first so that split(" ") yields no empty tokens between words.
            String[] strings = breader.readLine().replaceAll(" +", " ").trim().split(" ");
            return new HashSet<String>(Arrays.asList(strings));
        } finally {
            // Close even when reading fails, so the stream is not leaked.
            stream.close();
        }
    }

    /**
     * Feeds the content of the given resource to the analyzer and collects every emitted term.
     *
     * @param analyzer analyzer under test
     * @param path     absolute classpath location of the input text
     * @return the distinct terms emitted by the analyzer
     * @throws IOException if the resource is missing or cannot be read
     */
    private HashSet<String> analyze(EnglishAnalyzer analyzer, String path) throws IOException {
        InputStream stream = openResource(path);
        try {
            InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
            TokenStream tokenStream = analyzer.tokenStream(null, reader);
            HashSet<String> result = new HashSet<String>();
            while (tokenStream.incrementToken()) {
                TermAttribute attribute1 = tokenStream.getAttribute(TermAttribute.class);
                result.add(attribute1.term());
            }
            return result;
        } finally {
            // Close even when analysis fails, so the stream is not leaked.
            stream.close();
        }
    }

    /**
     * Opens a classpath resource, failing with a descriptive message instead of a later
     * NullPointerException when the resource does not exist.
     *
     * @param path absolute classpath location
     * @return an open stream; the caller is responsible for closing it
     * @throws IOException if the resource cannot be found
     */
    private InputStream openResource(String path) throws IOException {
        InputStream stream = this.getClass().getResourceAsStream(path);
        if (stream == null) {
            throw new IOException("Test resource not found on classpath: " + path);
        }
        return stream;
    }
}

View File

@ -1,55 +0,0 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.english;
import org.apache.lucene.morphology.LuceneMorphology;
import static org.hamcrest.Matchers.equalTo;
import static org.junit.Assert.assertThat;
import org.junit.Before;
import org.junit.Test;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Set;
/**
 * Checks the normal forms returned by {@link LuceneMorphology} for English words against the
 * expectations listed in a classpath resource.
 */
public class EnglishLuceneMorphTest {

    /** Resource with one test case per line: the word, then its expected normal forms, space separated. */
    private static final String TEST_RESOURCE = "/org/apache/lucene/morphology/english/english-morphology-test.txt";

    private LuceneMorphology luceneMorph;

    /**
     * Builds the morphology under test from the {@code morph.info} classpath resource.
     *
     * @throws IOException declared by the {@code LuceneMorphology} constructor call
     */
    @Before
    public void setUp() throws IOException {
        luceneMorph = new LuceneMorphology(this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder());
    }

    /**
     * For every line of the test resource, asserts that the set of normal forms returned for
     * the first token equals the set made of the remaining tokens on that line.
     *
     * @throws IOException if the resource is missing or cannot be read
     */
    @Test
    public void shoudGetCorrentMorphInfo() throws IOException {
        InputStream stream = this.getClass().getResourceAsStream(TEST_RESOURCE);
        if (stream == null) {
            // Fail with the resource name rather than an opaque NullPointerException.
            throw new IOException("Test resource not found on classpath: " + TEST_RESOURCE);
        }
        try {
            BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
            for (String s = bufferedReader.readLine(); s != null; s = bufferedReader.readLine()) {
                String[] qa = s.trim().split(" ");
                // qa[0] is the word being queried; everything after it is an expected normal form.
                Set<String> result = new HashSet<String>();
                for (int i = 1; i < qa.length; i++) {
                    result.add(qa[i]);
                }
                Set<String> stringList = new HashSet<String>(luceneMorph.getNormalForms(qa[0]));
                assertThat(stringList, equalTo(result));
            }
        } finally {
            // The stream was previously never closed; release it even when an assertion fails.
            stream.close();
        }
    }
}

View File

@ -1,8 +0,0 @@
purchases purchase
existing exist
was be
men man
bore bore bear
grown grow grown
came come
md md

View File

@ -1 +0,0 @@
following follow the instruction exactly will be help ensure the best well good result

View File

@ -1 +0,0 @@
Following the instructions exactly will help ensure the best results

View File

@ -18,7 +18,6 @@ package org.apache.lucene.morphology;
import java.io.*;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
@ -26,7 +25,7 @@ public class MorphologyImpl implements Morphology {
protected int[][] separators;
protected short[] rulesId;
protected Heuristic[][] rules;
protected String[] grammaInfo;
protected String[] grammarInfo;
protected LetterDecoderEncoder decoderEncoder;
@ -40,27 +39,11 @@ public class MorphologyImpl implements Morphology {
this.decoderEncoder = decoderEncoder;
}
public MorphologyImpl(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) {
public MorphologyImpl(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammarInfo) {
this.separators = separators;
this.rulesId = rulesId;
this.rules = rules;
this.grammaInfo = grammaInfo;
}
public int[][] getSeparators() {
return separators;
}
public short[] getRulesId() {
return rulesId;
}
public Heuristic[][] getRules() {
return rules;
}
public String[] getGrammaInfo() {
return grammaInfo;
this.grammarInfo = grammarInfo;
}
public List<String> getNormalForms(String s) {
@ -78,7 +61,7 @@ public class MorphologyImpl implements Morphology {
int[] ints = decoderEncoder.encodeToArray(revertWord(s));
int ruleId = findRuleId(ints);
for (Heuristic h : rules[rulesId[ruleId]]) {
result.add(h.transformWord(s) + "|" + grammaInfo[h.getFormMorphInfo()]);
result.add(h.transformWord(s) + "|" + grammarInfo[h.getFormMorphInfo()]);
}
return result;
}
@ -135,8 +118,8 @@ public class MorphologyImpl implements Morphology {
writer.write(heuristic.toString() + "\n");
}
}
writer.write(grammaInfo.length + "\n");
for (String s : grammaInfo) {
writer.write(grammarInfo.length + "\n");
for (String s : grammarInfo) {
writer.write(s + "\n");
}
writer.close();
@ -166,9 +149,9 @@ public class MorphologyImpl implements Morphology {
Integer amount;
s = bufferedReader.readLine();
amount = Integer.valueOf(s);
grammaInfo = new String[amount];
grammarInfo = new String[amount];
for (int i = 0; i < amount; i++) {
grammaInfo[i] = bufferedReader.readLine();
grammarInfo[i] = bufferedReader.readLine();
}
}

View File

@ -48,6 +48,6 @@ public class MorphologyAnalyzer extends Analyzer {
TokenStream result = new StandardTokenizer(Version.LUCENE_30, reader);
result = new StandardFilter(result);
result = new LowerCaseFilter(result);
return new MorphlogyFilter(result, luceneMorph);
return new MorphologyFilter(result, luceneMorph);
}
}

View File

@ -25,12 +25,12 @@ import java.io.IOException;
import java.util.Iterator;
public class MorphlogyFilter extends TokenFilter {
public class MorphologyFilter extends TokenFilter {
private LuceneMorphology luceneMorph;
private Iterator<String> iterator;
private TermAttribute termAtt;
public MorphlogyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) {
public MorphologyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) {
super(tokenStream);
this.luceneMorph = luceneMorph;
termAtt = addAttribute(TermAttribute.class);

View File

@ -1,59 +0,0 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.russian;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import static org.hamcrest.Matchers.equalTo;
import static org.junit.Assert.assertThat;
import org.junit.Test;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.HashSet;
/**
 * Runs {@link RussianAnalyzer} over a sample text stored on the classpath and compares the
 * set of produced terms with the expected set stored in a second classpath resource.
 */
public class RussianAnalayzerTest {

    /** Resource whose first line holds the expected terms, separated by spaces. */
    private static final String ANSWER_RESOURCE = "/org/apache/lucene/morphology/russian/russian-analayzer-answer.txt";

    /** Resource holding the text that is fed to the analyzer. */
    private static final String DATA_RESOURCE = "/org/apache/lucene/morphology/russian/russian-analayzer-data.txt";

    /**
     * Asserts that the terms produced for the sample text equal the expected terms.
     *
     * @throws IOException if a resource is missing or cannot be read
     */
    @Test
    public void shoudGiveCorretWords() throws IOException {
        HashSet<String> answer = readExpectedWords(ANSWER_RESOURCE);
        HashSet<String> result = analyze(new RussianAnalyzer(), DATA_RESOURCE);
        assertThat(result, equalTo(answer));
    }

    /**
     * Reads the first line of the given resource and splits it into a set of words.
     *
     * @param path absolute classpath location of the answer file
     * @return the distinct words found on the first line
     * @throws IOException if the resource is missing or cannot be read
     */
    private HashSet<String> readExpectedWords(String path) throws IOException {
        InputStream stream = openResource(path);
        try {
            BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
            // Collapse runs of spaces first so that split(" ") yields no empty tokens between words.
            String[] strings = breader.readLine().replaceAll(" +", " ").trim().split(" ");
            return new HashSet<String>(Arrays.asList(strings));
        } finally {
            // Close even when reading fails, so the stream is not leaked.
            stream.close();
        }
    }

    /**
     * Feeds the content of the given resource to the analyzer and collects every emitted term.
     *
     * @param analyzer analyzer under test
     * @param path     absolute classpath location of the input text
     * @return the distinct terms emitted by the analyzer
     * @throws IOException if the resource is missing or cannot be read
     */
    private HashSet<String> analyze(RussianAnalyzer analyzer, String path) throws IOException {
        InputStream stream = openResource(path);
        try {
            InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
            TokenStream tokenStream = analyzer.tokenStream(null, reader);
            HashSet<String> result = new HashSet<String>();
            while (tokenStream.incrementToken()) {
                TermAttribute attribute1 = tokenStream.getAttribute(TermAttribute.class);
                result.add(attribute1.term());
            }
            return result;
        } finally {
            // Close even when analysis fails, so the stream is not leaked.
            stream.close();
        }
    }

    /**
     * Opens a classpath resource, failing with a descriptive message instead of a later
     * NullPointerException when the resource does not exist.
     *
     * @param path absolute classpath location
     * @return an open stream; the caller is responsible for closing it
     * @throws IOException if the resource cannot be found
     */
    private InputStream openResource(String path) throws IOException {
        InputStream stream = this.getClass().getResourceAsStream(path);
        if (stream == null) {
            throw new IOException("Test resource not found on classpath: " + path);
        }
        return stream;
    }
}

View File

@ -1,55 +0,0 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.russian;
import org.apache.lucene.morphology.LuceneMorphology;
import static org.hamcrest.Matchers.equalTo;
import static org.junit.Assert.assertThat;
import org.junit.Before;
import org.junit.Test;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Set;
/**
 * Checks the normal forms returned by {@link LuceneMorphology} for Russian words against the
 * expectations listed in a classpath resource.
 */
public class RussianLuceneMorphTest {

    /** Resource with one test case per line: the word, then its expected normal forms, space separated. */
    private static final String TEST_RESOURCE = "/org/apache/lucene/morphology/russian/russian-morphology-test.txt";

    private LuceneMorphology luceneMorph;

    /**
     * Builds the morphology under test from the {@code morph.info} classpath resource.
     *
     * @throws IOException declared by the {@code LuceneMorphology} constructor call
     */
    @Before
    public void setUp() throws IOException {
        luceneMorph = new LuceneMorphology(this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder());
    }

    /**
     * For every line of the test resource, asserts that the set of normal forms returned for
     * the first token equals the set made of the remaining tokens on that line.
     *
     * @throws IOException if the resource is missing or cannot be read
     */
    @Test
    public void shoudGetCorrentMorphInfo() throws IOException {
        InputStream stream = this.getClass().getResourceAsStream(TEST_RESOURCE);
        if (stream == null) {
            // Fail with the resource name rather than an opaque NullPointerException.
            throw new IOException("Test resource not found on classpath: " + TEST_RESOURCE);
        }
        try {
            BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
            for (String s = bufferedReader.readLine(); s != null; s = bufferedReader.readLine()) {
                String[] qa = s.trim().split(" ");
                // qa[0] is the word being queried; everything after it is an expected normal form.
                Set<String> result = new HashSet<String>();
                for (int i = 1; i < qa.length; i++) {
                    result.add(qa[i]);
                }
                Set<String> stringList = new HashSet<String>(luceneMorph.getNormalForms(qa[0]));
                assertThat(stringList, equalTo(result));
            }
        } finally {
            // The stream was previously never closed; release it even when an assertion fails.
            stream.close();
        }
    }
}

View File

@ -1 +0,0 @@
в результат крушение погибнуть командир отряд специальный назначение пря при переть гувд ростовский область полковник милиция михаил перов и предприниматель

View File

@ -1 +0,0 @@
В результате крушения погибли командир отряда специального назначения при ГУВД Ростовской области полковник милиции Михаил Перов и предприниматель

View File

@ -1,19 +0,0 @@
еду еда ехать
тестов тест
вина вино вина
вино вино
ехать ехать
ананасов ананас ананасовый
сухой сухой
дураков дурак
пушка пушка пушок
пушок пушок
пушек пушка
козлов козлов козловый козел
жуков жуков жук
красив красить красивый
красивая красивый
тосклив тоскливый
лучший хороший
на на
тест тест тесто