fixing some spelling errors
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@100 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
@ -42,25 +42,18 @@ public class DictionaryReader {
|
||||
this.filters = filters;
|
||||
}
|
||||
|
||||
public DictionaryReader(String fileName, String fileEncoding, Set<String> ignoredForm, List<WordFilter> filters) {
|
||||
this.fileName = fileName;
|
||||
this.fileEncoding = fileEncoding;
|
||||
this.ignoredForm = ignoredForm;
|
||||
this.filters = filters;
|
||||
}
|
||||
|
||||
|
||||
public void proccess(WordProccessor wordProccessor) throws IOException {
|
||||
public void process(WordProcessor wordProcessor) throws IOException {
|
||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), fileEncoding));
|
||||
readFlexias(bufferedReader);
|
||||
sckipBlock(bufferedReader);
|
||||
sckipBlock(bufferedReader);
|
||||
skipBlock(bufferedReader);
|
||||
skipBlock(bufferedReader);
|
||||
readPrefix(bufferedReader);
|
||||
readWords(bufferedReader, wordProccessor);
|
||||
readWords(bufferedReader, wordProcessor);
|
||||
}
|
||||
|
||||
|
||||
private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
|
||||
private void readWords(BufferedReader reader, WordProcessor wordProcessor) throws IOException {
|
||||
String s = reader.readLine();
|
||||
int count = Integer.valueOf(s);
|
||||
int actual = 0;
|
||||
@ -79,7 +72,7 @@ public class DictionaryReader {
|
||||
continue;
|
||||
}
|
||||
|
||||
wordProccessor.process(card);
|
||||
wordProcessor.process(card);
|
||||
actual++;
|
||||
|
||||
}
|
||||
@ -106,11 +99,11 @@ public class DictionaryReader {
|
||||
}
|
||||
|
||||
|
||||
private void sckipBlock(BufferedReader reader) throws IOException {
|
||||
private void skipBlock(BufferedReader reader) throws IOException {
|
||||
String s = reader.readLine();
|
||||
int count = Integer.valueOf(s);
|
||||
for (int i = 0; i < count; i++) {
|
||||
s = reader.readLine();
|
||||
reader.readLine();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -25,19 +25,19 @@ import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
//todo split this class in two.
|
||||
public class GrammaReader {
|
||||
|
||||
public class GrammarReader {
|
||||
private String fileName;
|
||||
private String fileEncoding = "windows-1251";
|
||||
private List<String> grammaInfo = new ArrayList<String>();
|
||||
private List<String> grammarInfo = new ArrayList<String>();
|
||||
private Map<String, Integer> inverseIndex = new HashMap<String, Integer>();
|
||||
|
||||
public GrammaReader(String fileName) throws IOException {
|
||||
public GrammarReader(String fileName) throws IOException {
|
||||
this.fileName = fileName;
|
||||
setUp();
|
||||
}
|
||||
|
||||
public GrammaReader(String fileName, String fileEncoding) throws IOException {
|
||||
public GrammarReader(String fileName, String fileEncoding) throws IOException {
|
||||
this.fileName = fileName;
|
||||
this.fileEncoding = fileEncoding;
|
||||
setUp();
|
||||
@ -50,27 +50,23 @@ public class GrammaReader {
|
||||
line = line.trim();
|
||||
if (!line.startsWith("//") && line.length() > 0) {
|
||||
String[] strings = line.split(" ", 2);
|
||||
Integer i = grammaInfo.size();
|
||||
Integer i = grammarInfo.size();
|
||||
inverseIndex.put(strings[0], i);
|
||||
grammaInfo.add(i, strings[1]);
|
||||
grammarInfo.add(i, strings[1]);
|
||||
}
|
||||
line = bufferedReader.readLine();
|
||||
}
|
||||
}
|
||||
|
||||
public List<String> getGrammaInfo() {
|
||||
return grammaInfo;
|
||||
public List<String> getGrammarInfo() {
|
||||
return grammarInfo;
|
||||
}
|
||||
|
||||
public String[] getGrammaInfoAsArray() {
|
||||
return grammaInfo.toArray(new String[grammaInfo.size()]);
|
||||
public String[] getGrammarInfoAsArray() {
|
||||
return grammarInfo.toArray(new String[grammarInfo.size()]);
|
||||
}
|
||||
|
||||
public Map<String, Integer> getGrammInversIndex() {
|
||||
public Map<String, Integer> getGrammarInverseIndex() {
|
||||
return inverseIndex;
|
||||
}
|
||||
|
||||
public void setInverseIndex(Map<String, Integer> inverseIndex) {
|
||||
this.inverseIndex = inverseIndex;
|
||||
}
|
||||
}
|
@ -26,16 +26,16 @@ import java.util.*;
|
||||
|
||||
|
||||
//todo refactor this class
|
||||
public class StatisticsCollector implements WordProccessor {
|
||||
public class StatisticsCollector implements WordProcessor {
|
||||
private TreeMap<String, Set<Heuristic>> inverseIndex = new TreeMap<String, Set<Heuristic>>();
|
||||
private Map<Set<Heuristic>, Integer> ruleInverseIndex = new HashMap<Set<Heuristic>, Integer>();
|
||||
private List<Set<Heuristic>> rules = new ArrayList<Set<Heuristic>>();
|
||||
private GrammaReader grammaReader;
|
||||
private GrammarReader grammarReader;
|
||||
private LetterDecoderEncoder decoderEncoder;
|
||||
|
||||
|
||||
public StatisticsCollector(GrammaReader grammaReader, LetterDecoderEncoder decoderEncoder) {
|
||||
this.grammaReader = grammaReader;
|
||||
public StatisticsCollector(GrammarReader grammarReader, LetterDecoderEncoder decoderEncoder) {
|
||||
this.grammarReader = grammarReader;
|
||||
this.decoderEncoder = decoderEncoder;
|
||||
}
|
||||
|
||||
@ -115,7 +115,7 @@ public class StatisticsCollector implements WordProccessor {
|
||||
prevSet = currentSet;
|
||||
}
|
||||
}
|
||||
MorphologyImpl morphology = new MorphologyImpl(ints, rulesId, heuristics, grammaReader.getGrammaInfoAsArray());
|
||||
MorphologyImpl morphology = new MorphologyImpl(ints, rulesId, heuristics, grammarReader.getGrammarInfoAsArray());
|
||||
morphology.writeToFile(fileName);
|
||||
}
|
||||
|
||||
@ -134,8 +134,8 @@ public class StatisticsCollector implements WordProccessor {
|
||||
Integer length = getCommonLength(form, normalForm);
|
||||
Integer actualSuffixLengh = form.length() - length;
|
||||
String actualNormalSuffix = normalForm.substring(length);
|
||||
Integer integer = grammaReader.getGrammInversIndex().get(fm.getCode());
|
||||
Integer nf = grammaReader.getGrammInversIndex().get(normalSuffixForm);
|
||||
Integer integer = grammarReader.getGrammarInverseIndex().get(fm.getCode());
|
||||
Integer nf = grammarReader.getGrammarInverseIndex().get(normalSuffixForm);
|
||||
return new Heuristic((byte) actualSuffixLengh.intValue(), actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue());
|
||||
}
|
||||
|
||||
|
@ -20,9 +20,8 @@ import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Interface that allows getting information from
|
||||
* {@link org.apache.lucene.russian.morphology.dictonary.DictionaryReader}.
|
||||
*/
|
||||
public interface WordProccessor {
|
||||
public interface WordProcessor {
|
||||
|
||||
public void process(WordCard wordCard) throws IOException;
|
||||
}
|
@ -28,14 +28,14 @@ import java.util.List;
|
||||
public class EnglishHeuristicBuilder {
|
||||
public static void main(String[] args) throws IOException {
|
||||
|
||||
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/egramtab.tab");
|
||||
GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/egramtab.tab");
|
||||
EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
|
||||
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
|
||||
|
||||
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>(), filters);
|
||||
|
||||
StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
|
||||
dictionaryReader.proccess(statisticsCollector);
|
||||
StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder);
|
||||
dictionaryReader.process(statisticsCollector);
|
||||
statisticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info");
|
||||
|
||||
}
|
||||
|
@ -27,14 +27,14 @@ import java.util.List;
|
||||
|
||||
public class RussianHeuristicBuilder {
|
||||
public static void main(String[] args) throws IOException {
|
||||
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
|
||||
GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/rgramtab.tab");
|
||||
RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
|
||||
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
|
||||
|
||||
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>(), filters);
|
||||
|
||||
StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
|
||||
dictionaryReader.proccess(statisticsCollector);
|
||||
StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder);
|
||||
dictionaryReader.process(statisticsCollector);
|
||||
statisticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info");
|
||||
|
||||
}
|
||||
|
@ -67,9 +67,9 @@ public class TestAllWords {
|
||||
}
|
||||
|
||||
private void testFullGramma(final Morphology morphology, LetterDecoderEncoder decoderEncoder, String pathToGramma, String pathToDict) throws IOException {
|
||||
GrammaReader grammaInfo = new GrammaReader(pathToGramma);
|
||||
final List<String> morphInfo = grammaInfo.getGrammaInfo();
|
||||
final Map<String, Integer> inversIndex = grammaInfo.getGrammInversIndex();
|
||||
GrammarReader grammarInfo = new GrammarReader(pathToGramma);
|
||||
final List<String> morphInfo = grammarInfo.getGrammarInfo();
|
||||
final Map<String, Integer> inversIndex = grammarInfo.getGrammarInverseIndex();
|
||||
|
||||
List<WordFilter> filters = Arrays.asList(new WordStringCleaner(decoderEncoder), new WordCleaner(decoderEncoder));
|
||||
|
||||
@ -79,7 +79,7 @@ public class TestAllWords {
|
||||
final AtomicLong wordCount = new AtomicLong(0);
|
||||
Long startTime = System.currentTimeMillis();
|
||||
|
||||
dictionaryReader.proccess(new WordProccessor() {
|
||||
dictionaryReader.process(new WordProcessor() {
|
||||
public void process(WordCard wordCard) throws IOException {
|
||||
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
|
||||
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
||||
@ -125,7 +125,7 @@ public class TestAllWords {
|
||||
Long startTime = System.currentTimeMillis();
|
||||
|
||||
DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet<String>(), filters);
|
||||
dictionaryReader.proccess(new WordProccessor() {
|
||||
dictionaryReader.process(new WordProcessor() {
|
||||
public void process(WordCard wordCard) throws IOException {
|
||||
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
|
||||
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
||||
|
@ -32,13 +32,13 @@ import static org.hamcrest.Matchers.equalTo;
|
||||
import static org.junit.Assert.assertThat;
|
||||
|
||||
|
||||
public class AnalayzersTest {
|
||||
public class AnalyzersTest {
|
||||
|
||||
@Test
|
||||
public void englishAnalyzerShouldGiveCorrectWords() throws IOException {
|
||||
Analyzer morphlogyAnalyzer = new EnglishAnalyzer();
|
||||
String answerPath = "/english/englsih-analayzer-answer.txt";
|
||||
String testPath = "/english/englsih-analayzer-data.txt";
|
||||
String answerPath = "/english/english-analyzer-answer.txt";
|
||||
String testPath = "/english/english-analyzer-data.txt";
|
||||
|
||||
testAnalayzer(morphlogyAnalyzer, answerPath, testPath);
|
||||
}
|
||||
@ -46,8 +46,8 @@ public class AnalayzersTest {
|
||||
@Test
|
||||
public void shoudGiveCorretWords() throws IOException {
|
||||
Analyzer morphlogyAnalyzer = new RussianAnalyzer();
|
||||
String answerPath = "/russian/russian-analayzer-answer.txt";
|
||||
String testPath = "/russian/russian-analayzer-data.txt";
|
||||
String answerPath = "/russian/russian-analyzer-answer.txt";
|
||||
String testPath = "/russian/russian-analyzer-data.txt";
|
||||
|
||||
testAnalayzer(morphlogyAnalyzer, answerPath, testPath);
|
||||
}
|
Reference in New Issue
Block a user