fixing typo
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@98 d817d54c-26ab-11de-abc9-2f7d1455ff7a
parent 7bf8ef7d6f
commit e8399999c3
@@ -28,22 +28,22 @@ import java.util.*;
  * This class contain logic how read
  * dictonary and produce word with it all forms.
  */
-public class DictonaryReader {
+public class DictionaryReader {
     private String fileName;
     private String fileEncoding = "windows-1251";
     private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
     private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
-    private Set<String> ingnoredForm = new HashSet<String>();
+    private Set<String> ignoredForm = new HashSet<String>();

-    public DictonaryReader(String fileName, Set<String> ingnoredForm) {
+    public DictionaryReader(String fileName, Set<String> ignoredForm) {
         this.fileName = fileName;
-        this.ingnoredForm = ingnoredForm;
+        this.ignoredForm = ignoredForm;
     }

-    public DictonaryReader(String fileName, String fileEncoding, Set<String> ingnoredForm) {
+    public DictionaryReader(String fileName, String fileEncoding, Set<String> ignoredForm) {
         this.fileName = fileName;
         this.fileEncoding = fileEncoding;
-        this.ingnoredForm = ingnoredForm;
+        this.ignoredForm = ignoredForm;
     }

@@ -70,7 +70,7 @@ public class DictonaryReader {
             wordBase = "#".equals(wordBase) ? "" : wordBase;
             List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1]));
             FlexiaModel flexiaModel = models.get(0);
-            if (models.size() > 0 && !ingnoredForm.contains(flexiaModel.getCode())) {
+            if (models.size() > 0 && !ignoredForm.contains(flexiaModel.getCode())) {

                 WordCard card = new WordCard(flexiaModel.create(wordBase), wordBase, flexiaModel.getSuffix());
                 for (FlexiaModel fm : models) {
@@ -30,7 +30,7 @@ public class GrammaReader {
     private String fileName;
     private String fileEncoding = "windows-1251";
     private List<String> grammaInfo = new ArrayList<String>();
-    private Map<String, Integer> inversIndex = new HashMap<String, Integer>();
+    private Map<String, Integer> inverseIndex = new HashMap<String, Integer>();

     public GrammaReader(String fileName) throws IOException {
         this.fileName = fileName;
@@ -51,7 +51,7 @@ public class GrammaReader {
             if (!line.startsWith("//") && line.length() > 0) {
                 String[] strings = line.split(" ", 2);
                 Integer i = grammaInfo.size();
-                inversIndex.put(strings[0], i);
+                inverseIndex.put(strings[0], i);
                 grammaInfo.add(i, strings[1]);
             }
             line = bufferedReader.readLine();
@@ -67,10 +67,10 @@ public class GrammaReader {
     }

     public Map<String, Integer> getGrammInversIndex() {
-        return inversIndex;
+        return inverseIndex;
     }

-    public void setInversIndex(Map<String, Integer> inversIndex) {
-        this.inversIndex = inversIndex;
+    public void setInverseIndex(Map<String, Integer> inverseIndex) {
+        this.inverseIndex = inverseIndex;
     }
 }
@@ -41,12 +41,12 @@ public class StatisticsCollector implements WordProccessor {

     public void process(WordCard wordCard) throws IOException {
         cleanWordCard(wordCard);
-        String normalStringMorph = wordCard.getWordsFroms().get(0).getCode();
+        String normalStringMorph = wordCard.getWordsForms().get(0).getCode();
         String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
         if (word.contains("-")) return;
         if (!decoderEncoder.checkString(word)) return;

-        for (FlexiaModel fm : wordCard.getWordsFroms()) {
+        for (FlexiaModel fm : wordCard.getWordsForms()) {
             if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) continue;
             Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph);
             String form = revertWord(fm.create(wordCard.getBase()));
@@ -61,9 +61,9 @@ public class StatisticsCollector implements WordProccessor {

     private void cleanWordCard(WordCard wordCard) {
         wordCard.setBase(cleanString(wordCard.getBase()));
-        wordCard.setCanonicalFrom(cleanString(wordCard.getCanonicalFrom()));
+        wordCard.setCanonicalForm(cleanString(wordCard.getCanonicalForm()));
         wordCard.setCanonicalSuffix(cleanString(wordCard.getCanonicalSuffix()));
-        List<FlexiaModel> models = wordCard.getWordsFroms();
+        List<FlexiaModel> models = wordCard.getWordsForms();
         for (FlexiaModel m : models) {
             m.setSuffix(cleanString(m.getSuffix()));
             m.setPrefix(cleanString(m.getPrefix()));
@@ -23,23 +23,23 @@ import java.util.List;
  * Represent word and all it forms.
  */
 public class WordCard {
-    private String canonicalFrom;
+    private String canonicalForm;
     private String base;
     private String canonicalSuffix;
-    private List<FlexiaModel> wordsFroms = new ArrayList<FlexiaModel>();
+    private List<FlexiaModel> wordsForms = new ArrayList<FlexiaModel>();

-    public WordCard(String canonicalFrom, String base, String canonicalSuffix) {
-        this.canonicalFrom = canonicalFrom;
+    public WordCard(String canonicalForm, String base, String canonicalSuffix) {
+        this.canonicalForm = canonicalForm;
         this.canonicalSuffix = canonicalSuffix;
         this.base = base;
     }

     public void addFlexia(FlexiaModel flexiaModel) {
-        wordsFroms.add(flexiaModel);
+        wordsForms.add(flexiaModel);
     }

-    public String getCanonicalFrom() {
-        return canonicalFrom;
+    public String getCanonicalForm() {
+        return canonicalForm;
     }

     public String getCanonicalSuffix() {
@@ -50,12 +50,12 @@ public class WordCard {
         return base;
     }

-    public List<FlexiaModel> getWordsFroms() {
-        return wordsFroms;
+    public List<FlexiaModel> getWordsForms() {
+        return wordsForms;
     }

-    public void setCanonicalFrom(String canonicalFrom) {
-        this.canonicalFrom = canonicalFrom;
+    public void setCanonicalForm(String canonicalForm) {
+        this.canonicalForm = canonicalForm;
     }

     public void setBase(String base) {
@@ -66,17 +66,17 @@ public class WordCard {
         this.canonicalSuffix = canonicalSuffix;
     }

-    public void setWordsFroms(List<FlexiaModel> wordsFroms) {
-        this.wordsFroms = wordsFroms;
+    public void setWordsForms(List<FlexiaModel> wordsForms) {
+        this.wordsForms = wordsForms;
     }

     @Override
     public String toString() {
         return "WordCard{" +
-                "canonicalFrom='" + canonicalFrom + '\'' +
+                "canonicalForm='" + canonicalForm + '\'' +
                 ", base='" + base + '\'' +
                 ", canonicalSuffix='" + canonicalSuffix + '\'' +
-                ", wordsFroms=" + wordsFroms +
+                ", wordsForms=" + wordsForms +
                 '}';
     }
 }
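Note: the WordCard renames (canonicalFrom to canonicalForm, wordsFroms to wordsForms) ripple into every consumer, as the StatisticsCollector hunks above show. A minimal consumer sketch using only the renamed accessors; the helper class name is made up, and FlexiaModel is assumed to live in the same dictionary package.

    import org.apache.lucene.morphology.dictionary.FlexiaModel;
    import org.apache.lucene.morphology.dictionary.WordCard;

    public class WordCardPrinter {
        // Prints the lemma and every inflected form of a card, using the post-rename accessors.
        static void print(WordCard card) {
            System.out.println(card.getCanonicalForm()
                    + " (base '" + card.getBase() + "', suffix '" + card.getCanonicalSuffix() + "')");
            for (FlexiaModel fm : card.getWordsForms()) {
                // FlexiaModel.create(base) builds the surface form, as StatisticsCollector does above.
                System.out.println("  " + fm.create(card.getBase()) + " [" + fm.getCode() + "]");
            }
        }
    }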
@@ -16,7 +16,7 @@

 package org.apache.lucene.morphology.generator;

-import org.apache.lucene.morphology.dictionary.DictonaryReader;
+import org.apache.lucene.morphology.dictionary.DictionaryReader;
 import org.apache.lucene.morphology.dictionary.GrammaReader;
 import org.apache.lucene.morphology.dictionary.StatisticsCollector;
 import org.apache.lucene.morphology.english.EnglishLetterDecoderEncoder;
@@ -29,11 +29,11 @@ public class EnglishHeuristicBuilder {
     public static void main(String[] args) throws IOException {

         GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/egramtab.tab");
-        DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>());
+        DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>());

         EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
         StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
-        dictonaryReader.proccess(statisticsCollector);
+        dictionaryReader.proccess(statisticsCollector);
         statisticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info");

     }
@@ -16,7 +16,7 @@

 package org.apache.lucene.morphology.generator;

-import org.apache.lucene.morphology.dictionary.DictonaryReader;
+import org.apache.lucene.morphology.dictionary.DictionaryReader;
 import org.apache.lucene.morphology.dictionary.GrammaReader;
 import org.apache.lucene.morphology.dictionary.StatisticsCollector;
 import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder;
@@ -28,11 +28,11 @@ import java.util.HashSet;
 public class RussianHeuristicBuilder {
     public static void main(String[] args) throws IOException {
         GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
-        DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());
+        DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());

         RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
         StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
-        dictonaryReader.proccess(statisticsCollector);
+        dictionaryReader.proccess(statisticsCollector);
         statisticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info");

     }
@@ -15,14 +15,14 @@
  */
 package org.apache.lucene.morphology.english;

-import org.apache.lucene.morphology.analayzer.MorphlogyAnalayzer;
+import org.apache.lucene.morphology.analyzer.MorphologyAnalyzer;

 import java.io.IOException;


-public class EnglishAnalayzer extends MorphlogyAnalayzer {
+public class EnglishAnalyzer extends MorphologyAnalyzer {

-    public EnglishAnalayzer() throws IOException {
+    public EnglishAnalyzer() throws IOException {
         super(new EnglishLuceneMorphology());
     }
@@ -39,12 +39,12 @@ public class EnglishAnalayzerTest {
         HashSet<String> answer = new HashSet<String>(Arrays.asList(strings));
         stream.close();

-        EnglishAnalayzer morphlogyAnalayzer = new EnglishAnalayzer();
+        EnglishAnalyzer morphlogyAnalyzer = new EnglishAnalyzer();
         stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/englsih-analayzer-data.txt");

         InputStreamReader reader = new InputStreamReader(stream, "UTF-8");

-        TokenStream tokenStream = morphlogyAnalayzer.tokenStream(null, reader);
+        TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader);
         HashSet<String> result = new HashSet<String>();
         while (tokenStream.incrementToken()) {
             TermAttribute attribute1 = tokenStream.getAttribute(TermAttribute.class);
@@ -22,6 +22,6 @@ public interface Morphology {

     List<String> getNormalForms(String s);

-    List<String> getMorfInfo(String s);
+    List<String> getMorphInfo(String s);

 }
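Note: getMorfInfo becomes getMorphInfo on the Morphology interface and, in the next hunk, on MorphologyImpl, so external callers must follow the rename. A minimal sketch, assuming RussianLuceneMorphology (used by RussianAnalyzer later in this diff) exposes this interface; the sample word is illustrative and MorphInfoDemo is a made-up name.

    import java.util.List;

    import org.apache.lucene.morphology.russian.RussianLuceneMorphology;

    public class MorphInfoDemo {
        public static void main(String[] args) throws Exception {
            // No-arg constructor taken from "new RussianLuceneMorphology()" in RussianAnalyzer below.
            RussianLuceneMorphology morphology = new RussianLuceneMorphology();

            List<String> lemmas = morphology.getNormalForms("люди");  // unchanged method
            List<String> info = morphology.getMorphInfo("люди");      // renamed from getMorfInfo

            System.out.println(lemmas);
            System.out.println(info);
        }
    }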
@@ -73,7 +73,7 @@ public class MorphologyImpl implements Morphology {
         return result;
     }

-    public List<String> getMorfInfo(String s) {
+    public List<String> getMorphInfo(String s) {
         ArrayList<String> result = new ArrayList<String>();
         int[] ints = decoderEncoder.encodeToArray(revertWord(s));
         int ruleId = findRuleId(ints);
@@ -14,7 +14,7 @@
  * limitations under the License.
  */

-package org.apache.lucene.morphology.analayzer;
+package org.apache.lucene.morphology.analyzer;

 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@@ -14,7 +14,7 @@
  * limitations under the License.
  */

-package org.apache.lucene.morphology.analayzer;
+package org.apache.lucene.morphology.analyzer;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.LowerCaseFilter;
@@ -29,18 +29,18 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;

-public class MorphlogyAnalayzer extends Analyzer {
+public class MorphologyAnalyzer extends Analyzer {
     private LuceneMorphology luceneMorph;

-    public MorphlogyAnalayzer(LuceneMorphology luceneMorph) {
+    public MorphologyAnalyzer(LuceneMorphology luceneMorph) {
         this.luceneMorph = luceneMorph;
     }

-    public MorphlogyAnalayzer(String pathToMorph, LetterDecoderEncoder letterDecoderEncoder) throws IOException {
+    public MorphologyAnalyzer(String pathToMorph, LetterDecoderEncoder letterDecoderEncoder) throws IOException {
         luceneMorph = new LuceneMorphology(pathToMorph, letterDecoderEncoder);
     }

-    public MorphlogyAnalayzer(InputStream inputStream, LetterDecoderEncoder letterDecoderEncoder) throws IOException {
+    public MorphologyAnalyzer(InputStream inputStream, LetterDecoderEncoder letterDecoderEncoder) throws IOException {
         luceneMorph = new LuceneMorphology(inputStream, letterDecoderEncoder);
     }
@@ -15,13 +15,13 @@
  */
 package org.apache.lucene.morphology.russian;

-import org.apache.lucene.morphology.analayzer.MorphlogyAnalayzer;
+import org.apache.lucene.morphology.analyzer.MorphologyAnalyzer;

 import java.io.IOException;


-public class RussianAnalayzer extends MorphlogyAnalayzer {
-    public RussianAnalayzer() throws IOException {
+public class RussianAnalyzer extends MorphologyAnalyzer {
+    public RussianAnalyzer() throws IOException {
         super(new RussianLuceneMorphology());
     }
 }
@@ -39,12 +39,12 @@ public class RussianAnalayzerTest {
         HashSet<String> answer = new HashSet<String>(Arrays.asList(strings));
         stream.close();

-        RussianAnalayzer morphlogyAnalayzer = new RussianAnalayzer();
+        RussianAnalyzer morphlogyAnalyzer = new RussianAnalyzer();
         stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/russian-analayzer-data.txt");

         InputStreamReader reader = new InputStreamReader(stream, "UTF-8");

-        TokenStream tokenStream = morphlogyAnalayzer.tokenStream(null, reader);
+        TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader);
         HashSet<String> result = new HashSet<String>();
         while (tokenStream.incrementToken()) {
             TermAttribute attribute1 = tokenStream.getAttribute(TermAttribute.class);