fixing a problem with "-" in words: the English morphology did not work correctly because some word forms contain it
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@97 d817d54c-26ab-11de-abc9-2f7d1455ff7a
parent 3ca9eb2cb9
commit 7bf8ef7d6f
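The fix itself is the extra guard added in the StatisticsCollector hunk below: a generated word form is now skipped not only when the alphabet check fails but also when it contains "-", since hyphenated forms broke the English heuristic build. A minimal standalone sketch of that rule, assuming a simplified alphabet check (the class name WordFormFilter, the helper isProcessable, and the sample words are illustrative, not project code):

import java.util.Arrays;
import java.util.List;

public class WordFormFilter {

    // Simplified stand-in for LetterDecoderEncoder.checkString: alphabetic characters only.
    static boolean checkString(String s) {
        return s.chars().allMatch(Character::isLetter);
    }

    // The rule this commit adds: besides the alphabet check, reject any form containing "-".
    static boolean isProcessable(String form) {
        return checkString(form) && !form.contains("-");
    }

    public static void main(String[] args) {
        List<String> forms = Arrays.asList("faces", "face", "face-to-face", "re-face");
        for (String form : forms) {
            System.out.println(form + " -> " + (isProcessable(form) ? "collected" : "skipped"));
        }
    }
}

Running it prints "collected" for plain forms and "skipped" for hyphenated ones, mirroring the continue added in process(WordCard).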
@@ -71,11 +71,17 @@ public class DictonaryReader {
List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1]));
FlexiaModel flexiaModel = models.get(0);
if (models.size() > 0 && !ingnoredForm.contains(flexiaModel.getCode())) {

WordCard card = new WordCard(flexiaModel.create(wordBase), wordBase, flexiaModel.getSuffix());
for (FlexiaModel fm : models) {
card.addFlexia(fm);
}
wordProccessor.proccess(card);
// if(card.getBase().equals("face") || card.getBase().equals("fac")){
// System.out.println(models);
// System.out.println(card);
wordProccessor.process(card);
//}

}
}
}
@@ -17,7 +17,7 @@
package org.apache.lucene.morphology.dictionary;

/**
* Represent inofrmation of how word form created form it imutible part.
* Represent information of how word form created form it imutible part.
*/
public class FlexiaModel {
private String code;
@@ -60,6 +60,10 @@ public class FlexiaModel {

@Override
public String toString() {
return prefix + " " + suffix;
return "FlexiaModel{" +
"code='" + code + '\'' +
", suffix='" + suffix + '\'' +
", prefix='" + prefix + '\'' +
'}';
}
}
@@ -26,20 +26,20 @@ import java.util.*;

//todo made refactoring this class
public class StatiticsCollector implements WordProccessor {
private TreeMap<String, Set<Heuristic>> inversIndex = new TreeMap<String, Set<Heuristic>>();
private Map<Set<Heuristic>, Integer> ruleInverIndex = new HashMap<Set<Heuristic>, Integer>();
public class StatisticsCollector implements WordProccessor {
private TreeMap<String, Set<Heuristic>> inverseIndex = new TreeMap<String, Set<Heuristic>>();
private Map<Set<Heuristic>, Integer> ruleInverseIndex = new HashMap<Set<Heuristic>, Integer>();
private List<Set<Heuristic>> rules = new ArrayList<Set<Heuristic>>();
private GrammaReader grammaReader;
private LetterDecoderEncoder decoderEncoder;

public StatiticsCollector(GrammaReader grammaReader, LetterDecoderEncoder decoderEncoder) {
public StatisticsCollector(GrammaReader grammaReader, LetterDecoderEncoder decoderEncoder) {
this.grammaReader = grammaReader;
this.decoderEncoder = decoderEncoder;
}

public void proccess(WordCard wordCard) throws IOException {
public void process(WordCard wordCard) throws IOException {
cleanWordCard(wordCard);
String normalStringMorph = wordCard.getWordsFroms().get(0).getCode();
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
@@ -47,13 +47,13 @@ public class StatiticsCollector implements WordProccessor {
if (!decoderEncoder.checkString(word)) return;

for (FlexiaModel fm : wordCard.getWordsFroms()) {
if (!decoderEncoder.checkString(fm.create(wordCard.getBase()))) continue;
if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) continue;
Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph);
String form = revertWord(fm.create(wordCard.getBase()));
Set<Heuristic> suffixHeuristics = inversIndex.get(form);
Set<Heuristic> suffixHeuristics = inverseIndex.get(form);
if (suffixHeuristics == null) {
suffixHeuristics = new HashSet<Heuristic>();
inversIndex.put(form, suffixHeuristics);
inverseIndex.put(form, suffixHeuristics);
}
suffixHeuristics.add(heuristic);
}
@@ -76,24 +76,24 @@ public class StatiticsCollector implements WordProccessor {
Map<Integer, Integer> dist = new TreeMap<Integer, Integer>();
Set<Heuristic> prevSet = null;
int count = 0;
for (String key : inversIndex.keySet()) {
Set<Heuristic> currentSet = inversIndex.get(key);
for (String key : inverseIndex.keySet()) {
Set<Heuristic> currentSet = inverseIndex.get(key);
if (!currentSet.equals(prevSet)) {
Integer d = dist.get(key.length());
dist.put(key.length(), 1 + (d == null ? 0 : d));
prevSet = currentSet;
count++;
if (!ruleInverIndex.containsKey(currentSet)) {
ruleInverIndex.put(currentSet, rules.size());
if (!ruleInverseIndex.containsKey(currentSet)) {
ruleInverseIndex.put(currentSet, rules.size());
rules.add(currentSet);
}
}
}
System.out.println("Word with diffirent rules " + count);
System.out.println("All ivers words " + inversIndex.size());
System.out.println("All ivers words " + inverseIndex.size());
System.out.println(dist);
System.out.println("diffirent rule count " + ruleInverIndex.size());
Heuristic[][] heuristics = new Heuristic[ruleInverIndex.size()][];
System.out.println("diffirent rule count " + ruleInverseIndex.size());
Heuristic[][] heuristics = new Heuristic[ruleInverseIndex.size()][];
int index = 0;
for (Set<Heuristic> hs : rules) {
heuristics[index] = new Heuristic[hs.size()];
@@ -109,12 +109,12 @@ public class StatiticsCollector implements WordProccessor {
short[] rulesId = new short[count];
count = 0;
prevSet = null;
for (String key : inversIndex.keySet()) {
Set<Heuristic> currentSet = inversIndex.get(key);
for (String key : inverseIndex.keySet()) {
Set<Heuristic> currentSet = inverseIndex.get(key);
if (!currentSet.equals(prevSet)) {
int[] word = decoderEncoder.encodeToArray(key);
ints[count] = word;
rulesId[count] = (short) ruleInverIndex.get(currentSet).intValue();
rulesId[count] = (short) ruleInverseIndex.get(currentSet).intValue();
count++;
prevSet = currentSet;
}
@@ -69,4 +69,14 @@ public class WordCard {
public void setWordsFroms(List<FlexiaModel> wordsFroms) {
this.wordsFroms = wordsFroms;
}

@Override
public String toString() {
return "WordCard{" +
"canonicalFrom='" + canonicalFrom + '\'' +
", base='" + base + '\'' +
", canonicalSuffix='" + canonicalSuffix + '\'' +
", wordsFroms=" + wordsFroms +
'}';
}
}
@@ -24,5 +24,5 @@ import java.io.IOException;
*/
public interface WordProccessor {

public void proccess(WordCard wordCard) throws IOException;
public void process(WordCard wordCard) throws IOException;
}
@@ -18,7 +18,7 @@ package org.apache.lucene.morphology.generator;

import org.apache.lucene.morphology.dictionary.DictonaryReader;
import org.apache.lucene.morphology.dictionary.GrammaReader;
import org.apache.lucene.morphology.dictionary.StatiticsCollector;
import org.apache.lucene.morphology.dictionary.StatisticsCollector;
import org.apache.lucene.morphology.english.EnglishLetterDecoderEncoder;

import java.io.IOException;
@@ -32,9 +32,9 @@ public class EnglishHeuristicBuilder {
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>());

EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
StatiticsCollector statiticsCollector = new StatiticsCollector(grammaInfo, decoderEncoder);
dictonaryReader.proccess(statiticsCollector);
statiticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info");
StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
dictonaryReader.proccess(statisticsCollector);
statisticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info");

}
}
@@ -18,7 +18,7 @@ package org.apache.lucene.morphology.generator;

import org.apache.lucene.morphology.dictionary.DictonaryReader;
import org.apache.lucene.morphology.dictionary.GrammaReader;
import org.apache.lucene.morphology.dictionary.StatiticsCollector;
import org.apache.lucene.morphology.dictionary.StatisticsCollector;
import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder;

import java.io.IOException;
@@ -31,9 +31,9 @@ public class RussianHeuristicBuilder {
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());

RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
StatiticsCollector statiticsCollector = new StatiticsCollector(grammaInfo, decoderEncoder);
dictonaryReader.proccess(statiticsCollector);
statiticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info");
StatisticsCollector statisticsCollector = new StatisticsCollector(grammaInfo, decoderEncoder);
dictonaryReader.proccess(statisticsCollector);
statisticsCollector.saveHeuristic("russian/src/main/resources/org/apache/lucene/morphology/russian/morph.info");

}
}
File diff suppressed because it is too large
@@ -32,7 +32,7 @@ import java.util.HashSet;
public class EnglishAnalayzerTest {

@Test
public void shoudGiveCorretWords() throws IOException {
public void shouldGiveCorrectWords() throws IOException {
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/englsih-analayzer-answer.txt");
BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
String[] strings = breader.readLine().replaceAll(" +", " ").trim().split(" ");
@@ -19,33 +19,33 @@ import java.io.Serializable;

public class Heuristic implements Serializable {
byte actualSuffixLengh;
byte actualSuffixLength;
String actualNormalSuffix;
short formMorphInfo;
short normalFormMorphInfo;

public Heuristic(String s) {
String[] strings = s.split("\\|");
actualSuffixLengh = Byte.valueOf(strings[0]);
actualSuffixLength = Byte.valueOf(strings[0]);
actualNormalSuffix = strings[1];
formMorphInfo = Short.valueOf(strings[2]);
normalFormMorphInfo = Short.valueOf(strings[3]);
}

public Heuristic(byte actualSuffixLengh, String actualNormalSuffix, short formMorphInfo, short normalFormMorphInfo) {
this.actualSuffixLengh = actualSuffixLengh;
public Heuristic(byte actualSuffixLength, String actualNormalSuffix, short formMorphInfo, short normalFormMorphInfo) {
this.actualSuffixLength = actualSuffixLength;
this.actualNormalSuffix = actualNormalSuffix;
this.formMorphInfo = formMorphInfo;
this.normalFormMorphInfo = normalFormMorphInfo;
}

public String transofrmWord(String w) {
if (w.length() - actualSuffixLengh < 0) return w;
return w.substring(0, w.length() - actualSuffixLengh) + actualNormalSuffix;
public String transformWord(String w) {
if (w.length() - actualSuffixLength < 0) return w;
return w.substring(0, w.length() - actualSuffixLength) + actualNormalSuffix;
}

public byte getActualSuffixLengh() {
return actualSuffixLengh;
public byte getActualSuffixLength() {
return actualSuffixLength;
}

public String getActualNormalSuffix() {
@@ -67,7 +67,7 @@ public class Heuristic implements Serializable {

Heuristic heuristic = (Heuristic) o;

if (actualSuffixLengh != heuristic.actualSuffixLengh) return false;
if (actualSuffixLength != heuristic.actualSuffixLength) return false;
if (formMorphInfo != heuristic.formMorphInfo) return false;
if (normalFormMorphInfo != heuristic.normalFormMorphInfo) return false;
if (actualNormalSuffix != null ? !actualNormalSuffix.equals(heuristic.actualNormalSuffix) : heuristic.actualNormalSuffix != null)
@@ -78,7 +78,7 @@ public class Heuristic implements Serializable {

@Override
public int hashCode() {
int result = (int) actualSuffixLengh;
int result = (int) actualSuffixLength;
result = 31 * result + (actualNormalSuffix != null ? actualNormalSuffix.hashCode() : 0);
result = 31 * result + (int) formMorphInfo;
result = 31 * result + (int) normalFormMorphInfo;
@@ -87,6 +87,6 @@ public class Heuristic implements Serializable {

@Override
public String toString() {
return "" + actualSuffixLengh + "|" + actualNormalSuffix + "|" + formMorphInfo + "|" + normalFormMorphInfo;
return "" + actualSuffixLength + "|" + actualNormalSuffix + "|" + formMorphInfo + "|" + normalFormMorphInfo;
}
}
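The Heuristic hunks above are mostly a rename (actualSuffixLengh to actualSuffixLength, transofrmWord to transformWord); the behavior is unchanged. As a reading aid, a small standalone sketch of what transformWord computes (the class name and example values are illustrative, not taken from the dictionary data):

public class TransformWordSketch {

    // Cut the last actualSuffixLength characters from the word form and
    // append actualNormalSuffix to obtain the normal form.
    static String transformWord(String w, byte actualSuffixLength, String actualNormalSuffix) {
        // A form shorter than the suffix length is returned unchanged.
        if (w.length() - actualSuffixLength < 0) return w;
        return w.substring(0, w.length() - actualSuffixLength) + actualNormalSuffix;
    }

    public static void main(String[] args) {
        System.out.println(transformWord("faces", (byte) 1, ""));   // faces -> face
        System.out.println(transformWord("flies", (byte) 3, "y"));  // flies -> fly
    }
}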
@@ -55,7 +55,7 @@ public class LuceneMorphology extends MorphologyImpl {
for (Heuristic heuristic : heuristics) {
boolean isAdded = true;
for (Heuristic ch : result) {
isAdded = isAdded && !(ch.getActualNormalSuffix().equals(heuristic.getActualNormalSuffix()) && (ch.getActualSuffixLengh() == heuristic.getActualSuffixLengh()));
isAdded = isAdded && !(ch.getActualNormalSuffix().equals(heuristic.getActualNormalSuffix()) && (ch.getActualSuffixLength() == heuristic.getActualSuffixLength()));
}
if (isAdded) {
result.add(heuristic);
@@ -68,7 +68,7 @@ public class MorphologyImpl implements Morphology {
int[] ints = decoderEncoder.encodeToArray(revertWord(s));
int ruleId = findRuleId(ints);
for (Heuristic h : rules[rulesId[ruleId]]) {
result.add(h.transofrmWord(s));
result.add(h.transformWord(s));
}
return result;
}
@@ -78,7 +78,7 @@ public class MorphologyImpl implements Morphology {
int[] ints = decoderEncoder.encodeToArray(revertWord(s));
int ruleId = findRuleId(ints);
for (Heuristic h : rules[rulesId[ruleId]]) {
result.add(h.transofrmWord(s) + "|" + grammaInfo[h.getFormMorphInfo()]);
result.add(h.transformWord(s) + "|" + grammaInfo[h.getFormMorphInfo()]);
}
return result;
}
@@ -180,9 +180,9 @@ public class MorphologyImpl implements Morphology {
rules = new Heuristic[amount][];
for (int i = 0; i < amount; i++) {
String s1 = bufferedReader.readLine();
Integer ruleLenght = Integer.valueOf(s1);
rules[i] = new Heuristic[ruleLenght];
for (int j = 0; j < ruleLenght; j++) {
Integer ruleLength = Integer.valueOf(s1);
rules[i] = new Heuristic[ruleLength];
for (int j = 0; j < ruleLength; j++) {
rules[i][j] = new Heuristic(bufferedReader.readLine());
}
}
@@ -197,7 +197,6 @@ public class MorphologyImpl implements Morphology {
}

private void readSeparators(BufferedReader bufferedReader, Integer amount) throws IOException {
HashSet intetger = new HashSet<Integer>();
separators = new int[amount][];
for (int i = 0; i < amount; i++) {
String s1 = bufferedReader.readLine();
@@ -206,7 +205,6 @@ public class MorphologyImpl implements Morphology {
for (int j = 0; j < wordLenght; j++) {
separators[i][j] = Integer.valueOf(bufferedReader.readLine());
}
intetger.add(separators[i][0]);
}
}