start working on new version with morphology info

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@37 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
alexander.a.kuznetsov 2009-08-11 06:05:03 +00:00
parent 214a8e2ebe
commit e4dd3a7a76
12 changed files with 249 additions and 58 deletions

View File

@ -18,15 +18,18 @@ package org.apache.lucene.russian.morphology;
import org.apache.lucene.russian.morphology.dictonary.DictonaryReader; import org.apache.lucene.russian.morphology.dictonary.DictonaryReader;
import org.apache.lucene.russian.morphology.dictonary.FrequentyReader; import org.apache.lucene.russian.morphology.dictonary.FrequentyReader;
import org.apache.lucene.russian.morphology.dictonary.GrammaReader;
import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader; import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
import org.apache.lucene.russian.morphology.heuristic.Heuristic; import org.apache.lucene.russian.morphology.heuristic.HeuristicBySuffixLegth;
import org.apache.lucene.russian.morphology.heuristic.StatiticsCollectors; import org.apache.lucene.russian.morphology.heuristic.StatiticsCollectors;
import org.apache.lucene.russian.morphology.heuristic.SuffixCounter; import org.apache.lucene.russian.morphology.heuristic.SuffixCounter;
import org.apache.lucene.russian.morphology.heuristic.SuffixHeuristic;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collection; import java.util.Collection;
import java.util.Set; import java.util.Set;
import java.util.TreeMap;
public class HeuristicBuilder { public class HeuristicBuilder {
@ -35,10 +38,11 @@ public class HeuristicBuilder {
Set<String> form = formReader.getIngnoredFroms(); Set<String> form = formReader.getIngnoredFroms();
FrequentyReader frequentyReader = new FrequentyReader("data/lemma.num"); FrequentyReader frequentyReader = new FrequentyReader("data/lemma.num");
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form); DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form);
StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read());
StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read(), grammaInfo);
dictonaryReader.proccess(statiticsCollectors); dictonaryReader.proccess(statiticsCollectors);
Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values(); Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values();
Object[] objects = counterCollection.toArray(); Object[] objects = counterCollection.toArray();
@ -48,11 +52,52 @@ public class HeuristicBuilder {
System.out.println(objects[i]); System.out.println(objects[i]);
} }
final Heuristic heuristic = new Heuristic(); final HeuristicBySuffixLegth heuristic = new HeuristicBySuffixLegth();
for (int i = 0; i < objects.length; i++) { for (int i = 0; i < objects.length; i++) {
heuristic.addHeuristic(((SuffixCounter) objects[i]).getSuffixHeuristic()); heuristic.addHeuristic(((SuffixCounter) objects[i]).getSuffixHeuristic());
} }
heuristic.writeToFile("russianSuffixesHeuristic.txt"); TreeMap<Integer, Integer> map = new TreeMap<Integer, Integer>();
int ct = 0;
for (Set<SuffixHeuristic> s : heuristic.getHeuristics().values()) {
Integer d = map.get(s.size());
map.put(s.size(), 1 + (d == null ? 0 : d));
if (s.size() == 1) {
ct++;
continue;
}
SuffixHeuristic heuristic1 = s.iterator().next();
Integer sufixSize = heuristic1.getActualSuffixLength();
String normalSuffix = heuristic1.getNormalFromSuffix();
if (heuristic1.getFormSuffix().length() < 6) {
ct++;
continue;
}
Boolean flag = true;
if (sufixSize > 3) continue;
for (SuffixHeuristic sh : s) {
flag = flag && (sufixSize.equals(sh.getActualSuffixLength()))
&& (normalSuffix.equals(sh.getNormalFromSuffix()));
}
if (flag) {
System.out.println(s);
ct++;
}
//HashSet<String> integers = new HashSet<String>();
// for(SuffixHeuristic sh:s){
// integers.add(sh.getMorphInfoCode());
// }
// if(s.size() == integers.size()){
// ct++;
// }else{
// if(s.size() == 2) System.out.println(s);
// }
}
System.out.println(objects.length);
System.out.println(heuristic.getHeuristics().size());
System.out.println(ct);
System.out.println(map);
//heuristic.writeToFile("russianSuffixesHeuristic.txt");
} }
} }

View File

@ -24,7 +24,7 @@ package org.apache.lucene.russian.morphology;
*/ */
public class RussianSuffixDecoderEncoder { public class RussianSuffixDecoderEncoder {
public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071; public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
public static final int SUFFIX_LENGTH = 7; public static final int SUFFIX_LENGTH = 6;
public static final int EE_CHAR = 34; public static final int EE_CHAR = 34;
public static final int E_CHAR = 6; public static final int E_CHAR = 6;
public static final int DASH_CHAR = 45; public static final int DASH_CHAR = 45;

View File

@ -0,0 +1,13 @@
package org.apache.lucene.russian.morphology;
import org.apache.lucene.russian.morphology.dictonary.GrammaReader;
import java.io.IOException;
public class Test {
public static void main(String[] args) throws IOException {
GrammaReader grammaReader = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
System.out.println(grammaReader.getInversIndex().size());
}
}

View File

@ -63,17 +63,18 @@ public class DictonaryReader {
int count = Integer.valueOf(s); int count = Integer.valueOf(s);
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
s = reader.readLine(); s = reader.readLine();
if (i % 10000 == 0) System.out.println("Proccess " + i + " word of " + count); if (i % 10000 == 0) System.out.println("Proccess " + i + " wordBase of " + count);
String[] wd = s.split(" "); String[] wd = s.split(" ");
String word = wd[0].toLowerCase(); String wordBase = wd[0].toLowerCase();
if (word.startsWith("-")) continue; if (wordBase.startsWith("-")) continue;
word = "#".equals(word) ? "" : word; wordBase = "#".equals(wordBase) ? "" : wordBase;
List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1])); List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1]));
if (models.size() > 0 && !ingnoredForm.contains(models.get(0).getCode())) { FlexiaModel flexiaModel = models.get(0);
WordCard card = new WordCard(cleanString(models.get(0).create(word))); if (models.size() > 0 && !ingnoredForm.contains(flexiaModel.getCode())) {
WordCard card = new WordCard(cleanString(flexiaModel.create(wordBase)), cleanString(wordBase), flexiaModel.getSuffix());
for (FlexiaModel fm : models) { for (FlexiaModel fm : models) {
card.addFrom(cleanString(fm.create(word))); card.addFlexia(fm);
} }
wordProccessor.proccess(card); wordProccessor.proccess(card);
} }
@ -118,9 +119,10 @@ public class DictonaryReader {
private void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) { private void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
String[] fl = line.split("\\*"); String[] fl = line.split("\\*");
// we inored all forms thats // we inored all forms thats
// if (fl.length == 3) if (fl.length == 3) {
// flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase())); flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), cleanString(fl[2].toLowerCase())));
if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), "")); }
if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), ""));
} }
} }

View File

@ -0,0 +1,58 @@
package org.apache.lucene.russian.morphology.dictonary;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
//todo split this class in two.
/**
 * Loads a grammar table file into two maps.
 *
 * <p>Every line that is non-empty after trimming and does not start with
 * {@code //} is split at its first space. The text before the space becomes a
 * key of the inverse index, mapped to a sequential number (0, 1, 2, ... in
 * file order); the text after the space is stored in the grammar-info map
 * under that same number.
 */
public class GrammaReader {
    private String fileName;
    private String fileEncoding = "windows-1251";
    private Map<Integer, String> grammaInfo = new HashMap<Integer, String>();
    private Map<String, Integer> inversIndex = new HashMap<String, Integer>();

    /**
     * Reads the given file using the default {@code windows-1251} encoding.
     *
     * @param fileName path of the grammar table file
     * @throws IOException if the file cannot be read or contains a malformed line
     */
    public GrammaReader(String fileName) throws IOException {
        this.fileName = fileName;
        setUp();
    }

    /**
     * Reads the given file using an explicit character encoding.
     *
     * @param fileName     path of the grammar table file
     * @param fileEncoding name of the charset used to decode the file
     * @throws IOException if the file cannot be read, the encoding is
     *                     unsupported, or the file contains a malformed line
     */
    public GrammaReader(String fileName, String fileEncoding) throws IOException {
        this.fileName = fileName;
        this.fileEncoding = fileEncoding;
        setUp();
    }

    /**
     * Parses the file line by line and fills both maps.
     *
     * @throws IOException on read failure or when a data line has no space
     *                     separating the code from its description
     */
    private void setUp() throws IOException {
        // Open the raw stream first so it is released even if creating the
        // decoder fails (e.g. unsupported encoding) or parsing throws.
        FileInputStream stream = new FileInputStream(fileName);
        try {
            BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, fileEncoding));
            String line = bufferedReader.readLine();
            while (line != null) {
                line = line.trim();
                if (!line.startsWith("//") && line.length() > 0) {
                    String[] strings = line.split(" ", 2);
                    if (strings.length < 2) {
                        // Report the offending line instead of failing with a
                        // bare ArrayIndexOutOfBoundsException.
                        throw new IOException("Malformed line in " + fileName + ": '" + line + "'");
                    }
                    // The next free sequential number is the current map size.
                    Integer i = grammaInfo.size();
                    inversIndex.put(strings[0], i);
                    grammaInfo.put(i, strings[1]);
                }
                line = bufferedReader.readLine();
            }
        } finally {
            stream.close();
        }
    }

    /**
     * @return the map from sequential number to the description text of a line
     */
    public Map<Integer, String> getGrammaInfo() {
        return grammaInfo;
    }

    /**
     * @param grammaInfo replacement for the number-to-description map
     */
    public void setGrammaInfo(Map<Integer, String> grammaInfo) {
        this.grammaInfo = grammaInfo;
    }

    /**
     * @return the map from a line's leading code to its sequential number
     */
    public Map<String, Integer> getInversIndex() {
        return inversIndex;
    }

    /**
     * @param inversIndex replacement for the code-to-number map
     */
    public void setInversIndex(Map<String, Integer> inversIndex) {
        this.inversIndex = inversIndex;
    }
}

View File

@ -24,21 +24,33 @@ import java.util.List;
*/ */
public class WordCard { public class WordCard {
private String canonicalFrom; private String canonicalFrom;
private List<String> wordsFroms = new ArrayList<String>(); private String base;
private String canonicalSuffix;
private List<FlexiaModel> wordsFroms = new ArrayList<FlexiaModel>();
protected WordCard(String canonicalFrom) { public WordCard(String canonicalFrom, String base, String canonicalSuffix) {
this.canonicalFrom = canonicalFrom; this.canonicalFrom = canonicalFrom;
this.canonicalSuffix = canonicalSuffix;
this.base = base;
} }
protected void addFrom(String word) { public void addFlexia(FlexiaModel flexiaModel) {
wordsFroms.add(word); wordsFroms.add(flexiaModel);
} }
public String getCanonicalFrom() { public String getCanonicalFrom() {
return canonicalFrom; return canonicalFrom;
} }
public List<String> getWordsFroms() { public String getCanonicalSuffix() {
return canonicalSuffix;
}
public String getBase() {
return base;
}
public List<FlexiaModel> getWordsFroms() {
return wordsFroms; return wordsFroms;
} }
} }

View File

@ -29,11 +29,11 @@ public class Heuristic {
private TreeMap<Long, Long> encodedSuffixesPairs = new TreeMap<Long, Long>(); private TreeMap<Long, Long> encodedSuffixesPairs = new TreeMap<Long, Long>();
public void addHeuristic(SuffixHeuristic suffixHeuristic) { public void addHeuristic(SuffixHeuristic suffixHeuristic) {
Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix()); // Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix());
Long longs = encodedSuffixesPairs.get(suffix); // Long longs = encodedSuffixesPairs.get(suffix);
if (longs == null) { // if (longs == null) {
encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encode(suffixHeuristic.getNormalSuffix())); // encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encode(suffixHeuristic.getNormalSuffix()));
} // }
} }
public String getNormalForm(String form) { public String getNormalForm(String form) {
@ -49,6 +49,10 @@ public class Heuristic {
return form; return form;
} }
public Integer getAmount() {
return encodedSuffixesPairs.size();
}
public void readFromFile(String file) throws IOException { public void readFromFile(String file) throws IOException {
BufferedReader reader = new BufferedReader(new FileReader(file)); BufferedReader reader = new BufferedReader(new FileReader(file));
String s = reader.readLine(); String s = reader.readLine();

View File

@ -0,0 +1,27 @@
package org.apache.lucene.russian.morphology.heuristic;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
/**
 * Groups {@link SuffixHeuristic} instances by the encoded value of their form
 * suffix, as produced by {@link RussianSuffixDecoderEncoder#encode}.
 */
public class HeuristicBySuffixLegth {
    private Map<Long, Set<SuffixHeuristic>> heuristics = new HashMap<Long, Set<SuffixHeuristic>>();

    /**
     * Adds a heuristic to the group keyed by its encoded form suffix,
     * creating that group on first use.
     *
     * @param suffixHeuristic heuristic to register
     */
    public void addHeuristic(SuffixHeuristic suffixHeuristic) {
        final Long encodedSuffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix());
        final Set<SuffixHeuristic> existingGroup = heuristics.get(encodedSuffix);
        if (existingGroup != null) {
            existingGroup.add(suffixHeuristic);
            return;
        }
        // First heuristic seen for this encoded suffix: start a new group.
        final Set<SuffixHeuristic> freshGroup = new HashSet<SuffixHeuristic>();
        freshGroup.add(suffixHeuristic);
        heuristics.put(encodedSuffix, freshGroup);
    }

    /**
     * @return the internal map from encoded form suffix to its heuristics
     */
    public Map<Long, Set<SuffixHeuristic>> getHeuristics() {
        return heuristics;
    }
}

View File

@ -17,6 +17,8 @@
package org.apache.lucene.russian.morphology.heuristic; package org.apache.lucene.russian.morphology.heuristic;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder; import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import org.apache.lucene.russian.morphology.dictonary.FlexiaModel;
import org.apache.lucene.russian.morphology.dictonary.GrammaReader;
import org.apache.lucene.russian.morphology.dictonary.WordCard; import org.apache.lucene.russian.morphology.dictonary.WordCard;
import org.apache.lucene.russian.morphology.dictonary.WordProccessor; import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
@ -27,16 +29,18 @@ import java.util.Map;
public class StatiticsCollectors implements WordProccessor { public class StatiticsCollectors implements WordProccessor {
Map<SuffixHeuristic, SuffixCounter> statititics = new HashMap<SuffixHeuristic, SuffixCounter>(); Map<SuffixHeuristic, SuffixCounter> statititics = new HashMap<SuffixHeuristic, SuffixCounter>();
private Map<String, Double> wordsFreq; private Map<String, Double> wordsFreq;
private GrammaReader grammaInfo;
public StatiticsCollectors(Map<String, Double> wordsFreq) { public StatiticsCollectors(Map<String, Double> wordsFreq, GrammaReader grammaInfo) {
this.wordsFreq = wordsFreq; this.wordsFreq = wordsFreq;
this.grammaInfo = grammaInfo;
} }
private Integer ignoredCount = 0; private Integer ignoredCount = 0;
public void proccess(WordCard wordCard) { public void proccess(WordCard wordCard) {
for (String form : wordCard.getWordsFroms()) { for (FlexiaModel fm : wordCard.getWordsFroms()) {
SuffixHeuristic suffixHeuristic = createEvristic(wordCard.getCanonicalFrom(), form); SuffixHeuristic suffixHeuristic = createEvristic(wordCard.getCanonicalFrom(), wordCard.getCanonicalSuffix(), fm);
if (suffixHeuristic == null) continue; if (suffixHeuristic == null) continue;
SuffixCounter suffixCounter = statititics.get(suffixHeuristic); SuffixCounter suffixCounter = statititics.get(suffixHeuristic);
if (suffixCounter == null) { if (suffixCounter == null) {
@ -57,19 +61,23 @@ public class StatiticsCollectors implements WordProccessor {
return statititics; return statititics;
} }
private SuffixHeuristic createEvristic(String word, String form) { private SuffixHeuristic createEvristic(String wordBase, String canonicalSuffix, FlexiaModel fm) {
String form = fm.create(wordBase);
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0; int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
String formSuffix = form.substring(startSymbol); String formSuffix = form.substring(startSymbol);
if (word.length() < startSymbol) { String actualSuffix = fm.getSuffix();
ignoredCount++; Integer actualSuffixLengh = actualSuffix.length();
return null; // if (word.length() < startSymbol) {
} // ignoredCount++;
String wordSuffix = word.length() > startSymbol ? word.substring(startSymbol) : ""; // return null;
if (wordSuffix.length() > 12) { // }
System.out.println(word + " " + form); // String wordSuffix = word.length() > startSymbol ? word.substring(startSymbol) : "";
return null; // if (wordSuffix.length() > 12) {
} // System.out.println(word + " " + form);
return new SuffixHeuristic(formSuffix, wordSuffix); // return null;
// }
// return new SuffixHeuristic(formSuffix, wordSuffix);
return new SuffixHeuristic(formSuffix, actualSuffixLengh, canonicalSuffix, fm.getCode());
} }

View File

@ -24,27 +24,31 @@ package org.apache.lucene.russian.morphology.heuristic;
*/ */
public class SuffixHeuristic { public class SuffixHeuristic {
private String formSuffix; private String formSuffix;
private String normalSuffix; private Integer actualSuffixLength;
private String normalFromSuffix;
private String morphInfoCode;
public SuffixHeuristic(String formSuffix, String normalSuffix) { public SuffixHeuristic(String formSuffix, Integer actualSuffixLength, String normalFromSuffix, String morphInfoCode) {
this.formSuffix = formSuffix; this.formSuffix = formSuffix;
this.normalSuffix = normalSuffix; this.actualSuffixLength = actualSuffixLength;
this.normalFromSuffix = normalFromSuffix;
this.morphInfoCode = morphInfoCode;
} }
public String getFormSuffix() { public String getFormSuffix() {
return formSuffix; return formSuffix;
} }
public void setFormSuffix(String formSuffix) { public Integer getActualSuffixLength() {
this.formSuffix = formSuffix; return actualSuffixLength;
} }
public String getNormalSuffix() { public String getNormalFromSuffix() {
return normalSuffix; return normalFromSuffix;
} }
public void setNormalSuffix(String normalSuffix) { public String getMorphInfoCode() {
this.normalSuffix = normalSuffix; return morphInfoCode;
} }
@Override @Override
@ -54,24 +58,28 @@ public class SuffixHeuristic {
SuffixHeuristic that = (SuffixHeuristic) o; SuffixHeuristic that = (SuffixHeuristic) o;
if (!formSuffix.equals(that.formSuffix)) return false; if (actualSuffixLength != null ? !actualSuffixLength.equals(that.actualSuffixLength) : that.actualSuffixLength != null)
if (!normalSuffix.equals(that.normalSuffix)) return false; return false;
if (formSuffix != null ? !formSuffix.equals(that.formSuffix) : that.formSuffix != null) return false;
if (morphInfoCode != null ? !morphInfoCode.equals(that.morphInfoCode) : that.morphInfoCode != null)
return false;
if (normalFromSuffix != null ? !normalFromSuffix.equals(that.normalFromSuffix) : that.normalFromSuffix != null)
return false;
return true; return true;
} }
@Override @Override
public int hashCode() { public int hashCode() {
int result = formSuffix.hashCode(); int result = formSuffix != null ? formSuffix.hashCode() : 0;
result = 31 * result + normalSuffix.hashCode(); result = 31 * result + (actualSuffixLength != null ? actualSuffixLength.hashCode() : 0);
result = 31 * result + (normalFromSuffix != null ? normalFromSuffix.hashCode() : 0);
result = 31 * result + (morphInfoCode != null ? morphInfoCode.hashCode() : 0);
return result; return result;
} }
@Override @Override
public String toString() { public String toString() {
return "SuffixHeuristic{" + return formSuffix + " " + actualSuffixLength + " " + normalFromSuffix + " " + morphInfoCode;
"formSuffix='" + formSuffix + '\'' +
", normalSuffix='" + normalSuffix + '\'' +
'}';
} }
} }

View File

@ -0,0 +1,13 @@
package org.apache.lucene.russian.morphology.heuristic;
/**
 * Placeholder for combining two {@link SuffixHeuristic} instances.
 */
public class SuffixHeuristicMerger {
    /**
     * Compares the two heuristics' morphology codes and, when they match,
     * picks the one with the smaller actual suffix length (the first one on
     * a tie). The picked value is not used yet.
     *
     * NOTE(review): every path currently returns {@code null}; this looks
     * unfinished - confirm the intended merge result before relying on it.
     *
     * @param one first heuristic
     * @param two second heuristic
     * @return {@code null} in all cases as currently written
     */
    public SuffixHeuristic merge(SuffixHeuristic one, SuffixHeuristic two) {
        final boolean sameMorphInfo = one.getMorphInfoCode().equals(two.getMorphInfoCode());
        if (sameMorphInfo) {
            // Computed but not used yet.
            SuffixHeuristic shorter = one.getActualSuffixLength() <= two.getActualSuffixLength() ? one : two;
        }
        return null;
    }
}

View File

@ -6,3 +6,4 @@
произошло произойти произошло произойти
test test test test
ананасов ананас ананасов ананас
встовашего встовать