start working on a new version with morphology info

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@37 d817d54c-26ab-11de-abc9-2f7d1455ff7a
alexander.a.kuznetsov 2009-08-11 06:05:03 +00:00
parent 214a8e2ebe
commit e4dd3a7a76
12 changed files with 249 additions and 58 deletions

View File

@@ -18,15 +18,18 @@ package org.apache.lucene.russian.morphology;
import org.apache.lucene.russian.morphology.dictonary.DictonaryReader;
import org.apache.lucene.russian.morphology.dictonary.FrequentyReader;
import org.apache.lucene.russian.morphology.dictonary.GrammaReader;
import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
import org.apache.lucene.russian.morphology.heuristic.Heuristic;
import org.apache.lucene.russian.morphology.heuristic.HeuristicBySuffixLegth;
import org.apache.lucene.russian.morphology.heuristic.StatiticsCollectors;
import org.apache.lucene.russian.morphology.heuristic.SuffixCounter;
import org.apache.lucene.russian.morphology.heuristic.SuffixHeuristic;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Set;
import java.util.TreeMap;
public class HeuristicBuilder {
@@ -35,10 +38,11 @@ public class HeuristicBuilder {
Set<String> form = formReader.getIngnoredFroms();
FrequentyReader frequentyReader = new FrequentyReader("data/lemma.num");
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form);
StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read());
StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read(), grammaInfo);
dictonaryReader.proccess(statiticsCollectors);
Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values();
Object[] objects = counterCollection.toArray();
@@ -48,11 +52,52 @@ public class HeuristicBuilder {
System.out.println(objects[i]);
}
final Heuristic heuristic = new Heuristic();
final HeuristicBySuffixLegth heuristic = new HeuristicBySuffixLegth();
for (int i = 0; i < objects.length; i++) {
heuristic.addHeuristic(((SuffixCounter) objects[i]).getSuffixHeuristic());
}
heuristic.writeToFile("russianSuffixesHeuristic.txt");
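// Analyze how ambiguous the collected heuristics are: 'map' is a histogram of
// candidate-set sizes per form suffix, and 'ct' counts the sets that look unambiguous
// (a single candidate, a short form suffix, or all candidates agreeing on suffix length
// and normal-form suffix).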
TreeMap<Integer, Integer> map = new TreeMap<Integer, Integer>();
int ct = 0;
for (Set<SuffixHeuristic> s : heuristic.getHeuristics().values()) {
Integer d = map.get(s.size());
map.put(s.size(), 1 + (d == null ? 0 : d));
if (s.size() == 1) {
ct++;
continue;
}
SuffixHeuristic heuristic1 = s.iterator().next();
Integer sufixSize = heuristic1.getActualSuffixLength();
String normalSuffix = heuristic1.getNormalFromSuffix();
if (heuristic1.getFormSuffix().length() < 6) {
ct++;
continue;
}
Boolean flag = true;
if (sufixSize > 3) continue;
for (SuffixHeuristic sh : s) {
flag = flag && (sufixSize.equals(sh.getActualSuffixLength()))
&& (normalSuffix.equals(sh.getNormalFromSuffix()));
}
if (flag) {
System.out.println(s);
ct++;
}
//HashSet<String> integers = new HashSet<String>();
// for(SuffixHeuristic sh:s){
// integers.add(sh.getMorphInfoCode());
// }
// if(s.size() == integers.size()){
// ct++;
// }else{
// if(s.size() == 2) System.out.println(s);
// }
}
System.out.println(objects.length);
System.out.println(heuristic.getHeuristics().size());
System.out.println(ct);
System.out.println(map);
//heuristic.writeToFile("russianSuffixesHeuristic.txt");
}
}

View File

@@ -24,7 +24,7 @@ package org.apache.lucene.russian.morphology;
*/
public class RussianSuffixDecoderEncoder {
public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
public static final int SUFFIX_LENGTH = 7;
public static final int SUFFIX_LENGTH = 6;
public static final int EE_CHAR = 34;
public static final int E_CHAR = 6;
public static final int DASH_CHAR = 45;

View File

@@ -0,0 +1,13 @@
package org.apache.lucene.russian.morphology;
import org.apache.lucene.russian.morphology.dictonary.GrammaReader;
import java.io.IOException;
public class Test {
public static void main(String[] args) throws IOException {
GrammaReader grammaReader = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
System.out.println(grammaReader.getInversIndex().size());
}
}

View File

@@ -63,17 +63,18 @@ public class DictonaryReader {
int count = Integer.valueOf(s);
for (int i = 0; i < count; i++) {
s = reader.readLine();
if (i % 10000 == 0) System.out.println("Proccess " + i + " word of " + count);
if (i % 10000 == 0) System.out.println("Proccess " + i + " wordBase of " + count);
String[] wd = s.split(" ");
String word = wd[0].toLowerCase();
if (word.startsWith("-")) continue;
word = "#".equals(word) ? "" : word;
String wordBase = wd[0].toLowerCase();
if (wordBase.startsWith("-")) continue;
wordBase = "#".equals(wordBase) ? "" : wordBase;
List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1]));
if (models.size() > 0 && !ingnoredForm.contains(models.get(0).getCode())) {
WordCard card = new WordCard(cleanString(models.get(0).create(word)));
FlexiaModel flexiaModel = models.isEmpty() ? null : models.get(0); // avoid get(0) on an empty model list
if (models.size() > 0 && !ingnoredForm.contains(flexiaModel.getCode())) {
WordCard card = new WordCard(cleanString(flexiaModel.create(wordBase)), cleanString(wordBase), flexiaModel.getSuffix());
for (FlexiaModel fm : models) {
card.addFrom(cleanString(fm.create(word)));
card.addFlexia(fm);
}
wordProccessor.proccess(card);
}
@@ -118,9 +119,10 @@ public class DictonaryReader {
private void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
String[] fl = line.split("\\*");
// we ignored all forms that have a prefix part
// if (fl.length == 3)
// flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
if (fl.length == 3) {
flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), cleanString(fl[2].toLowerCase())));
}
if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), ""));
}
}

View File

@@ -0,0 +1,58 @@
package org.apache.lucene.russian.morphology.dictonary;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
// TODO: split this class into two.
public class GrammaReader {
private String fileName;
private String fileEncoding = "windows-1251";
private Map<Integer, String> grammaInfo = new HashMap<Integer, String>();
private Map<String, Integer> inversIndex = new HashMap<String, Integer>();
public GrammaReader(String fileName) throws IOException {
this.fileName = fileName;
setUp();
}
public GrammaReader(String fileName, String fileEncoding) throws IOException {
this.fileName = fileName;
this.fileEncoding = fileEncoding;
setUp();
}
private void setUp() throws IOException {
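// Each non-comment line of rgramtab.tab holds an ancode followed by its grammatical
// description (split on the first space). Assign sequential ids in read order and keep
// an inverse index from ancode to id.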
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), fileEncoding));
String line = bufferedReader.readLine();
while (line != null) {
line = line.trim();
if (!line.startsWith("//") && line.length() > 0) {
String[] strings = line.split(" ", 2);
Integer i = grammaInfo.size();
inversIndex.put(strings[0], i);
grammaInfo.put(i, strings[1]);
}
line = bufferedReader.readLine();
}
}
public Map<Integer, String> getGrammaInfo() {
return grammaInfo;
}
public void setGrammaInfo(Map<Integer, String> grammaInfo) {
this.grammaInfo = grammaInfo;
}
public Map<String, Integer> getInversIndex() {
return inversIndex;
}
public void setInversIndex(Map<String, Integer> inversIndex) {
this.inversIndex = inversIndex;
}
}

View File

@@ -24,21 +24,33 @@ import java.util.List;
*/
public class WordCard {
private String canonicalFrom;
private List<String> wordsFroms = new ArrayList<String>();
private String base;
private String canonicalSuffix;
private List<FlexiaModel> wordsFroms = new ArrayList<FlexiaModel>();
protected WordCard(String canonicalFrom) {
public WordCard(String canonicalFrom, String base, String canonicalSuffix) {
this.canonicalFrom = canonicalFrom;
this.canonicalSuffix = canonicalSuffix;
this.base = base;
}
protected void addFrom(String word) {
wordsFroms.add(word);
public void addFlexia(FlexiaModel flexiaModel) {
wordsFroms.add(flexiaModel);
}
public String getCanonicalFrom() {
return canonicalFrom;
}
public List<String> getWordsFroms() {
public String getCanonicalSuffix() {
return canonicalSuffix;
}
public String getBase() {
return base;
}
public List<FlexiaModel> getWordsFroms() {
return wordsFroms;
}
}

View File

@@ -29,11 +29,11 @@ public class Heuristic {
private TreeMap<Long, Long> encodedSuffixesPairs = new TreeMap<Long, Long>();
public void addHeuristic(SuffixHeuristic suffixHeuristic) {
Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix());
Long longs = encodedSuffixesPairs.get(suffix);
if (longs == null) {
encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encode(suffixHeuristic.getNormalSuffix()));
}
// Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix());
// Long longs = encodedSuffixesPairs.get(suffix);
// if (longs == null) {
// encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encode(suffixHeuristic.getNormalSuffix()));
// }
}
public String getNormalForm(String form) {
@@ -49,6 +49,10 @@ public class Heuristic {
return form;
}
public Integer getAmount() {
return encodedSuffixesPairs.size();
}
public void readFromFile(String file) throws IOException {
BufferedReader reader = new BufferedReader(new FileReader(file));
String s = reader.readLine();

View File

@@ -0,0 +1,27 @@
package org.apache.lucene.russian.morphology.heuristic;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
public class HeuristicBySuffixLegth {
private Map<Long, Set<SuffixHeuristic>> heuristics = new HashMap<Long, Set<SuffixHeuristic>>();
public void addHeuristic(SuffixHeuristic suffixHeuristic) {
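// Group heuristics by their encoded form suffix: each suffix maps to the set of
// candidate heuristics observed for it.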
Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix());
Set<SuffixHeuristic> suffixHeuristics = heuristics.get(suffix);
if (suffixHeuristics == null) {
suffixHeuristics = new HashSet<SuffixHeuristic>();
heuristics.put(suffix, suffixHeuristics);
}
suffixHeuristics.add(suffixHeuristic);
}
public Map<Long, Set<SuffixHeuristic>> getHeuristics() {
return heuristics;
}
}

View File

@@ -17,6 +17,8 @@
package org.apache.lucene.russian.morphology.heuristic;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import org.apache.lucene.russian.morphology.dictonary.FlexiaModel;
import org.apache.lucene.russian.morphology.dictonary.GrammaReader;
import org.apache.lucene.russian.morphology.dictonary.WordCard;
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
@@ -27,16 +29,18 @@ import java.util.Map;
public class StatiticsCollectors implements WordProccessor {
Map<SuffixHeuristic, SuffixCounter> statititics = new HashMap<SuffixHeuristic, SuffixCounter>();
private Map<String, Double> wordsFreq;
private GrammaReader grammaInfo;
public StatiticsCollectors(Map<String, Double> wordsFreq) {
public StatiticsCollectors(Map<String, Double> wordsFreq, GrammaReader grammaInfo) {
this.wordsFreq = wordsFreq;
this.grammaInfo = grammaInfo;
}
private Integer ignoredCount = 0;
public void proccess(WordCard wordCard) {
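// Derive one suffix heuristic from every flexia of the word card and accumulate it
// in the statistics map.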
for (String form : wordCard.getWordsFroms()) {
SuffixHeuristic suffixHeuristic = createEvristic(wordCard.getCanonicalFrom(), form);
for (FlexiaModel fm : wordCard.getWordsFroms()) {
SuffixHeuristic suffixHeuristic = createEvristic(wordCard.getCanonicalFrom(), wordCard.getCanonicalSuffix(), fm);
if (suffixHeuristic == null) continue;
SuffixCounter suffixCounter = statititics.get(suffixHeuristic);
if (suffixCounter == null) {
@@ -57,19 +61,23 @@ public class StatiticsCollectors implements WordProccessor {
return statititics;
}
private SuffixHeuristic createEvristic(String word, String form) {
private SuffixHeuristic createEvristic(String wordBase, String canonicalSuffix, FlexiaModel fm) {
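// Rebuild the full word form from its base via the flexia model, then keep at most
// SUFFIX_LENGTH trailing characters as the form suffix.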
String form = fm.create(wordBase);
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
String formSuffix = form.substring(startSymbol);
if (word.length() < startSymbol) {
ignoredCount++;
return null;
}
String wordSuffix = word.length() > startSymbol ? word.substring(startSymbol) : "";
if (wordSuffix.length() > 12) {
System.out.println(word + " " + form);
return null;
}
return new SuffixHeuristic(formSuffix, wordSuffix);
String actualSuffix = fm.getSuffix();
Integer actualSuffixLengh = actualSuffix.length();
// if (word.length() < startSymbol) {
// ignoredCount++;
// return null;
// }
// String wordSuffix = word.length() > startSymbol ? word.substring(startSymbol) : "";
// if (wordSuffix.length() > 12) {
// System.out.println(word + " " + form);
// return null;
// }
// return new SuffixHeuristic(formSuffix, wordSuffix);
return new SuffixHeuristic(formSuffix, actualSuffixLengh, canonicalSuffix, fm.getCode());
}

View File

@@ -24,27 +24,31 @@ package org.apache.lucene.russian.morphology.heuristic;
*/
public class SuffixHeuristic {
private String formSuffix;
private String normalSuffix;
private Integer actualSuffixLength;
private String normalFromSuffix;
private String morphInfoCode;
public SuffixHeuristic(String formSuffix, String normalSuffix) {
public SuffixHeuristic(String formSuffix, Integer actualSuffixLength, String normalFromSuffix, String morphInfoCode) {
this.formSuffix = formSuffix;
this.normalSuffix = normalSuffix;
this.actualSuffixLength = actualSuffixLength;
this.normalFromSuffix = normalFromSuffix;
this.morphInfoCode = morphInfoCode;
}
public String getFormSuffix() {
return formSuffix;
}
public void setFormSuffix(String formSuffix) {
this.formSuffix = formSuffix;
public Integer getActualSuffixLength() {
return actualSuffixLength;
}
public String getNormalSuffix() {
return normalSuffix;
public String getNormalFromSuffix() {
return normalFromSuffix;
}
public void setNormalSuffix(String normalSuffix) {
this.normalSuffix = normalSuffix;
public String getMorphInfoCode() {
return morphInfoCode;
}
@Override
@@ -54,24 +58,28 @@ public class SuffixHeuristic {
SuffixHeuristic that = (SuffixHeuristic) o;
if (!formSuffix.equals(that.formSuffix)) return false;
if (!normalSuffix.equals(that.normalSuffix)) return false;
if (actualSuffixLength != null ? !actualSuffixLength.equals(that.actualSuffixLength) : that.actualSuffixLength != null)
return false;
if (formSuffix != null ? !formSuffix.equals(that.formSuffix) : that.formSuffix != null) return false;
if (morphInfoCode != null ? !morphInfoCode.equals(that.morphInfoCode) : that.morphInfoCode != null)
return false;
if (normalFromSuffix != null ? !normalFromSuffix.equals(that.normalFromSuffix) : that.normalFromSuffix != null)
return false;
return true;
}
@Override
public int hashCode() {
int result = formSuffix.hashCode();
result = 31 * result + normalSuffix.hashCode();
int result = formSuffix != null ? formSuffix.hashCode() : 0;
result = 31 * result + (actualSuffixLength != null ? actualSuffixLength.hashCode() : 0);
result = 31 * result + (normalFromSuffix != null ? normalFromSuffix.hashCode() : 0);
result = 31 * result + (morphInfoCode != null ? morphInfoCode.hashCode() : 0);
return result;
}
@Override
public String toString() {
return "SuffixHeuristic{" +
"formSuffix='" + formSuffix + '\'' +
", normalSuffix='" + normalSuffix + '\'' +
'}';
return formSuffix + " " + actualSuffixLength + " " + normalFromSuffix + " " + morphInfoCode;
}
}

View File

@@ -0,0 +1,13 @@
package org.apache.lucene.russian.morphology.heuristic;
public class SuffixHeuristicMerger {
public SuffixHeuristic merge(SuffixHeuristic one, SuffixHeuristic two) {
if (!one.getMorphInfoCode().equals(two.getMorphInfoCode()))
return null;
SuffixHeuristic min = one.getActualSuffixLength() > two.getActualSuffixLength() ? two : one;
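// TODO: merging is not implemented yet; the heuristic with the shorter actual suffix
// is selected but null is still returned.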
return null;
}
}

View File

@@ -6,3 +6,4 @@
произошло произойти
test test
ананасов ананас
встовашего встовать