start working on new version with morphology info
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@37 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
parent 214a8e2ebe
commit e4dd3a7a76
@@ -18,15 +18,18 @@ package org.apache.lucene.russian.morphology;
import org.apache.lucene.russian.morphology.dictonary.DictonaryReader;
import org.apache.lucene.russian.morphology.dictonary.FrequentyReader;
import org.apache.lucene.russian.morphology.dictonary.GrammaReader;
import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
import org.apache.lucene.russian.morphology.heuristic.Heuristic;
import org.apache.lucene.russian.morphology.heuristic.HeuristicBySuffixLegth;
import org.apache.lucene.russian.morphology.heuristic.StatiticsCollectors;
import org.apache.lucene.russian.morphology.heuristic.SuffixCounter;
import org.apache.lucene.russian.morphology.heuristic.SuffixHeuristic;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Set;
import java.util.TreeMap;


public class HeuristicBuilder {
@@ -35,10 +38,11 @@ public class HeuristicBuilder {
        Set<String> form = formReader.getIngnoredFroms();

        FrequentyReader frequentyReader = new FrequentyReader("data/lemma.num");

        GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
        DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form);

        StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read());
        StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read(), grammaInfo);
        dictonaryReader.proccess(statiticsCollectors);
        Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values();
        Object[] objects = counterCollection.toArray();
@@ -48,11 +52,52 @@ public class HeuristicBuilder {
            System.out.println(objects[i]);
        }

        final Heuristic heuristic = new Heuristic();
        final HeuristicBySuffixLegth heuristic = new HeuristicBySuffixLegth();
        for (int i = 0; i < objects.length; i++) {
            heuristic.addHeuristic(((SuffixCounter) objects[i]).getSuffixHeuristic());
        }

        heuristic.writeToFile("russianSuffixesHeuristic.txt");
        TreeMap<Integer, Integer> map = new TreeMap<Integer, Integer>();

        int ct = 0;
        for (Set<SuffixHeuristic> s : heuristic.getHeuristics().values()) {
            Integer d = map.get(s.size());
            map.put(s.size(), 1 + (d == null ? 0 : d));
            if (s.size() == 1) {
                ct++;
                continue;
            }
            SuffixHeuristic heuristic1 = s.iterator().next();
            Integer sufixSize = heuristic1.getActualSuffixLength();
            String normalSuffix = heuristic1.getNormalFromSuffix();
            if (heuristic1.getFormSuffix().length() < 6) {
                ct++;
                continue;
            }
            Boolean flag = true;
            if (sufixSize > 3) continue;
            for (SuffixHeuristic sh : s) {
                flag = flag && (sufixSize.equals(sh.getActualSuffixLength()))
                        && (normalSuffix.equals(sh.getNormalFromSuffix()));
            }
            if (flag) {
                System.out.println(s);
                ct++;
            }
            //HashSet<String> integers = new HashSet<String>();
            // for(SuffixHeuristic sh:s){
            // integers.add(sh.getMorphInfoCode());
            // }
            // if(s.size() == integers.size()){
            // ct++;
            // }else{
            // if(s.size() == 2) System.out.println(s);
            // }
        }
        System.out.println(objects.length);
        System.out.println(heuristic.getHeuristics().size());
        System.out.println(ct);
        System.out.println(map);
        //heuristic.writeToFile("russianSuffixesHeuristic.txt");
    }
}
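Read as a whole, the loop above builds an ambiguity histogram over the collected suffix heuristics. A hedged restatement, not part of the commit:

    // For every encoded form suffix, s is the set of competing SuffixHeuristic rules.
    // map counts how many suffixes have exactly k competing rules, e.g. {1=..., 2=..., 3=...},
    // and ct counts the suffixes treated as "resolved": unambiguous sets, short forms,
    // and ambiguous sets whose rules all agree on suffix length and normal-form suffix.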
@@ -24,7 +24,7 @@ package org.apache.lucene.russian.morphology;
 */
public class RussianSuffixDecoderEncoder {
    public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
    public static final int SUFFIX_LENGTH = 7;
    public static final int SUFFIX_LENGTH = 6;
    public static final int EE_CHAR = 34;
    public static final int E_CHAR = 6;
    public static final int DASH_CHAR = 45;

13 src/main/java/org/apache/lucene/russian/morphology/Test.java Normal file
@@ -0,0 +1,13 @@
package org.apache.lucene.russian.morphology;

import org.apache.lucene.russian.morphology.dictonary.GrammaReader;

import java.io.IOException;


public class Test {
    public static void main(String[] args) throws IOException {
        GrammaReader grammaReader = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
        System.out.println(grammaReader.getInversIndex().size());
    }
}
@@ -63,17 +63,18 @@ public class DictonaryReader {
        int count = Integer.valueOf(s);
        for (int i = 0; i < count; i++) {
            s = reader.readLine();
            if (i % 10000 == 0) System.out.println("Proccess " + i + " word of " + count);
            if (i % 10000 == 0) System.out.println("Proccess " + i + " wordBase of " + count);

            String[] wd = s.split(" ");
            String word = wd[0].toLowerCase();
            if (word.startsWith("-")) continue;
            word = "#".equals(word) ? "" : word;
            String wordBase = wd[0].toLowerCase();
            if (wordBase.startsWith("-")) continue;
            wordBase = "#".equals(wordBase) ? "" : wordBase;
            List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1]));
            if (models.size() > 0 && !ingnoredForm.contains(models.get(0).getCode())) {
                WordCard card = new WordCard(cleanString(models.get(0).create(word)));
            FlexiaModel flexiaModel = models.get(0);
            if (models.size() > 0 && !ingnoredForm.contains(flexiaModel.getCode())) {
                WordCard card = new WordCard(cleanString(flexiaModel.create(wordBase)), cleanString(wordBase), flexiaModel.getSuffix());
                for (FlexiaModel fm : models) {
                    card.addFrom(cleanString(fm.create(word)));
                    card.addFlexia(fm);
                }
                wordProccessor.proccess(card);
            }
@@ -118,9 +119,10 @@ public class DictonaryReader {
    private void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
        String[] fl = line.split("\\*");
        // we ignored all forms that have a prefix part:
        // if (fl.length == 3)
        //     flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
        if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
        if (fl.length == 3) {
            flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), cleanString(fl[2].toLowerCase())));
        }
        if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), ""));
    }

}
@@ -0,0 +1,58 @@
package org.apache.lucene.russian.morphology.dictonary;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

// TODO: split this class in two.
public class GrammaReader {
    private String fileName;
    private String fileEncoding = "windows-1251";
    private Map<Integer, String> grammaInfo = new HashMap<Integer, String>();
    private Map<String, Integer> inversIndex = new HashMap<String, Integer>();

    public GrammaReader(String fileName) throws IOException {
        this.fileName = fileName;
        setUp();
    }

    public GrammaReader(String fileName, String fileEncoding) throws IOException {
        this.fileName = fileName;
        this.fileEncoding = fileEncoding;
        setUp();
    }

    private void setUp() throws IOException {
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), fileEncoding));
        String line = bufferedReader.readLine();
        while (line != null) {
            line = line.trim();
            if (!line.startsWith("//") && line.length() > 0) {
                String[] strings = line.split(" ", 2);
                Integer i = grammaInfo.size();
                inversIndex.put(strings[0], i);
                grammaInfo.put(i, strings[1]);
            }
            line = bufferedReader.readLine();
        }
    }

    public Map<Integer, String> getGrammaInfo() {
        return grammaInfo;
    }

    public void setGrammaInfo(Map<Integer, String> grammaInfo) {
        this.grammaInfo = grammaInfo;
    }

    public Map<String, Integer> getInversIndex() {
        return inversIndex;
    }

    public void setInversIndex(Map<String, Integer> inversIndex) {
        this.inversIndex = inversIndex;
    }
}
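For orientation, a minimal usage fragment in the style of the Test class above (not part of the commit; the grammar code "аа" is a placeholder, not taken from the real rgramtab.tab):

    // setUp() assigns each grammar code a dense integer id in file order,
    // so the two maps are inverse views of the same table.
    GrammaReader gramma = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
    Integer id = gramma.getInversIndex().get("аа");   // code -> id ("аа" is a placeholder code)
    String info = gramma.getGrammaInfo().get(id);     // id -> full grammar description
    System.out.println(id + " " + info);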
@@ -24,21 +24,33 @@ import java.util.List;
 */
public class WordCard {
    private String canonicalFrom;
    private List<String> wordsFroms = new ArrayList<String>();
    private String base;
    private String canonicalSuffix;
    private List<FlexiaModel> wordsFroms = new ArrayList<FlexiaModel>();

    protected WordCard(String canonicalFrom) {
    public WordCard(String canonicalFrom, String base, String canonicalSuffix) {
        this.canonicalFrom = canonicalFrom;
        this.canonicalSuffix = canonicalSuffix;
        this.base = base;
    }

    protected void addFrom(String word) {
        wordsFroms.add(word);
    public void addFlexia(FlexiaModel flexiaModel) {
        wordsFroms.add(flexiaModel);
    }

    public String getCanonicalFrom() {
        return canonicalFrom;
    }

    public List<String> getWordsFroms() {
    public String getCanonicalSuffix() {
        return canonicalSuffix;
    }

    public String getBase() {
        return base;
    }

    public List<FlexiaModel> getWordsFroms() {
        return wordsFroms;
    }
}
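A hedged sketch of how the reworked WordCard is meant to be filled (illustrative values only; it assumes FlexiaModel(code, suffix, prefix) with create(base) = prefix + base + suffix, as suggested by DictonaryReader above, and the ancode "аа" is a placeholder):

    WordCard card = new WordCard("ананас", "ананас", "");    // canonical form, word base, canonical suffix
    card.addFlexia(new FlexiaModel("аа", "", ""));           // the canonical form itself
    card.addFlexia(new FlexiaModel("аа", "ов", ""));         // e.g. the form "ананасов"
    for (FlexiaModel fm : card.getWordsFroms()) {
        System.out.println(fm.create(card.getBase()) + " -> " + card.getCanonicalFrom());
    }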
@@ -29,11 +29,11 @@ public class Heuristic {
    private TreeMap<Long, Long> encodedSuffixesPairs = new TreeMap<Long, Long>();

    public void addHeuristic(SuffixHeuristic suffixHeuristic) {
        Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix());
        Long longs = encodedSuffixesPairs.get(suffix);
        if (longs == null) {
            encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encode(suffixHeuristic.getNormalSuffix()));
        }
        // Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix());
        // Long longs = encodedSuffixesPairs.get(suffix);
        // if (longs == null) {
        //     encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encode(suffixHeuristic.getNormalSuffix()));
        // }
    }

    public String getNormalForm(String form) {
@@ -49,6 +49,10 @@ public class Heuristic {
        return form;
    }

    public Integer getAmount() {
        return encodedSuffixesPairs.size();
    }

    public void readFromFile(String file) throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(file));
        String s = reader.readLine();

@@ -0,0 +1,27 @@
package org.apache.lucene.russian.morphology.heuristic;

import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;


public class HeuristicBySuffixLegth {
    private Map<Long, Set<SuffixHeuristic>> heuristics = new HashMap<Long, Set<SuffixHeuristic>>();

    public void addHeuristic(SuffixHeuristic suffixHeuristic) {
        Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix());
        Set<SuffixHeuristic> suffixHeuristics = heuristics.get(suffix);
        if (suffixHeuristics == null) {
            suffixHeuristics = new HashSet<SuffixHeuristic>();
            heuristics.put(suffix, suffixHeuristics);
        }
        suffixHeuristics.add(suffixHeuristic);
    }

    public Map<Long, Set<SuffixHeuristic>> getHeuristics() {
        return heuristics;
    }
}
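A hedged fragment showing what the new grouping gives the analysis loop in HeuristicBuilder (not part of the commit; the suffixes and the ancode "аа" are illustrative):

    HeuristicBySuffixLegth byLength = new HeuristicBySuffixLegth();
    byLength.addHeuristic(new SuffixHeuristic("анасов", 2, "", "аа"));   // e.g. "ананасов" -> strip 2 letters -> "ананас"
    byLength.addHeuristic(new SuffixHeuristic("анасов", 2, "а", "аа"));  // a competing normalization for the same ending
    Set<SuffixHeuristic> candidates =
            byLength.getHeuristics().get(RussianSuffixDecoderEncoder.encode("анасов"));
    System.out.println(candidates.size());   // 2: the ending is ambiguous, which is what the loop above counts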
@@ -17,6 +17,8 @@
package org.apache.lucene.russian.morphology.heuristic;

import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import org.apache.lucene.russian.morphology.dictonary.FlexiaModel;
import org.apache.lucene.russian.morphology.dictonary.GrammaReader;
import org.apache.lucene.russian.morphology.dictonary.WordCard;
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;

@@ -27,16 +29,18 @@ import java.util.Map;
public class StatiticsCollectors implements WordProccessor {
    Map<SuffixHeuristic, SuffixCounter> statititics = new HashMap<SuffixHeuristic, SuffixCounter>();
    private Map<String, Double> wordsFreq;
    private GrammaReader grammaInfo;

    public StatiticsCollectors(Map<String, Double> wordsFreq) {
    public StatiticsCollectors(Map<String, Double> wordsFreq, GrammaReader grammaInfo) {
        this.wordsFreq = wordsFreq;
        this.grammaInfo = grammaInfo;
    }

    private Integer ignoredCount = 0;

    public void proccess(WordCard wordCard) {
        for (String form : wordCard.getWordsFroms()) {
            SuffixHeuristic suffixHeuristic = createEvristic(wordCard.getCanonicalFrom(), form);
        for (FlexiaModel fm : wordCard.getWordsFroms()) {
            SuffixHeuristic suffixHeuristic = createEvristic(wordCard.getCanonicalFrom(), wordCard.getCanonicalSuffix(), fm);
            if (suffixHeuristic == null) continue;
            SuffixCounter suffixCounter = statititics.get(suffixHeuristic);
            if (suffixCounter == null) {
@@ -57,19 +61,23 @@ public class StatiticsCollectors implements WordProccessor {
        return statititics;
    }

    private SuffixHeuristic createEvristic(String word, String form) {
    private SuffixHeuristic createEvristic(String wordBase, String canonicalSuffix, FlexiaModel fm) {
        String form = fm.create(wordBase);
        int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
        String formSuffix = form.substring(startSymbol);
        if (word.length() < startSymbol) {
            ignoredCount++;
            return null;
        }
        String wordSuffix = word.length() > startSymbol ? word.substring(startSymbol) : "";
        if (wordSuffix.length() > 12) {
            System.out.println(word + " " + form);
            return null;
        }
        return new SuffixHeuristic(formSuffix, wordSuffix);
        String actualSuffix = fm.getSuffix();
        Integer actualSuffixLengh = actualSuffix.length();
        // if (word.length() < startSymbol) {
        //     ignoredCount++;
        //     return null;
        // }
        // String wordSuffix = word.length() > startSymbol ? word.substring(startSymbol) : "";
        // if (wordSuffix.length() > 12) {
        //     System.out.println(word + " " + form);
        //     return null;
        // }
        // return new SuffixHeuristic(formSuffix, wordSuffix);
        return new SuffixHeuristic(formSuffix, actualSuffixLengh, canonicalSuffix, fm.getCode());
    }
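A hedged worked example of what createEvristic records now, using the "ананасов / ананас" pair from the test data below (it assumes FlexiaModel(code, suffix, prefix) with create(base) = prefix + base + suffix; the ancode "аа" is a placeholder):

    FlexiaModel fm = new FlexiaModel("аа", "ов", "");
    String form = fm.create("ананас");                // -> "ананасов"
    // With SUFFIX_LENGTH = 6 the collector keeps the last six letters of the form:
    //   formSuffix = "анасов", actualSuffixLengh = 2, canonicalSuffix = "", morphInfoCode = "аа"
    // i.e. "strip the last 2 letters of a word ending in 'анасов' and add the canonical suffix".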
@@ -24,27 +24,31 @@ package org.apache.lucene.russian.morphology.heuristic;
 */
public class SuffixHeuristic {
    private String formSuffix;
    private String normalSuffix;
    private Integer actualSuffixLength;
    private String normalFromSuffix;
    private String morphInfoCode;

    public SuffixHeuristic(String formSuffix, String normalSuffix) {
    public SuffixHeuristic(String formSuffix, Integer actualSuffixLength, String normalFromSuffix, String morphInfoCode) {
        this.formSuffix = formSuffix;
        this.normalSuffix = normalSuffix;
        this.actualSuffixLength = actualSuffixLength;
        this.normalFromSuffix = normalFromSuffix;
        this.morphInfoCode = morphInfoCode;
    }

    public String getFormSuffix() {
        return formSuffix;
    }

    public void setFormSuffix(String formSuffix) {
        this.formSuffix = formSuffix;
    public Integer getActualSuffixLength() {
        return actualSuffixLength;
    }

    public String getNormalSuffix() {
        return normalSuffix;
    public String getNormalFromSuffix() {
        return normalFromSuffix;
    }

    public void setNormalSuffix(String normalSuffix) {
        this.normalSuffix = normalSuffix;
    public String getMorphInfoCode() {
        return morphInfoCode;
    }

    @Override
@@ -54,24 +58,28 @@ public class SuffixHeuristic {

        SuffixHeuristic that = (SuffixHeuristic) o;

        if (!formSuffix.equals(that.formSuffix)) return false;
        if (!normalSuffix.equals(that.normalSuffix)) return false;
        if (actualSuffixLength != null ? !actualSuffixLength.equals(that.actualSuffixLength) : that.actualSuffixLength != null)
            return false;
        if (formSuffix != null ? !formSuffix.equals(that.formSuffix) : that.formSuffix != null) return false;
        if (morphInfoCode != null ? !morphInfoCode.equals(that.morphInfoCode) : that.morphInfoCode != null)
            return false;
        if (normalFromSuffix != null ? !normalFromSuffix.equals(that.normalFromSuffix) : that.normalFromSuffix != null)
            return false;

        return true;
    }

    @Override
    public int hashCode() {
        int result = formSuffix.hashCode();
        result = 31 * result + normalSuffix.hashCode();
        int result = formSuffix != null ? formSuffix.hashCode() : 0;
        result = 31 * result + (actualSuffixLength != null ? actualSuffixLength.hashCode() : 0);
        result = 31 * result + (normalFromSuffix != null ? normalFromSuffix.hashCode() : 0);
        result = 31 * result + (morphInfoCode != null ? morphInfoCode.hashCode() : 0);
        return result;
    }

    @Override
    public String toString() {
        return "SuffixHeuristic{" +
                "formSuffix='" + formSuffix + '\'' +
                ", normalSuffix='" + normalSuffix + '\'' +
                '}';
        return formSuffix + " " + actualSuffixLength + " " + normalFromSuffix + " " + morphInfoCode;
    }
}
@@ -0,0 +1,13 @@
package org.apache.lucene.russian.morphology.heuristic;


public class SuffixHeuristicMerger {

    public SuffixHeuristic merge(SuffixHeuristic one, SuffixHeuristic two) {
        if (!one.getMorphInfoCode().equals(two.getMorphInfoCode()))
            return null;
        SuffixHeuristic min = one.getActualSuffixLength() > two.getActualSuffixLength() ? two : one;

        return null;
    }
}
@@ -5,4 +5,5 @@
поэтическая поэтический
произошло произойти
test test
ананасов ананас
встовашего встовать