Start working on a new version with morphology info
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@37 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
parent 214a8e2ebe
commit e4dd3a7a76
HeuristicBuilder.java

@@ -18,15 +18,18 @@ package org.apache.lucene.russian.morphology;

 import org.apache.lucene.russian.morphology.dictonary.DictonaryReader;
 import org.apache.lucene.russian.morphology.dictonary.FrequentyReader;
+import org.apache.lucene.russian.morphology.dictonary.GrammaReader;
 import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
-import org.apache.lucene.russian.morphology.heuristic.Heuristic;
+import org.apache.lucene.russian.morphology.heuristic.HeuristicBySuffixLegth;
 import org.apache.lucene.russian.morphology.heuristic.StatiticsCollectors;
 import org.apache.lucene.russian.morphology.heuristic.SuffixCounter;
+import org.apache.lucene.russian.morphology.heuristic.SuffixHeuristic;

 import java.io.IOException;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Set;
+import java.util.TreeMap;


 public class HeuristicBuilder {

@@ -35,10 +38,11 @@ public class HeuristicBuilder {
         Set<String> form = formReader.getIngnoredFroms();

         FrequentyReader frequentyReader = new FrequentyReader("data/lemma.num");
+        GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
         DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form);

-        StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read());
+        StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read(), grammaInfo);
         dictonaryReader.proccess(statiticsCollectors);
         Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values();
         Object[] objects = counterCollection.toArray();

@@ -48,11 +52,52 @@
             System.out.println(objects[i]);
         }

-        final Heuristic heuristic = new Heuristic();
+        final HeuristicBySuffixLegth heuristic = new HeuristicBySuffixLegth();
         for (int i = 0; i < objects.length; i++) {
             heuristic.addHeuristic(((SuffixCounter) objects[i]).getSuffixHeuristic());
         }

-        heuristic.writeToFile("russianSuffixesHeuristic.txt");
+        TreeMap<Integer, Integer> map = new TreeMap<Integer, Integer>();
+
+        int ct = 0;
+        for (Set<SuffixHeuristic> s : heuristic.getHeuristics().values()) {
+            Integer d = map.get(s.size());
+            map.put(s.size(), 1 + (d == null ? 0 : d));
+            if (s.size() == 1) {
+                ct++;
+                continue;
+            }
+            SuffixHeuristic heuristic1 = s.iterator().next();
+            Integer sufixSize = heuristic1.getActualSuffixLength();
+            String normalSuffix = heuristic1.getNormalFromSuffix();
+            if (heuristic1.getFormSuffix().length() < 6) {
+                ct++;
+                continue;
+            }
+            Boolean flag = true;
+            if (sufixSize > 3) continue;
+            for (SuffixHeuristic sh : s) {
+                flag = flag && (sufixSize.equals(sh.getActualSuffixLength()))
+                        && (normalSuffix.equals(sh.getNormalFromSuffix()));
+            }
+            if (flag) {
+                System.out.println(s);
+                ct++;
+            }
+            //HashSet<String> integers = new HashSet<String>();
+            // for(SuffixHeuristic sh:s){
+            //     integers.add(sh.getMorphInfoCode());
+            // }
+            // if(s.size() == integers.size()){
+            //     ct++;
+            // }else{
+            //     if(s.size() == 2) System.out.println(s);
+            // }
+        }
+        System.out.println(objects.length);
+        System.out.println(heuristic.getHeuristics().size());
+        System.out.println(ct);
+        System.out.println(map);
+        //heuristic.writeToFile("russianSuffixesHeuristic.txt");
     }
 }
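The replacement for writeToFile above is an exploratory analysis loop: it walks the collected groups of heuristics, builds a TreeMap histogram of group sizes, and counts the groups that look unambiguous. Stripped of the project classes, the counting pattern boils down to the following self-contained sketch; the String groups and sample data are hypothetical and added here only for illustration.

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeMap;

// Minimal sketch of the histogram/counting pattern used in HeuristicBuilder:
// for every group, record how many groups have each size.
public class GroupSizeHistogram {
    public static void main(String[] args) {
        List<Set<String>> groups = Arrays.asList(
                new HashSet<String>(Arrays.asList("a")),      // size 1
                new HashSet<String>(Arrays.asList("b", "c")), // size 2
                new HashSet<String>(Arrays.asList("d")));     // size 1

        TreeMap<Integer, Integer> histogram = new TreeMap<Integer, Integer>();
        int unambiguous = 0; // groups with exactly one element
        for (Set<String> group : groups) {
            Integer seen = histogram.get(group.size());
            histogram.put(group.size(), 1 + (seen == null ? 0 : seen));
            if (group.size() == 1) unambiguous++;
        }
        System.out.println(histogram);   // {1=2, 2=1}
        System.out.println(unambiguous); // 2
    }
}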
RussianSuffixDecoderEncoder.java

@@ -24,7 +24,7 @@ package org.apache.lucene.russian.morphology;
  */
 public class RussianSuffixDecoderEncoder {
     public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
-    public static final int SUFFIX_LENGTH = 7;
+    public static final int SUFFIX_LENGTH = 6;
     public static final int EE_CHAR = 34;
     public static final int E_CHAR = 6;
     public static final int DASH_CHAR = 45;
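The only change here is SUFFIX_LENGTH dropping from 7 to 6, meaning at most the last six characters of a word form get encoded into a single long key. The encode routine itself is not part of this diff; the sketch below shows one assumed way such a packing could work, where the base of 50 and the per-character mapping are illustrative guesses and not the project's actual scheme.

// Illustrative only: packs up to SUFFIX_LENGTH trailing characters into one long.
// The character codes and the base are assumptions; the real
// RussianSuffixDecoderEncoder.encode may use a different mapping.
public class SuffixPackingSketch {
    static final int SUFFIX_LENGTH = 6;
    static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
    static final int BASE = 50; // assumed upper bound on per-character codes

    static long encode(String suffix) {
        if (suffix.length() > SUFFIX_LENGTH) {
            suffix = suffix.substring(suffix.length() - SUFFIX_LENGTH);
        }
        long result = 0;
        for (char c : suffix.toCharArray()) {
            int code = (c == '-') ? 45 : c - RUSSIAN_SMALL_LETTER_OFFSET;
            result = result * BASE + code;
        }
        return result;
    }

    public static void main(String[] args) {
        System.out.println(encode("ами")); // 3159 under this assumed scheme
    }
}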
src/main/java/org/apache/lucene/russian/morphology/Test.java (new file, 13 lines)

@@ -0,0 +1,13 @@
+package org.apache.lucene.russian.morphology;
+
+import org.apache.lucene.russian.morphology.dictonary.GrammaReader;
+
+import java.io.IOException;
+
+
+public class Test {
+    public static void main(String[] args) throws IOException {
+        GrammaReader grammaReader = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
+        System.out.println(grammaReader.getInversIndex().size());
+    }
+}
DictonaryReader.java

@@ -63,17 +63,18 @@ public class DictonaryReader {
         int count = Integer.valueOf(s);
         for (int i = 0; i < count; i++) {
             s = reader.readLine();
-            if (i % 10000 == 0) System.out.println("Proccess " + i + " word of " + count);
+            if (i % 10000 == 0) System.out.println("Proccess " + i + " wordBase of " + count);

             String[] wd = s.split(" ");
-            String word = wd[0].toLowerCase();
-            if (word.startsWith("-")) continue;
-            word = "#".equals(word) ? "" : word;
+            String wordBase = wd[0].toLowerCase();
+            if (wordBase.startsWith("-")) continue;
+            wordBase = "#".equals(wordBase) ? "" : wordBase;
             List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1]));
-            if (models.size() > 0 && !ingnoredForm.contains(models.get(0).getCode())) {
-                WordCard card = new WordCard(cleanString(models.get(0).create(word)));
+            FlexiaModel flexiaModel = models.get(0);
+            if (models.size() > 0 && !ingnoredForm.contains(flexiaModel.getCode())) {
+                WordCard card = new WordCard(cleanString(flexiaModel.create(wordBase)), cleanString(wordBase), flexiaModel.getSuffix());
                 for (FlexiaModel fm : models) {
-                    card.addFrom(cleanString(fm.create(word)));
+                    card.addFlexia(fm);
                 }
                 wordProccessor.proccess(card);
             }

@@ -118,9 +119,10 @@
     private void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
         String[] fl = line.split("\\*");
         // we inored all forms thats
-        // if (fl.length == 3)
-        //     flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
-        if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
+        if (fl.length == 3) {
+            flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), cleanString(fl[2].toLowerCase())));
+        }
+        if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], cleanString(fl[0].toLowerCase()), ""));
     }

 }
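addFlexia now accepts the three-field flexia records it previously skipped, and runs the extra fields through cleanString as well. Purely as an illustration of the split("\*") branching above, the sketch below uses invented sample records and makes no claim about what each field means in the real morphs.mrd.

import java.util.Arrays;

// Hypothetical flexia records; the real morphs.mrd field semantics may differ.
public class FlexiaSplitSketch {
    public static void main(String[] args) {
        for (String line : new String[]{"ов*аа", "ами*аб*по"}) {
            String[] fl = line.split("\\*");
            System.out.println(fl.length + " fields: " + Arrays.toString(fl));
            // 2 fields -> handled by the fl.length == 2 branch (empty third argument)
            // 3 fields -> now handled by the new fl.length == 3 branch
        }
    }
}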
GrammaReader.java (new file, 58 lines)

@@ -0,0 +1,58 @@
+package org.apache.lucene.russian.morphology.dictonary;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.HashMap;
+import java.util.Map;
+
+//todo spleet this class on two.
+public class GrammaReader {
+    private String fileName;
+    private String fileEncoding = "windows-1251";
+    private Map<Integer, String> grammaInfo = new HashMap<Integer, String>();
+    private Map<String, Integer> inversIndex = new HashMap<String, Integer>();
+
+    public GrammaReader(String fileName) throws IOException {
+        this.fileName = fileName;
+        setUp();
+    }
+
+    public GrammaReader(String fileName, String fileEncoding) throws IOException {
+        this.fileName = fileName;
+        this.fileEncoding = fileEncoding;
+        setUp();
+    }
+
+    private void setUp() throws IOException {
+        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), fileEncoding));
+        String line = bufferedReader.readLine();
+        while (line != null) {
+            line = line.trim();
+            if (!line.startsWith("//") && line.length() > 0) {
+                String[] strings = line.split(" ", 2);
+                Integer i = grammaInfo.size();
+                inversIndex.put(strings[0], i);
+                grammaInfo.put(i, strings[1]);
+            }
+            line = bufferedReader.readLine();
+        }
+    }
+
+    public Map<Integer, String> getGrammaInfo() {
+        return grammaInfo;
+    }
+
+    public void setGrammaInfo(Map<Integer, String> grammaInfo) {
+        this.grammaInfo = grammaInfo;
+    }
+
+    public Map<String, Integer> getInversIndex() {
+        return inversIndex;
+    }
+
+    public void setInversIndex(Map<String, Integer> inversIndex) {
+        this.inversIndex = inversIndex;
+    }
+}
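GrammaReader.setUp assigns every non-comment line of rgramtab.tab a sequential integer: the first token of the line becomes a key in inversIndex and the remainder of the line becomes the value stored in grammaInfo. The same parsing loop is shown below fed from an in-memory array instead of the windows-1251 file; the sample records are hypothetical, not real rgramtab.tab contents.

import java.util.HashMap;
import java.util.Map;

// Stand-alone sketch of the GrammaReader.setUp parsing loop.
public class GrammaIndexSketch {
    public static void main(String[] args) {
        String[] lines = {"// comment line is skipped", "аа 1 С мр,ед,им", "аб 2 С мр,ед,рд"};
        Map<Integer, String> grammaInfo = new HashMap<Integer, String>();
        Map<String, Integer> inversIndex = new HashMap<String, Integer>();
        for (String line : lines) {
            line = line.trim();
            if (!line.startsWith("//") && line.length() > 0) {
                String[] strings = line.split(" ", 2);
                Integer i = grammaInfo.size();   // sequential code
                inversIndex.put(strings[0], i);  // first token -> code
                grammaInfo.put(i, strings[1]);   // code -> rest of the line
            }
        }
        System.out.println(inversIndex.get("аа")); // 0
        System.out.println(grammaInfo.get(1));     // "2 С мр,ед,рд"
    }
}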
WordCard.java

@@ -24,21 +24,33 @@ import java.util.List;
  */
 public class WordCard {
     private String canonicalFrom;
-    private List<String> wordsFroms = new ArrayList<String>();
+    private String base;
+    private String canonicalSuffix;
+    private List<FlexiaModel> wordsFroms = new ArrayList<FlexiaModel>();

-    protected WordCard(String canonicalFrom) {
+    public WordCard(String canonicalFrom, String base, String canonicalSuffix) {
         this.canonicalFrom = canonicalFrom;
+        this.canonicalSuffix = canonicalSuffix;
+        this.base = base;
     }

-    protected void addFrom(String word) {
-        wordsFroms.add(word);
+    public void addFlexia(FlexiaModel flexiaModel) {
+        wordsFroms.add(flexiaModel);
     }

     public String getCanonicalFrom() {
         return canonicalFrom;
     }

-    public List<String> getWordsFroms() {
+    public String getCanonicalSuffix() {
+        return canonicalSuffix;
+    }
+
+    public String getBase() {
+        return base;
+    }
+
+    public List<FlexiaModel> getWordsFroms() {
         return wordsFroms;
     }
 }
Heuristic.java

@@ -29,11 +29,11 @@ public class Heuristic {
     private TreeMap<Long, Long> encodedSuffixesPairs = new TreeMap<Long, Long>();

     public void addHeuristic(SuffixHeuristic suffixHeuristic) {
-        Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix());
-        Long longs = encodedSuffixesPairs.get(suffix);
-        if (longs == null) {
-            encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encode(suffixHeuristic.getNormalSuffix()));
-        }
+        // Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix());
+        // Long longs = encodedSuffixesPairs.get(suffix);
+        // if (longs == null) {
+        //     encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encode(suffixHeuristic.getNormalSuffix()));
+        // }
     }

     public String getNormalForm(String form) {

@@ -49,6 +49,10 @@ public class Heuristic {
         return form;
     }

+    public Integer getAmount() {
+        return encodedSuffixesPairs.size();
+    }
+
     public void readFromFile(String file) throws IOException {
         BufferedReader reader = new BufferedReader(new FileReader(file));
         String s = reader.readLine();
HeuristicBySuffixLegth.java (new file, 27 lines)

@@ -0,0 +1,27 @@
+package org.apache.lucene.russian.morphology.heuristic;
+
+import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+
+public class HeuristicBySuffixLegth {
+    private Map<Long, Set<SuffixHeuristic>> heuristics = new HashMap<Long, Set<SuffixHeuristic>>();
+
+    public void addHeuristic(SuffixHeuristic suffixHeuristic) {
+        Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix());
+        Set<SuffixHeuristic> suffixHeuristics = heuristics.get(suffix);
+        if (suffixHeuristics == null) {
+            suffixHeuristics = new HashSet<SuffixHeuristic>();
+            heuristics.put(suffix, suffixHeuristics);
+        }
+        suffixHeuristics.add(suffixHeuristic);
+    }
+
+    public Map<Long, Set<SuffixHeuristic>> getHeuristics() {
+        return heuristics;
+    }
+}

StatiticsCollectors.java

@@ -17,6 +17,8 @@
 package org.apache.lucene.russian.morphology.heuristic;

 import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
+import org.apache.lucene.russian.morphology.dictonary.FlexiaModel;
+import org.apache.lucene.russian.morphology.dictonary.GrammaReader;
 import org.apache.lucene.russian.morphology.dictonary.WordCard;
 import org.apache.lucene.russian.morphology.dictonary.WordProccessor;

@@ -27,16 +29,18 @@ import java.util.Map;
 public class StatiticsCollectors implements WordProccessor {
     Map<SuffixHeuristic, SuffixCounter> statititics = new HashMap<SuffixHeuristic, SuffixCounter>();
     private Map<String, Double> wordsFreq;
+    private GrammaReader grammaInfo;

-    public StatiticsCollectors(Map<String, Double> wordsFreq) {
+    public StatiticsCollectors(Map<String, Double> wordsFreq, GrammaReader grammaInfo) {
         this.wordsFreq = wordsFreq;
+        this.grammaInfo = grammaInfo;
     }

     private Integer ignoredCount = 0;

     public void proccess(WordCard wordCard) {
-        for (String form : wordCard.getWordsFroms()) {
-            SuffixHeuristic suffixHeuristic = createEvristic(wordCard.getCanonicalFrom(), form);
+        for (FlexiaModel fm : wordCard.getWordsFroms()) {
+            SuffixHeuristic suffixHeuristic = createEvristic(wordCard.getCanonicalFrom(), wordCard.getCanonicalSuffix(), fm);
             if (suffixHeuristic == null) continue;
             SuffixCounter suffixCounter = statititics.get(suffixHeuristic);
             if (suffixCounter == null) {

@@ -57,19 +61,23 @@
         return statititics;
     }

-    private SuffixHeuristic createEvristic(String word, String form) {
+    private SuffixHeuristic createEvristic(String wordBase, String canonicalSuffix, FlexiaModel fm) {
+        String form = fm.create(wordBase);
         int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
         String formSuffix = form.substring(startSymbol);
-        if (word.length() < startSymbol) {
-            ignoredCount++;
-            return null;
-        }
-        String wordSuffix = word.length() > startSymbol ? word.substring(startSymbol) : "";
-        if (wordSuffix.length() > 12) {
-            System.out.println(word + " " + form);
-            return null;
-        }
-        return new SuffixHeuristic(formSuffix, wordSuffix);
+        String actualSuffix = fm.getSuffix();
+        Integer actualSuffixLengh = actualSuffix.length();
+        // if (word.length() < startSymbol) {
+        //     ignoredCount++;
+        //     return null;
+        // }
+        // String wordSuffix = word.length() > startSymbol ? word.substring(startSymbol) : "";
+        // if (wordSuffix.length() > 12) {
+        //     System.out.println(word + " " + form);
+        //     return null;
+        // }
+        // return new SuffixHeuristic(formSuffix, wordSuffix);
+        return new SuffixHeuristic(formSuffix, actualSuffixLengh, canonicalSuffix, fm.getCode());
     }

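HeuristicBySuffixLegth keeps a map of sets, so every SuffixHeuristic whose form suffix encodes to the same long lands in one group, and StatiticsCollectors now feeds it one SuffixHeuristic per FlexiaModel. The get-or-create bucket pattern in isolation, with plain strings and invented Long keys standing in for encoded suffixes, looks like this:

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

// Sketch of the bucket-per-key pattern used by HeuristicBySuffixLegth,
// with Long keys standing in for encoded form suffixes.
public class GroupByKeySketch {
    private Map<Long, Set<String>> groups = new HashMap<Long, Set<String>>();

    public void add(Long key, String value) {
        Set<String> bucket = groups.get(key);
        if (bucket == null) {
            bucket = new HashSet<String>();
            groups.put(key, bucket);
        }
        bucket.add(value);
    }

    public static void main(String[] args) {
        GroupByKeySketch sketch = new GroupByKeySketch();
        sketch.add(42L, "heuristic A");
        sketch.add(42L, "heuristic B"); // same key -> same group
        sketch.add(7L, "heuristic C");
        System.out.println(sketch.groups); // e.g. {42=[heuristic A, heuristic B], 7=[heuristic C]}
    }
}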
SuffixHeuristic.java

@@ -24,27 +24,31 @@ package org.apache.lucene.russian.morphology.heuristic;
  */
 public class SuffixHeuristic {
     private String formSuffix;
-    private String normalSuffix;
+    private Integer actualSuffixLength;
+    private String normalFromSuffix;
+    private String morphInfoCode;

-    public SuffixHeuristic(String formSuffix, String normalSuffix) {
+    public SuffixHeuristic(String formSuffix, Integer actualSuffixLength, String normalFromSuffix, String morphInfoCode) {
         this.formSuffix = formSuffix;
-        this.normalSuffix = normalSuffix;
+        this.actualSuffixLength = actualSuffixLength;
+        this.normalFromSuffix = normalFromSuffix;
+        this.morphInfoCode = morphInfoCode;
     }

     public String getFormSuffix() {
         return formSuffix;
     }

-    public void setFormSuffix(String formSuffix) {
-        this.formSuffix = formSuffix;
+    public Integer getActualSuffixLength() {
+        return actualSuffixLength;
     }

-    public String getNormalSuffix() {
-        return normalSuffix;
+    public String getNormalFromSuffix() {
+        return normalFromSuffix;
     }

-    public void setNormalSuffix(String normalSuffix) {
-        this.normalSuffix = normalSuffix;
+    public String getMorphInfoCode() {
+        return morphInfoCode;
     }

     @Override

@@ -54,24 +58,28 @@

         SuffixHeuristic that = (SuffixHeuristic) o;

-        if (!formSuffix.equals(that.formSuffix)) return false;
-        if (!normalSuffix.equals(that.normalSuffix)) return false;
+        if (actualSuffixLength != null ? !actualSuffixLength.equals(that.actualSuffixLength) : that.actualSuffixLength != null)
+            return false;
+        if (formSuffix != null ? !formSuffix.equals(that.formSuffix) : that.formSuffix != null) return false;
+        if (morphInfoCode != null ? !morphInfoCode.equals(that.morphInfoCode) : that.morphInfoCode != null)
+            return false;
+        if (normalFromSuffix != null ? !normalFromSuffix.equals(that.normalFromSuffix) : that.normalFromSuffix != null)
+            return false;

         return true;
     }

     @Override
     public int hashCode() {
-        int result = formSuffix.hashCode();
-        result = 31 * result + normalSuffix.hashCode();
+        int result = formSuffix != null ? formSuffix.hashCode() : 0;
+        result = 31 * result + (actualSuffixLength != null ? actualSuffixLength.hashCode() : 0);
+        result = 31 * result + (normalFromSuffix != null ? normalFromSuffix.hashCode() : 0);
+        result = 31 * result + (morphInfoCode != null ? morphInfoCode.hashCode() : 0);
         return result;
     }

     @Override
     public String toString() {
-        return "SuffixHeuristic{" +
-                "formSuffix='" + formSuffix + '\'' +
-                ", normalSuffix='" + normalSuffix + '\'' +
-                '}';
+        return formSuffix + " " + actualSuffixLength + " " + normalFromSuffix + " " + morphInfoCode;
     }
 }
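The regenerated equals and hashCode make SuffixHeuristic a value object over all four fields, which is what allows StatiticsCollectors to use it as a HashMap key and HeuristicBySuffixLegth to deduplicate it inside a HashSet. A generic illustration of that contract with a small stand-in class (not the project's code; the field names are borrowed for readability only):

import java.util.HashMap;
import java.util.Map;

// Stand-in value class: equal fields -> equal keys, so the second put
// replaces the first entry instead of adding a new one.
public class ValueKeySketch {
    static final class Key {
        final String formSuffix;
        final Integer actualSuffixLength;

        Key(String formSuffix, Integer actualSuffixLength) {
            this.formSuffix = formSuffix;
            this.actualSuffixLength = actualSuffixLength;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (!(o instanceof Key)) return false;
            Key that = (Key) o;
            return formSuffix.equals(that.formSuffix)
                    && actualSuffixLength.equals(that.actualSuffixLength);
        }

        @Override
        public int hashCode() {
            return 31 * formSuffix.hashCode() + actualSuffixLength.hashCode();
        }
    }

    public static void main(String[] args) {
        Map<Key, Integer> counts = new HashMap<Key, Integer>();
        counts.put(new Key("ов", 2), 1);
        counts.put(new Key("ов", 2), 2); // equal value object -> same entry
        System.out.println(counts.size()); // 1
    }
}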
SuffixHeuristicMerger.java (new file, 13 lines)

@@ -0,0 +1,13 @@
+package org.apache.lucene.russian.morphology.heuristic;
+
+
+public class SuffixHeuristicMerger {
+
+    public SuffixHeuristic merge(SuffixHeuristic one, SuffixHeuristic two) {
+        if (!one.getMorphInfoCode().equals(two.getMorphInfoCode()))
+            return null;
+        SuffixHeuristic min = one.getActualSuffixLength() > two.getActualSuffixLength() ? two : one;
+
+        return null;
+    }
+}
(word list data file)

@@ -6,3 +6,4 @@
 произошло произойти
 test test
 ананасов ананас
+встовашего встовать