first commit
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@2 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
13
src/main/java/org/apache/lucene/App.java
Normal file
13
src/main/java/org/apache/lucene/App.java
Normal file
@ -0,0 +1,13 @@
|
||||
package org.apache.lucene;
|
||||
|
||||
/**
|
||||
* Hello world!
|
||||
*
|
||||
*/
|
||||
/**
 * Minimal Maven-archetype demo class; prints a greeting to stdout.
 */
public class App {
    /**
     * Entry point.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        System.out.println("Hello World!");
    }
}
|
@ -0,0 +1,87 @@
|
||||
package org.apache.lucene.russian.morphology;
|
||||
|
||||
import org.apache.lucene.russian.morphology.dictonary.DirtonaryReader;
|
||||
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
|
||||
import org.apache.lucene.russian.morphology.dictonary.WordCard;
|
||||
import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
|
||||
import org.apache.lucene.russian.morphology.evristics.StatiticsCollectors;
|
||||
import org.apache.lucene.russian.morphology.evristics.SuffixCounter;
|
||||
import org.apache.lucene.russian.morphology.evristics.Evristic;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
|
||||
public class SuffixResearcher {
|
||||
public static void main(String[] args) throws IOException {
|
||||
IgnoredFormReader formReader = new IgnoredFormReader("igoredFrom.txt");
|
||||
Set<String> form = formReader.getIngnoredFroms();
|
||||
System.out.println(form);
|
||||
DirtonaryReader dirtonaryReader = new DirtonaryReader("morphs.mrd", form);
|
||||
StatiticsCollectors statiticsCollectors = new StatiticsCollectors();
|
||||
dirtonaryReader.proccess(statiticsCollectors);
|
||||
Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values();
|
||||
Object[] objects = counterCollection.toArray();
|
||||
Arrays.sort(objects);
|
||||
System.out.println("Length " + objects.length + " ingored words " + statiticsCollectors.getIgnoredCount());
|
||||
for(int i = 0; i < 10; i++){
|
||||
System.out.println(objects[i]);
|
||||
}
|
||||
|
||||
final Evristic evristic = new Evristic();
|
||||
for(int i = 0; i < objects.length; i++){
|
||||
evristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic());
|
||||
}
|
||||
final AtomicInteger good = new AtomicInteger(0);
|
||||
final AtomicInteger bad = new AtomicInteger(0);
|
||||
final FileWriter writer = new FileWriter("incorret.txt");
|
||||
dirtonaryReader.proccess(new WordProccessor(){
|
||||
public void proccess(WordCard wordCard) throws IOException {
|
||||
for(String wordForm:wordCard.getWordsFroms()){
|
||||
String cf = wordCard.getCanonicalFrom();
|
||||
if (evristic.getNormalForm(wordForm).equals(cf)){
|
||||
good.incrementAndGet();
|
||||
} else{
|
||||
writer.write(wordForm + " c " + cf + " f " + evristic.getNormalForm(wordForm) + "\n");
|
||||
bad.incrementAndGet();
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
writer.close();
|
||||
|
||||
System.out.println("Good " + good + " Bad " + bad);
|
||||
|
||||
evristic.writeToFile("evriticsb");
|
||||
|
||||
|
||||
// Map<String, Set<String>> perehod = new HashMap<String,Set<String>>();
|
||||
// for(SuffixCounter suffixCounter:statiticsCollectors.getStatititics().values()){
|
||||
// String sf = suffixCounter.getSuffixEvristic().getFormSuffix();
|
||||
// Set<String> stringSet = perehod.get(sf);
|
||||
// if (stringSet == null){
|
||||
// stringSet = new HashSet<String>();
|
||||
// perehod.put(sf,stringSet);
|
||||
// }
|
||||
// stringSet.add(suffixCounter.getSuffixEvristic().getNormalSuffix());
|
||||
// //suffix.add(suffixCounter.getSuffixEvristic().getFormSuffix());
|
||||
// //System.out.println(suffixCounter.);
|
||||
// }
|
||||
// System.out.println("Diffirent suffix " + perehod.size());
|
||||
// int c = 0;
|
||||
// int max_size = 0;
|
||||
// int[] size_dist = new int[20];
|
||||
// for(int j = 0; j < size_dist.length; j++) size_dist[j] = 0;
|
||||
// for(Set<String> set:perehod.values()){
|
||||
// size_dist[set.size()] ++;
|
||||
// if (set.size() > 1){
|
||||
// c++;
|
||||
// //System.out.println(set);
|
||||
// }
|
||||
// if(set.size() > max_size) max_size = set.size();
|
||||
// }
|
||||
// System.out.println("max size of diffirent suffix " + max_size + " " + c);
|
||||
// for(int j = 0; j < size_dist.length; j++) System.out.println("" + j + " " + size_dist[j]);
|
||||
}
|
||||
}
|
@ -0,0 +1,30 @@
|
||||
package org.apache.lucene.russian.morphology.analayzer;
|
||||
|
||||
import org.apache.lucene.russian.morphology.evristics.ArrayEvristics;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
|
||||
/**
 * Lucene token filter that replaces each term with its canonical (normal)
 * form, as produced by a precomputed suffix table ({@link ArrayEvristics}).
 *
 * NOTE(review): next() rewrites the supplied reusable token in place and does
 * not pull from the wrapped input stream — confirm this matches the intended
 * Lucene (2.x-era) TokenFilter contract.
 */
public class RussianMorphlogyFilter extends TokenFilter {
    // Suffix-replacement table used to derive the canonical form of a term.
    private ArrayEvristics arrayEvristics;

    protected RussianMorphlogyFilter(TokenStream tokenStream, ArrayEvristics arrayEvristics) {
        super(tokenStream);
        this.arrayEvristics = arrayEvristics;
    }

    /**
     * Returns the reusable token with its term replaced by the canonical form.
     */
    public Token next(final Token reusableToken) throws IOException {
        assert reusableToken != null;
        return createToken(arrayEvristics.getCanonicalForm(reusableToken.term()), reusableToken, reusableToken);
    }

    /**
     * Rewrites {@code reusableToken} to carry {@code synonym} as its term.
     */
    protected Token createToken(String synonym, Token current, final Token reusableToken) {
        reusableToken.reinit(current, synonym);
        reusableToken.setTermBuffer(synonym);
        // Position increment 0: the rewritten token occupies the same position.
        reusableToken.setPositionIncrement(0);
        return reusableToken;
    }
}
|
@ -0,0 +1,104 @@
|
||||
package org.apache.lucene.russian.morphology.dictonary;
|
||||
|
||||
import org.apache.lucene.russian.morphology.dictonary.FlexiaModel;
|
||||
import com.frielp.morph.automate.WordImpl;
|
||||
import org.apache.lucene.russian.morphology.evristics.RussianSuffixDecoderEncoder;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
|
||||
public class DirtonaryReader {
|
||||
private String fileName;
|
||||
private String fileEncoding = "windows-1251";
|
||||
private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
|
||||
private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
|
||||
private Set<String> ingnoredForm = new HashSet<String>();
|
||||
|
||||
public DirtonaryReader(String fileName, Set<String> ingnoredForm) {
|
||||
this.fileName = fileName;
|
||||
this.ingnoredForm = ingnoredForm;
|
||||
}
|
||||
|
||||
public DirtonaryReader(String fileName, String fileEncoding, Set<String> ingnoredForm) {
|
||||
this.fileName = fileName;
|
||||
this.fileEncoding = fileEncoding;
|
||||
this.ingnoredForm = ingnoredForm;
|
||||
}
|
||||
|
||||
|
||||
public void proccess(WordProccessor wordProccessor) throws IOException {
|
||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), fileEncoding));
|
||||
readFlexias(bufferedReader);
|
||||
sckipBlock(bufferedReader);
|
||||
sckipBlock(bufferedReader);
|
||||
readPrefix(bufferedReader);
|
||||
readWords(bufferedReader,wordProccessor);
|
||||
}
|
||||
|
||||
|
||||
private void readWords(BufferedReader reader,WordProccessor wordProccessor) throws IOException {
|
||||
String s = reader.readLine();
|
||||
int count = Integer.valueOf(s);
|
||||
for (int i = 0; i < count; i++) {
|
||||
s = reader.readLine();
|
||||
if (i % 10000 == 0) System.out.println("Proccess " + i + " word of " + count);
|
||||
|
||||
String[] wd = s.split(" ");
|
||||
String word = wd[0].toLowerCase();
|
||||
if (word.startsWith("-")) continue;
|
||||
word = "#".equals(word) ? "" : word;
|
||||
List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1]));
|
||||
if (models.size() > 0 && !ingnoredForm.contains(models.get(0).getCode())) {
|
||||
WordCard card = new WordCard(cleanString(models.get(0).create(word)));
|
||||
for (FlexiaModel fm : models) {
|
||||
card.addFrom(cleanString(fm.create(word)));
|
||||
}
|
||||
wordProccessor.proccess(card);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private String cleanString(String s){
|
||||
return s.replace((char)(34 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET),(char)(6 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET));
|
||||
}
|
||||
|
||||
private void sckipBlock(BufferedReader reader) throws IOException {
|
||||
String s = reader.readLine();
|
||||
int count = Integer.valueOf(s);
|
||||
for (int i = 0; i < count; i++) {
|
||||
s = reader.readLine();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void readPrefix(BufferedReader reader) throws IOException {
|
||||
String s = reader.readLine();
|
||||
int count = Integer.valueOf(s);
|
||||
for (int i = 0; i < count; i++) {
|
||||
s = reader.readLine();
|
||||
wordPrefixes.add(Arrays.asList(s.toLowerCase().split(",")));
|
||||
}
|
||||
}
|
||||
|
||||
private void readFlexias(BufferedReader reader) throws IOException {
|
||||
String s = reader.readLine();
|
||||
int count = Integer.valueOf(s);
|
||||
for (int i = 0; i < count; i++) {
|
||||
s = reader.readLine();
|
||||
ArrayList<FlexiaModel> flexiaModelArrayList = new ArrayList<FlexiaModel>();
|
||||
wordsFlexias.add(flexiaModelArrayList);
|
||||
for (String line : s.split("%")) {
|
||||
addFlexia(flexiaModelArrayList, line);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
|
||||
String[] fl = line.split("\\*");
|
||||
// if (fl.length == 3)
|
||||
// flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
|
||||
if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,47 @@
|
||||
package org.apache.lucene.russian.morphology.dictonary;
|
||||
|
||||
|
||||
/**
 * One inflection rule: applying it to a stem yields
 * {@code prefix + stem + suffix}. The code identifies the flexia record
 * (presumably a grammatical tag — confirm against the dictionary format).
 */
public class FlexiaModel {
    private String code;
    private String suffix;
    private String prefix;

    public FlexiaModel(String code, String suffix, String prefix) {
        this.code = code;
        this.suffix = suffix;
        this.prefix = prefix;
    }

    /** Builds the full word form for the given stem. */
    public String create(String s) {
        return prefix + s + suffix;
    }

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    public String getSuffix() {
        return suffix;
    }

    public void setSuffix(String suffix) {
        this.suffix = suffix;
    }

    public String getPrefix() {
        return prefix;
    }

    public void setPrefix(String prefix) {
        this.prefix = prefix;
    }

    @Override
    public String toString() {
        return prefix + " " + suffix;
    }
}
|
@ -0,0 +1,38 @@
|
||||
package org.apache.lucene.russian.morphology.dictonary;
|
||||
|
||||
import java.util.Set;
|
||||
import java.util.HashSet;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
|
||||
/**
 * Reads the list of ignored form codes from a text file. Each non-comment
 * line contributes its first whitespace-separated token; lines starting with
 * "//" are skipped.
 */
public class IgnoredFormReader {
    private String fileName;
    // Default matches the dictionary dumps this project consumes.
    private String fileEncoding = "windows-1251";

    public IgnoredFormReader(String fileName) {
        this.fileName = fileName;
    }

    public IgnoredFormReader(String fileName, String fileEncoding) {
        this.fileName = fileName;
        this.fileEncoding = fileEncoding;
    }

    /**
     * Parses the file and returns the set of ignored form codes. The
     * underlying reader is always closed (the old code leaked it).
     *
     * @throws IOException if the file cannot be read
     */
    public Set<String> getIngnoredFroms() throws IOException {
        BufferedReader bufferedReader = new BufferedReader(
                new InputStreamReader(
                        new FileInputStream(fileName), fileEncoding));
        try {
            HashSet<String> result = new HashSet<String>();
            String s = bufferedReader.readLine();
            while (s != null) {
                if (!s.startsWith("//")) {
                    result.add(s.trim().split(" ")[0]);
                }
                s = bufferedReader.readLine();
            }
            return result;
        } finally {
            bufferedReader.close();
        }
    }
}
|
@ -0,0 +1,26 @@
|
||||
package org.apache.lucene.russian.morphology.dictonary;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
|
||||
/**
 * A dictionary word together with all of its inflected forms.
 */
public class WordCard {
    private String canonicalFrom;
    private List<String> wordsFroms = new ArrayList<String>();

    public WordCard(String canonicalFrom) {
        this.canonicalFrom = canonicalFrom;
    }

    /** Registers one more inflected form of this word. */
    public void addFrom(String word) {
        wordsFroms.add(word);
    }

    /** @return the canonical (dictionary) form */
    public String getCanonicalFrom() {
        return canonicalFrom;
    }

    /** @return all registered forms, in insertion order */
    public List<String> getWordsFroms() {
        return wordsFroms;
    }
}
|
@ -0,0 +1,9 @@
|
||||
package org.apache.lucene.russian.morphology.dictonary;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
|
||||
/**
 * Callback invoked by {@link DirtonaryReader} for every word read from the
 * dictionary.
 */
public interface WordProccessor {

    /**
     * Handles one word and all of its forms.
     *
     * @throws IOException if the implementation performs I/O and fails
     */
    public void proccess(WordCard wordCard) throws IOException;
}
|
@ -0,0 +1,37 @@
|
||||
package org.apache.lucene.russian.morphology.evristics;
|
||||
|
||||
import java.io.FileReader;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
|
||||
public class ArrayEvristics {
|
||||
private long[] keys;
|
||||
private long[] values;
|
||||
|
||||
public void readFromFile(String fileName) throws IOException {
|
||||
BufferedReader reader = new BufferedReader(new FileReader(fileName));
|
||||
int size = Integer.valueOf(reader.readLine());
|
||||
keys = new long[size];
|
||||
values = new long[size];
|
||||
for (int i = 0; i < size; i++) {
|
||||
String[] s = reader.readLine().split(" ");
|
||||
keys[i] = Long.valueOf(s[0]);
|
||||
values[i] = Long.valueOf(s[1]);
|
||||
}
|
||||
}
|
||||
|
||||
public String getCanonicalForm(String form) {
|
||||
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
|
||||
Long suffix = RussianSuffixDecoderEncoder.encodeLong(form.substring(startSymbol));
|
||||
|
||||
int index = Arrays.binarySearch(keys,suffix);
|
||||
if(index == -1){
|
||||
return form;
|
||||
}else{
|
||||
String nSuffix = RussianSuffixDecoderEncoder.decodeLong(values[index]);
|
||||
return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,54 @@
|
||||
package org.apache.lucene.russian.morphology.evristics;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
|
||||
public class Evristic {
|
||||
private TreeMap<Long, Long> encodedSuffixesPairs = new TreeMap<Long, Long>();
|
||||
|
||||
public void addEvristic(SuffixEvristic suffixEvristic) {
|
||||
Long suffix = RussianSuffixDecoderEncoder.encodeLong(suffixEvristic.getFormSuffix());
|
||||
Long longs = encodedSuffixesPairs.get(suffix);
|
||||
if (longs == null) {
|
||||
encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encodeLong(suffixEvristic.getNormalSuffix()));
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
public String getNormalForm(String form) {
|
||||
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
|
||||
Long suffix = RussianSuffixDecoderEncoder.encodeLong(form.substring(startSymbol));
|
||||
|
||||
Long normalSuffix = encodedSuffixesPairs.get(suffix);
|
||||
if (normalSuffix != null) {
|
||||
String nSuffix = RussianSuffixDecoderEncoder.decodeLong(normalSuffix);
|
||||
return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
|
||||
|
||||
}
|
||||
return form;
|
||||
}
|
||||
|
||||
public void readFromFile(String file) throws IOException {
|
||||
BufferedReader reader = new BufferedReader(new FileReader(file));
|
||||
String s = reader.readLine();
|
||||
while (s != null) {
|
||||
String[] sfns = s.split(" ");
|
||||
if(sfns.length == 2){
|
||||
encodedSuffixesPairs.put(Long.valueOf(sfns[0]), Long.valueOf(sfns[0]));
|
||||
}
|
||||
s = reader.readLine();
|
||||
}
|
||||
reader.close();
|
||||
}
|
||||
|
||||
public void writeToFile(String file) throws IOException {
|
||||
FileWriter writer = new FileWriter(file);
|
||||
writer.write(encodedSuffixesPairs.size()+"\n");
|
||||
for(Long k:encodedSuffixesPairs.keySet()){
|
||||
writer.write("" + k + " " + encodedSuffixesPairs.get(k) + "\n");
|
||||
}
|
||||
writer.close();
|
||||
}
|
||||
}
|
@ -0,0 +1,6 @@
|
||||
package org.apache.lucene.russian.morphology.evristics;
|
||||
|
||||
|
||||
/**
 * Placeholder for future lemma-frequency statistics; currently empty.
 */
public class LemmasFreq {

}
|
@ -0,0 +1,60 @@
|
||||
package org.apache.lucene.russian.morphology.evristics;
|
||||
|
||||
|
||||
/**
 * Encodes short Russian suffixes as base-35 integers and decodes them back.
 * Each character becomes a digit via its offset from
 * {@code RUSSIAN_SMALL_LETTER_OFFSET} (1071, the code point just below
 * Cyrillic 'а'): characters below the offset (e.g. '-') become digit 33,
 * and 'ё' (digit 34) is folded onto 'е' (digit 6).
 */
public class RussianSuffixDecoderEncoder {
    public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
    /** Maximum suffix length considered by the heuristics. */
    public static final int SUFFIX_LENGTH = 7;

    /** Encodes a suffix of at most 6 characters into an int. */
    static public Integer encode(String string) {
        if (string.length() > 6) throw new RuntimeException("suffix to long");
        int result = 0;
        for (int i = 0; i < string.length(); i++) {
            int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET;
            if (c < 0) {
                c = 33; // character below the Cyrillic range (e.g. '-')
            }
            if (c == 34) c = 6; // fold 'ё' onto 'е'
            result = result * 35 + c;
        }
        return result;
    }

    /**
     * Inverse of {@link #encode}.
     * NOTE(review): unlike decodeLong, this variant does not restore digit 33
     * as '-'; confirm whether that asymmetry is intended.
     */
    static public String decode(Integer suffixN) {
        String result = "";
        // BUG FIX: ">= 35" (was "> 35"). When the remaining value is an exact
        // multiple of 35 the old condition stopped early and emitted a single
        // out-of-alphabet char instead of the two real digits.
        while (suffixN >= 35) {
            result = (char) (suffixN % 35 + RUSSIAN_SMALL_LETTER_OFFSET) + result;
            suffixN /= 35;
        }
        result = (char) (suffixN + RUSSIAN_SMALL_LETTER_OFFSET) + result;
        return result;
    }

    /** Encodes a suffix of at most 12 characters into a long. */
    static public Long encodeLong(String string) {
        if (string.length() > 12) throw new RuntimeException("suffix to long");
        long result = 0L;
        for (int i = 0; i < string.length(); i++) {
            int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET;
            if (c < 0) {
                c = 33; // character below the Cyrillic range (e.g. '-')
            }
            if (c == 34) c = 6; // fold 'ё' onto 'е'
            result = result * 35L + c;
        }
        return result;
    }

    /** Inverse of {@link #encodeLong}; digit 33 is restored as '-'. */
    static public String decodeLong(Long suffixN) {
        String result = "";
        // BUG FIX: ">= 35" (was "> 35"), see decode() above.
        while (suffixN >= 35) {
            long c = suffixN % 35 + RUSSIAN_SMALL_LETTER_OFFSET;
            if (c == 33 + RUSSIAN_SMALL_LETTER_OFFSET) c = 45; // '-'
            result = (char) c + result;
            suffixN /= 35;
        }
        long c = suffixN + RUSSIAN_SMALL_LETTER_OFFSET;
        if (c == 33 + RUSSIAN_SMALL_LETTER_OFFSET) c = 45; // '-'
        result = (char) c + result;
        return result;
    }
}
|
@ -0,0 +1,51 @@
|
||||
package org.apache.lucene.russian.morphology.evristics;
|
||||
|
||||
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
|
||||
import org.apache.lucene.russian.morphology.dictonary.WordCard;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
|
||||
|
||||
public class StatiticsCollectors implements WordProccessor{
|
||||
Map<SuffixEvristic,SuffixCounter> statititics = new HashMap<SuffixEvristic,SuffixCounter>();
|
||||
|
||||
private Integer ignoredCount = 0;
|
||||
|
||||
public void proccess(WordCard wordCard) {
|
||||
for(String form:wordCard.getWordsFroms()){
|
||||
SuffixEvristic suffixEvristic = createEvristic(wordCard.getCanonicalFrom(), form);
|
||||
if (suffixEvristic == null) continue;
|
||||
SuffixCounter suffixCounter = statititics.get(suffixEvristic);
|
||||
if(suffixCounter == null){
|
||||
suffixCounter = new SuffixCounter(suffixEvristic);
|
||||
statititics.put(suffixEvristic,suffixCounter);
|
||||
}
|
||||
suffixCounter.incrementAmount();
|
||||
}
|
||||
}
|
||||
|
||||
public Map<SuffixEvristic, SuffixCounter> getStatititics() {
|
||||
return statititics;
|
||||
}
|
||||
|
||||
private SuffixEvristic createEvristic(String word,String form){
|
||||
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
|
||||
String formSuffix = form.substring(startSymbol);
|
||||
if(word.length() < startSymbol){
|
||||
ignoredCount++;
|
||||
return null;
|
||||
}
|
||||
String wordSuffix = word.length() > startSymbol ? word.substring(startSymbol) : "";
|
||||
if (wordSuffix.length() > 12){
|
||||
System.out.println(word + " " + form);
|
||||
return null;
|
||||
}
|
||||
return new SuffixEvristic(formSuffix,wordSuffix);
|
||||
}
|
||||
|
||||
|
||||
public Integer getIgnoredCount() {
|
||||
return ignoredCount;
|
||||
}
|
||||
}
|
@ -0,0 +1,41 @@
|
||||
package org.apache.lucene.russian.morphology.evristics;
|
||||
|
||||
|
||||
public class SuffixCounter implements Comparable{
|
||||
private SuffixEvristic suffixEvristic;
|
||||
private Double amnout = 0.0;
|
||||
|
||||
public SuffixCounter(SuffixEvristic suffixEvristic) {
|
||||
this.suffixEvristic = suffixEvristic;
|
||||
}
|
||||
|
||||
public void incrementAmount(){
|
||||
amnout++;
|
||||
}
|
||||
|
||||
public SuffixEvristic getSuffixEvristic() {
|
||||
return suffixEvristic;
|
||||
}
|
||||
|
||||
public void setSuffixEvristic(SuffixEvristic suffixEvristic) {
|
||||
this.suffixEvristic = suffixEvristic;
|
||||
}
|
||||
|
||||
public Double getAmnout() {
|
||||
return amnout;
|
||||
}
|
||||
|
||||
public void setAmnout(Double amnout) {
|
||||
this.amnout = amnout;
|
||||
}
|
||||
|
||||
public int compareTo(Object o) {
|
||||
if(o instanceof SuffixCounter) return (int) Math.round(Math.signum(((SuffixCounter)o).amnout - amnout));
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return ""+amnout + " " + suffixEvristic.toString();
|
||||
}
|
||||
}
|
@ -0,0 +1,56 @@
|
||||
package org.apache.lucene.russian.morphology.evristics;
|
||||
|
||||
|
||||
public class SuffixEvristic {
|
||||
private String formSuffix;
|
||||
private String normalSuffix;
|
||||
|
||||
public SuffixEvristic(String formSuffix, String normalSuffix) {
|
||||
this.formSuffix = formSuffix;
|
||||
this.normalSuffix = normalSuffix;
|
||||
}
|
||||
|
||||
public String getFormSuffix() {
|
||||
return formSuffix;
|
||||
}
|
||||
|
||||
public void setFormSuffix(String formSuffix) {
|
||||
this.formSuffix = formSuffix;
|
||||
}
|
||||
|
||||
public String getNormalSuffix() {
|
||||
return normalSuffix;
|
||||
}
|
||||
|
||||
public void setNormalSuffix(String normalSuffix) {
|
||||
this.normalSuffix = normalSuffix;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
|
||||
SuffixEvristic that = (SuffixEvristic) o;
|
||||
|
||||
if (!formSuffix.equals(that.formSuffix)) return false;
|
||||
if (!normalSuffix.equals(that.normalSuffix)) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int result = formSuffix.hashCode();
|
||||
result = 31 * result + normalSuffix.hashCode();
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "SuffixEvristic{" +
|
||||
"formSuffix='" + formSuffix + '\'' +
|
||||
", normalSuffix='" + normalSuffix + '\'' +
|
||||
'}';
|
||||
}
|
||||
}
|
38
src/test/java/org/apache/lucene/AppTest.java
Normal file
38
src/test/java/org/apache/lucene/AppTest.java
Normal file
@ -0,0 +1,38 @@
|
||||
package org.apache.lucene;
|
||||
|
||||
import junit.framework.Test;
|
||||
import junit.framework.TestCase;
|
||||
import junit.framework.TestSuite;
|
||||
|
||||
/**
|
||||
* Unit test for simple App.
|
||||
*/
|
||||
public class AppTest
|
||||
extends TestCase
|
||||
{
|
||||
/**
|
||||
* Create the test case
|
||||
*
|
||||
* @param testName name of the test case
|
||||
*/
|
||||
public AppTest( String testName )
|
||||
{
|
||||
super( testName );
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the suite of tests being tested
|
||||
*/
|
||||
public static Test suite()
|
||||
{
|
||||
return new TestSuite( AppTest.class );
|
||||
}
|
||||
|
||||
/**
|
||||
* Rigourous Test :-)
|
||||
*/
|
||||
public void testApp()
|
||||
{
|
||||
assertTrue( true );
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user