git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@3 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
parent
63705d7e3b
commit
b334960f5d
14
pom.xml
14
pom.xml
@ -28,4 +28,18 @@
|
|||||||
<version>2.4.1</version>
|
<version>2.4.1</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-compiler-plugin</artifactId>
|
||||||
|
<configuration>
|
||||||
|
<source>1.5</source>
|
||||||
|
<target>1.5</target>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
|
||||||
</project>
|
</project>
|
||||||
|
@ -0,0 +1,45 @@
|
|||||||
|
package org.apache.lucene.russian.morphology;
|
||||||
|
|
||||||
|
/**
 * Helper class that encodes the suffix of a Russian word into a
 * {@code long} value and decodes such a value back into a string.
 * <p>
 * The suffix is assumed to contain only lowercase Russian letters and the
 * dash. The letters IE (U+0435) and IO (U+0451) are treated as the same
 * letter: both are encoded with the code of IE.
 * <p>
 * Every character becomes one base-35 digit. Lowercase letters map to the
 * digits 1..32, the dash maps to 33, so a valid suffix never produces the
 * digit 0 and the empty suffix is the only one encoded as 0.
 */
public class RussianSuffixDecoderEncoder {
    /** Code point directly before the first lowercase Russian letter (U+0430). */
    public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
    /** Number of trailing characters callers cut from a word form before encoding. */
    public static final int SUFFIX_LENGTH = 7;
    /** Digit the letter IO (U+0451) would get; it is folded into {@link #E_CHAR}. */
    private static final int EE_CHAR = 34;
    /** Digit of the letter IE (U+0435). */
    private static final int E_CHAR = 6;
    /** Character code of the dash produced by {@link #decode(Long)}. */
    private static final int DASH_CHAR = 45;
    /** Digit used for the dash. */
    private static final int DASH_CODE = 33;
    /** Radix of the positional encoding. */
    private static final long BASE = 35L;
    /** Longest suffix accepted by {@link #encode(String)}; 35^12 still fits into a long. */
    private static final int MAX_ENCODED_LENGTH = 12;

    /**
     * Encodes a suffix as a base-35 number, first character most significant.
     * <p>
     * Every character whose code is below the first lowercase Russian letter
     * (not only the dash itself) is encoded as the dash.
     *
     * @param string suffix to encode, at most 12 characters long
     * @return the encoded value; 0 for the empty string
     * @throws RuntimeException if the suffix is longer than 12 characters
     */
    static public Long encode(String string) {
        if (string.length() > MAX_ENCODED_LENGTH) throw new RuntimeException("suffix to long");
        long result = 0L;
        for (int i = 0; i < string.length(); i++) {
            int c = string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET;
            if (c < 0) {
                c = DASH_CODE;
            }
            if (c == EE_CHAR) c = E_CHAR;
            result = result * BASE + c;
        }
        return result;
    }

    /**
     * Decodes a value produced by {@link #encode(String)} back into a suffix.
     * <p>
     * Digits are consumed until the value is exhausted, so 0 decodes to the
     * empty string and the round trip of an empty suffix is lossless.
     *
     * @param suffixN encoded suffix
     * @return the decoded suffix; the empty string when {@code suffixN} is
     *         not positive
     */
    static public String decode(Long suffixN) {
        // Unbox once instead of on every loop iteration.
        long rest = suffixN;
        StringBuilder result = new StringBuilder();
        while (rest > 0) {
            long c = rest % BASE + RUSSIAN_SMALL_LETTER_OFFSET;
            if (c == DASH_CODE + RUSSIAN_SMALL_LETTER_OFFSET) c = DASH_CHAR;
            // The least significant digit is the last character, so prepend.
            result.insert(0, (char) c);
            rest /= BASE;
        }
        return result.toString();
    }
}
|
@ -1,6 +1,6 @@
|
|||||||
package org.apache.lucene.russian.morphology;
|
package org.apache.lucene.russian.morphology;
|
||||||
|
|
||||||
import org.apache.lucene.russian.morphology.dictonary.DirtonaryReader;
|
import org.apache.lucene.russian.morphology.dictonary.DictonaryReader;
|
||||||
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
|
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
|
||||||
import org.apache.lucene.russian.morphology.dictonary.WordCard;
|
import org.apache.lucene.russian.morphology.dictonary.WordCard;
|
||||||
import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
|
import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
|
||||||
@ -17,10 +17,10 @@ public class SuffixResearcher {
|
|||||||
public static void main(String[] args) throws IOException {
|
public static void main(String[] args) throws IOException {
|
||||||
IgnoredFormReader formReader = new IgnoredFormReader("igoredFrom.txt");
|
IgnoredFormReader formReader = new IgnoredFormReader("igoredFrom.txt");
|
||||||
Set<String> form = formReader.getIngnoredFroms();
|
Set<String> form = formReader.getIngnoredFroms();
|
||||||
System.out.println(form);
|
|
||||||
DirtonaryReader dirtonaryReader = new DirtonaryReader("morphs.mrd", form);
|
DictonaryReader dictonaryReader = new DictonaryReader("morphs.mrd", form);
|
||||||
StatiticsCollectors statiticsCollectors = new StatiticsCollectors();
|
StatiticsCollectors statiticsCollectors = new StatiticsCollectors();
|
||||||
dirtonaryReader.proccess(statiticsCollectors);
|
dictonaryReader.proccess(statiticsCollectors);
|
||||||
Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values();
|
Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values();
|
||||||
Object[] objects = counterCollection.toArray();
|
Object[] objects = counterCollection.toArray();
|
||||||
Arrays.sort(objects);
|
Arrays.sort(objects);
|
||||||
@ -33,10 +33,11 @@ public class SuffixResearcher {
|
|||||||
for(int i = 0; i < objects.length; i++){
|
for(int i = 0; i < objects.length; i++){
|
||||||
evristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic());
|
evristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic());
|
||||||
}
|
}
|
||||||
|
|
||||||
final AtomicInteger good = new AtomicInteger(0);
|
final AtomicInteger good = new AtomicInteger(0);
|
||||||
final AtomicInteger bad = new AtomicInteger(0);
|
final AtomicInteger bad = new AtomicInteger(0);
|
||||||
final FileWriter writer = new FileWriter("incorret.txt");
|
final FileWriter writer = new FileWriter("incorret.txt");
|
||||||
dirtonaryReader.proccess(new WordProccessor(){
|
dictonaryReader.proccess(new WordProccessor(){
|
||||||
public void proccess(WordCard wordCard) throws IOException {
|
public void proccess(WordCard wordCard) throws IOException {
|
||||||
for(String wordForm:wordCard.getWordsFroms()){
|
for(String wordForm:wordCard.getWordsFroms()){
|
||||||
String cf = wordCard.getCanonicalFrom();
|
String cf = wordCard.getCanonicalFrom();
|
||||||
@ -54,34 +55,5 @@ public class SuffixResearcher {
|
|||||||
System.out.println("Good " + good + " Bad " + bad);
|
System.out.println("Good " + good + " Bad " + bad);
|
||||||
|
|
||||||
evristic.writeToFile("evriticsb");
|
evristic.writeToFile("evriticsb");
|
||||||
|
|
||||||
|
|
||||||
// Map<String, Set<String>> perehod = new HashMap<String,Set<String>>();
|
|
||||||
// for(SuffixCounter suffixCounter:statiticsCollectors.getStatititics().values()){
|
|
||||||
// String sf = suffixCounter.getSuffixEvristic().getFormSuffix();
|
|
||||||
// Set<String> stringSet = perehod.get(sf);
|
|
||||||
// if (stringSet == null){
|
|
||||||
// stringSet = new HashSet<String>();
|
|
||||||
// perehod.put(sf,stringSet);
|
|
||||||
// }
|
|
||||||
// stringSet.add(suffixCounter.getSuffixEvristic().getNormalSuffix());
|
|
||||||
// //suffix.add(suffixCounter.getSuffixEvristic().getFormSuffix());
|
|
||||||
// //System.out.println(suffixCounter.);
|
|
||||||
// }
|
|
||||||
// System.out.println("Diffirent suffix " + perehod.size());
|
|
||||||
// int c = 0;
|
|
||||||
// int max_size = 0;
|
|
||||||
// int[] size_dist = new int[20];
|
|
||||||
// for(int j = 0; j < size_dist.length; j++) size_dist[j] = 0;
|
|
||||||
// for(Set<String> set:perehod.values()){
|
|
||||||
// size_dist[set.size()] ++;
|
|
||||||
// if (set.size() > 1){
|
|
||||||
// c++;
|
|
||||||
// //System.out.println(set);
|
|
||||||
// }
|
|
||||||
// if(set.size() > max_size) max_size = set.size();
|
|
||||||
// }
|
|
||||||
// System.out.println("max size of diffirent suffix " + max_size + " " + c);
|
|
||||||
// for(int j = 0; j < size_dist.length; j++) System.out.println("" + j + " " + size_dist[j]);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,4 +1,6 @@
|
|||||||
package org.apache.lucene.russian.morphology.evristics;
|
package org.apache.lucene.russian.morphology.analayzer;
|
||||||
|
|
||||||
|
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
||||||
|
|
||||||
import java.io.FileReader;
|
import java.io.FileReader;
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
@ -24,13 +26,13 @@ public class ArrayEvristics {
|
|||||||
|
|
||||||
public String getCanonicalForm(String form) {
|
public String getCanonicalForm(String form) {
|
||||||
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
|
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
|
||||||
Long suffix = RussianSuffixDecoderEncoder.encodeLong(form.substring(startSymbol));
|
Long suffix = RussianSuffixDecoderEncoder.encode(form.substring(startSymbol));
|
||||||
|
|
||||||
int index = Arrays.binarySearch(keys,suffix);
|
int index = Arrays.binarySearch(keys,suffix);
|
||||||
if(index == -1){
|
if(index == -1){
|
||||||
return form;
|
return form;
|
||||||
}else{
|
}else{
|
||||||
String nSuffix = RussianSuffixDecoderEncoder.decodeLong(values[index]);
|
String nSuffix = RussianSuffixDecoderEncoder.decode(values[index]);
|
||||||
return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
|
return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -1,6 +1,5 @@
|
|||||||
package org.apache.lucene.russian.morphology.analayzer;
|
package org.apache.lucene.russian.morphology.analayzer;
|
||||||
|
|
||||||
import org.apache.lucene.russian.morphology.evristics.ArrayEvristics;
|
|
||||||
import org.apache.lucene.analysis.Token;
|
import org.apache.lucene.analysis.Token;
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
@ -1,26 +1,28 @@
|
|||||||
package org.apache.lucene.russian.morphology.dictonary;
|
package org.apache.lucene.russian.morphology.dictonary;
|
||||||
|
|
||||||
import org.apache.lucene.russian.morphology.dictonary.FlexiaModel;
|
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
||||||
import com.frielp.morph.automate.WordImpl;
|
|
||||||
import org.apache.lucene.russian.morphology.evristics.RussianSuffixDecoderEncoder;
|
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
|
|
||||||
|
|
||||||
public class DirtonaryReader {
|
/**
|
||||||
|
* This class contains the logic for reading the
|
||||||
|
* dictionary and producing each word with all of its forms.
|
||||||
|
*/
|
||||||
|
public class DictonaryReader {
|
||||||
private String fileName;
|
private String fileName;
|
||||||
private String fileEncoding = "windows-1251";
|
private String fileEncoding = "windows-1251";
|
||||||
private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
|
private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
|
||||||
private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
|
private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
|
||||||
private Set<String> ingnoredForm = new HashSet<String>();
|
private Set<String> ingnoredForm = new HashSet<String>();
|
||||||
|
|
||||||
public DirtonaryReader(String fileName, Set<String> ingnoredForm) {
|
public DictonaryReader(String fileName, Set<String> ingnoredForm) {
|
||||||
this.fileName = fileName;
|
this.fileName = fileName;
|
||||||
this.ingnoredForm = ingnoredForm;
|
this.ingnoredForm = ingnoredForm;
|
||||||
}
|
}
|
||||||
|
|
||||||
public DirtonaryReader(String fileName, String fileEncoding, Set<String> ingnoredForm) {
|
public DictonaryReader(String fileName, String fileEncoding, Set<String> ingnoredForm) {
|
||||||
this.fileName = fileName;
|
this.fileName = fileName;
|
||||||
this.fileEncoding = fileEncoding;
|
this.fileEncoding = fileEncoding;
|
||||||
this.ingnoredForm = ingnoredForm;
|
this.ingnoredForm = ingnoredForm;
|
||||||
@ -96,6 +98,7 @@ public class DirtonaryReader {
|
|||||||
|
|
||||||
private void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
|
private void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
|
||||||
String[] fl = line.split("\\*");
|
String[] fl = line.split("\\*");
|
||||||
|
// forms whose line splits into three fields are ignored (their handling is commented out below)
|
||||||
// if (fl.length == 3)
|
// if (fl.length == 3)
|
||||||
// flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
|
// flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
|
||||||
if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
|
if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
|
@ -1,6 +1,8 @@
|
|||||||
package org.apache.lucene.russian.morphology.dictonary;
|
package org.apache.lucene.russian.morphology.dictonary;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Represents information about how a word form is created from its immutable part.
|
||||||
|
*/
|
||||||
public class FlexiaModel {
|
public class FlexiaModel {
|
||||||
private String code;
|
private String code;
|
||||||
private String suffix;
|
private String suffix;
|
||||||
|
@ -3,16 +3,18 @@ package org.apache.lucene.russian.morphology.dictonary;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Represents a word and all of its forms.
|
||||||
|
*/
|
||||||
public class WordCard {
|
public class WordCard {
|
||||||
private String canonicalFrom;
|
private String canonicalFrom;
|
||||||
private List<String> wordsFroms = new ArrayList<String>();
|
private List<String> wordsFroms = new ArrayList<String>();
|
||||||
|
|
||||||
public WordCard(String canonicalFrom) {
|
protected WordCard(String canonicalFrom) {
|
||||||
this.canonicalFrom = canonicalFrom;
|
this.canonicalFrom = canonicalFrom;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addFrom(String word){
|
protected void addFrom(String word){
|
||||||
wordsFroms.add(word);
|
wordsFroms.add(word);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2,7 +2,10 @@ package org.apache.lucene.russian.morphology.dictonary;
|
|||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Interface that allows getting information from
|
||||||
|
* {@link org.apache.lucene.russian.morphology.dictonary.DictonaryReader}.
|
||||||
|
*/
|
||||||
public interface WordProccessor {
|
public interface WordProccessor {
|
||||||
|
|
||||||
public void proccess(WordCard wordCard) throws IOException;
|
public void proccess(WordCard wordCard) throws IOException;
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
package org.apache.lucene.russian.morphology.evristics;
|
package org.apache.lucene.russian.morphology.evristics;
|
||||||
|
|
||||||
|
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
|
|
||||||
@ -8,22 +10,20 @@ public class Evristic {
|
|||||||
private TreeMap<Long, Long> encodedSuffixesPairs = new TreeMap<Long, Long>();
|
private TreeMap<Long, Long> encodedSuffixesPairs = new TreeMap<Long, Long>();
|
||||||
|
|
||||||
public void addEvristic(SuffixEvristic suffixEvristic) {
|
public void addEvristic(SuffixEvristic suffixEvristic) {
|
||||||
Long suffix = RussianSuffixDecoderEncoder.encodeLong(suffixEvristic.getFormSuffix());
|
Long suffix = RussianSuffixDecoderEncoder.encode(suffixEvristic.getFormSuffix());
|
||||||
Long longs = encodedSuffixesPairs.get(suffix);
|
Long longs = encodedSuffixesPairs.get(suffix);
|
||||||
if (longs == null) {
|
if (longs == null) {
|
||||||
encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encodeLong(suffixEvristic.getNormalSuffix()));
|
encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encode(suffixEvristic.getNormalSuffix()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getNormalForm(String form) {
|
public String getNormalForm(String form) {
|
||||||
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
|
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
|
||||||
Long suffix = RussianSuffixDecoderEncoder.encodeLong(form.substring(startSymbol));
|
Long suffix = RussianSuffixDecoderEncoder.encode(form.substring(startSymbol));
|
||||||
|
|
||||||
Long normalSuffix = encodedSuffixesPairs.get(suffix);
|
Long normalSuffix = encodedSuffixesPairs.get(suffix);
|
||||||
if (normalSuffix != null) {
|
if (normalSuffix != null) {
|
||||||
String nSuffix = RussianSuffixDecoderEncoder.decodeLong(normalSuffix);
|
String nSuffix = RussianSuffixDecoderEncoder.decode(normalSuffix);
|
||||||
return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
|
return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1,6 +0,0 @@
|
|||||||
package org.apache.lucene.russian.morphology.evristics;
|
|
||||||
|
|
||||||
|
|
||||||
public class LemmasFreq {
|
|
||||||
|
|
||||||
}
|
|
@ -1,60 +0,0 @@
|
|||||||
package org.apache.lucene.russian.morphology.evristics;
|
|
||||||
|
|
||||||
|
|
||||||
public class RussianSuffixDecoderEncoder {
|
|
||||||
public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
|
|
||||||
public static final int SUFFIX_LENGTH = 7;
|
|
||||||
|
|
||||||
|
|
||||||
static public Integer encode(String string) {
|
|
||||||
if (string.length() > 6) throw new RuntimeException("suffix to long");
|
|
||||||
int result = 0;
|
|
||||||
for (int i = 0; i < string.length(); i++) {
|
|
||||||
int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET;
|
|
||||||
if (c < 0) {
|
|
||||||
c = 33;
|
|
||||||
}
|
|
||||||
if (c == 34) c = 6;
|
|
||||||
result = result * 35 + c;
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
static public String decode(Integer suffixN) {
|
|
||||||
String result = "";
|
|
||||||
while (suffixN > 35) {
|
|
||||||
result = (char) (suffixN % 35 + RUSSIAN_SMALL_LETTER_OFFSET) + result;
|
|
||||||
suffixN /= 35;
|
|
||||||
}
|
|
||||||
result = (char) (suffixN + RUSSIAN_SMALL_LETTER_OFFSET) + result;
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
static public Long encodeLong(String string) {
|
|
||||||
if (string.length() > 12) throw new RuntimeException("suffix to long");
|
|
||||||
long result = 0L;
|
|
||||||
for (int i = 0; i < string.length(); i++) {
|
|
||||||
int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET;
|
|
||||||
if (c < 0) {
|
|
||||||
c = 33;
|
|
||||||
}
|
|
||||||
if (c == 34) c = 6;
|
|
||||||
result = result * 35L + c;
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
static public String decodeLong(Long suffixN) {
|
|
||||||
String result = "";
|
|
||||||
while (suffixN > 35) {
|
|
||||||
long c = suffixN % 35 + RUSSIAN_SMALL_LETTER_OFFSET;
|
|
||||||
if (c == 33 + RUSSIAN_SMALL_LETTER_OFFSET) c = 45;
|
|
||||||
result = (char) c + result;
|
|
||||||
suffixN /= 35;
|
|
||||||
}
|
|
||||||
long c = suffixN + RUSSIAN_SMALL_LETTER_OFFSET;
|
|
||||||
if (c == 33 + RUSSIAN_SMALL_LETTER_OFFSET) c = 45;
|
|
||||||
result = (char) c + result;
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
}
|
|
@ -2,6 +2,7 @@ package org.apache.lucene.russian.morphology.evristics;
|
|||||||
|
|
||||||
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
|
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
|
||||||
import org.apache.lucene.russian.morphology.dictonary.WordCard;
|
import org.apache.lucene.russian.morphology.dictonary.WordCard;
|
||||||
|
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
@ -1,6 +1,9 @@
|
|||||||
package org.apache.lucene.russian.morphology.evristics;
|
package org.apache.lucene.russian.morphology.evristics;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Contains information about the frequency of a suffix heuristic
|
||||||
|
* in dictionary.
|
||||||
|
*/
|
||||||
public class SuffixCounter implements Comparable{
|
public class SuffixCounter implements Comparable{
|
||||||
private SuffixEvristic suffixEvristic;
|
private SuffixEvristic suffixEvristic;
|
||||||
private Double amnout = 0.0;
|
private Double amnout = 0.0;
|
||||||
|
@ -1,6 +1,11 @@
|
|||||||
package org.apache.lucene.russian.morphology.evristics;
|
package org.apache.lucene.russian.morphology.evristics;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Represents a heuristic that assumes that the
|
||||||
|
* canonical form of a word is defined by the word's suffix.
|
||||||
|
* It contains two suffixes taken from a given position: that of the
|
||||||
|
* canonical word form and that of the word form.
|
||||||
|
*/
|
||||||
public class SuffixEvristic {
|
public class SuffixEvristic {
|
||||||
private String formSuffix;
|
private String formSuffix;
|
||||||
private String normalSuffix;
|
private String normalSuffix;
|
||||||
|
@ -1,38 +0,0 @@
|
|||||||
package org.apache.lucene;
|
|
||||||
|
|
||||||
import junit.framework.Test;
|
|
||||||
import junit.framework.TestCase;
|
|
||||||
import junit.framework.TestSuite;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Unit test for simple App.
|
|
||||||
*/
|
|
||||||
public class AppTest
|
|
||||||
extends TestCase
|
|
||||||
{
|
|
||||||
/**
|
|
||||||
* Create the test case
|
|
||||||
*
|
|
||||||
* @param testName name of the test case
|
|
||||||
*/
|
|
||||||
public AppTest( String testName )
|
|
||||||
{
|
|
||||||
super( testName );
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @return the suite of tests being tested
|
|
||||||
*/
|
|
||||||
public static Test suite()
|
|
||||||
{
|
|
||||||
return new TestSuite( AppTest.class );
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Rigourous Test :-)
|
|
||||||
*/
|
|
||||||
public void testApp()
|
|
||||||
{
|
|
||||||
assertTrue( true );
|
|
||||||
}
|
|
||||||
}
|
|
@ -0,0 +1,5 @@
|
|||||||
|
package org.apache.lucene.russian.morphology;
|
||||||
|
|
||||||
|
|
||||||
|
/** Placeholder test class for RussianSuffixDecoderEncoder; it declares no test methods yet. */
public class RussianSuffixDecoderEncoderTest {
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user