git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@3 d817d54c-26ab-11de-abc9-2f7d1455ff7a

This commit is contained in:
alexander.a.kuznetsov 2009-04-11 20:33:25 +00:00
parent 63705d7e3b
commit b334960f5d
17 changed files with 113 additions and 161 deletions

14
pom.xml
View File

@ -28,4 +28,18 @@
<version>2.4.1</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.5</source>
<target>1.5</target>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,45 @@
package org.apache.lucene.russian.morphology;
/**
 * Helper class that encodes a suffix of a Russian word
 * into a long value and decodes it back.
 * Assumes the suffix contains only lowercase Russian letters and the dash.
 * Also assumes the letters е and ё coincide.
 */
/**
 * Utility class that encodes a suffix of a Russian word into a single
 * long value (base-35 packing) and decodes such a value back to a string.
 * Assumes the suffix contains only lowercase Russian letters and the dash;
 * the letters е and ё are treated as the same character.
 */
public class RussianSuffixDecoderEncoder {
    /** Unicode offset just below 'а' (U+0430), so 'а' maps to code 1. */
    public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
    /** Number of trailing characters treated as the suffix by callers. */
    public static final int SUFFIX_LENGTH = 7;
    /** Code of 'ё' after subtracting the offset. */
    private static final int EE_CHAR = 34;
    /** Code of 'е' after subtracting the offset. */
    private static final int E_CHAR = 6;
    /** The dash character '-'. */
    private static final int DASH_CHAR = 45;
    /** Code used to represent the dash inside the packed value. */
    private static final int DASH_CODE = 33;
    /** Radix of the packing; one more than the largest letter code. */
    private static final int RADIX = 35;
    /** Longest suffix that fits into a long without overflow (35^12 < 2^63). */
    private static final int MAX_ENCODED_LENGTH = 12;

    /**
     * Encodes the given suffix into a long.
     *
     * @param string suffix of lowercase Russian letters and dashes
     * @return packed base-35 representation of the suffix
     * @throws IllegalArgumentException if the suffix has more than 12 characters
     */
    static public Long encode(String string) {
        if (string.length() > MAX_ENCODED_LENGTH) {
            // Fixed message typo ("to long") and narrowed RuntimeException to
            // IllegalArgumentException (still unchecked, so callers are unaffected).
            throw new IllegalArgumentException("suffix too long: " + string);
        }
        long result = 0L;
        for (int i = 0; i < string.length(); i++) {
            int c = string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET;
            if (c < 0) {
                // Anything below the Russian lowercase range (i.e. the dash)
                // is stored under the reserved dash code.
                c = DASH_CODE;
            }
            if (c == EE_CHAR) c = E_CHAR; // fold 'ё' onto 'е'
            result = result * RADIX + c;
        }
        return result;
    }

    /**
     * Decodes a value produced by {@link #encode(String)} back into a string.
     *
     * @param suffixN packed suffix value
     * @return the decoded suffix ('ё' comes back as 'е')
     */
    static public String decode(Long suffixN) {
        StringBuilder result = new StringBuilder();
        long rest = suffixN;
        while (rest > RADIX) {
            result.insert(0, digitToChar(rest % RADIX));
            rest /= RADIX;
        }
        result.insert(0, digitToChar(rest));
        return result.toString();
    }

    /** Maps a single base-35 digit back to its character, restoring the dash. */
    private static char digitToChar(long digit) {
        long c = digit + RUSSIAN_SMALL_LETTER_OFFSET;
        if (c == DASH_CODE + RUSSIAN_SMALL_LETTER_OFFSET) c = DASH_CHAR;
        return (char) c;
    }
}

View File

@ -1,6 +1,6 @@
package org.apache.lucene.russian.morphology;
import org.apache.lucene.russian.morphology.dictonary.DirtonaryReader;
import org.apache.lucene.russian.morphology.dictonary.DictonaryReader;
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
import org.apache.lucene.russian.morphology.dictonary.WordCard;
import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
@ -17,10 +17,10 @@ public class SuffixResearcher {
public static void main(String[] args) throws IOException {
IgnoredFormReader formReader = new IgnoredFormReader("igoredFrom.txt");
Set<String> form = formReader.getIngnoredFroms();
System.out.println(form);
DirtonaryReader dirtonaryReader = new DirtonaryReader("morphs.mrd", form);
DictonaryReader dictonaryReader = new DictonaryReader("morphs.mrd", form);
StatiticsCollectors statiticsCollectors = new StatiticsCollectors();
dirtonaryReader.proccess(statiticsCollectors);
dictonaryReader.proccess(statiticsCollectors);
Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values();
Object[] objects = counterCollection.toArray();
Arrays.sort(objects);
@ -33,10 +33,11 @@ public class SuffixResearcher {
for(int i = 0; i < objects.length; i++){
evristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic());
}
final AtomicInteger good = new AtomicInteger(0);
final AtomicInteger bad = new AtomicInteger(0);
final FileWriter writer = new FileWriter("incorret.txt");
dirtonaryReader.proccess(new WordProccessor(){
dictonaryReader.proccess(new WordProccessor(){
public void proccess(WordCard wordCard) throws IOException {
for(String wordForm:wordCard.getWordsFroms()){
String cf = wordCard.getCanonicalFrom();
@ -54,34 +55,5 @@ public class SuffixResearcher {
System.out.println("Good " + good + " Bad " + bad);
evristic.writeToFile("evriticsb");
// Map<String, Set<String>> perehod = new HashMap<String,Set<String>>();
// for(SuffixCounter suffixCounter:statiticsCollectors.getStatititics().values()){
// String sf = suffixCounter.getSuffixEvristic().getFormSuffix();
// Set<String> stringSet = perehod.get(sf);
// if (stringSet == null){
// stringSet = new HashSet<String>();
// perehod.put(sf,stringSet);
// }
// stringSet.add(suffixCounter.getSuffixEvristic().getNormalSuffix());
// //suffix.add(suffixCounter.getSuffixEvristic().getFormSuffix());
// //System.out.println(suffixCounter.);
// }
// System.out.println("Diffirent suffix " + perehod.size());
// int c = 0;
// int max_size = 0;
// int[] size_dist = new int[20];
// for(int j = 0; j < size_dist.length; j++) size_dist[j] = 0;
// for(Set<String> set:perehod.values()){
// size_dist[set.size()] ++;
// if (set.size() > 1){
// c++;
// //System.out.println(set);
// }
// if(set.size() > max_size) max_size = set.size();
// }
// System.out.println("max size of diffirent suffix " + max_size + " " + c);
// for(int j = 0; j < size_dist.length; j++) System.out.println("" + j + " " + size_dist[j]);
}
}

View File

@ -1,4 +1,6 @@
package org.apache.lucene.russian.morphology.evristics;
package org.apache.lucene.russian.morphology.analayzer;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import java.io.FileReader;
import java.io.BufferedReader;
@ -24,13 +26,13 @@ public class ArrayEvristics {
public String getCanonicalForm(String form) {
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
Long suffix = RussianSuffixDecoderEncoder.encodeLong(form.substring(startSymbol));
Long suffix = RussianSuffixDecoderEncoder.encode(form.substring(startSymbol));
int index = Arrays.binarySearch(keys,suffix);
if(index == -1){
return form;
}else{
String nSuffix = RussianSuffixDecoderEncoder.decodeLong(values[index]);
String nSuffix = RussianSuffixDecoderEncoder.decode(values[index]);
return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
}
}

View File

@ -1,6 +1,5 @@
package org.apache.lucene.russian.morphology.analayzer;
import org.apache.lucene.russian.morphology.evristics.ArrayEvristics;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

View File

@ -1,26 +1,28 @@
package org.apache.lucene.russian.morphology.dictonary;
import org.apache.lucene.russian.morphology.dictonary.FlexiaModel;
import com.frielp.morph.automate.WordImpl;
import org.apache.lucene.russian.morphology.evristics.RussianSuffixDecoderEncoder;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import java.util.*;
import java.io.*;
public class DirtonaryReader {
/**
 * This class contains the logic for reading the
 * dictionary and producing each word with all of its forms.
 */
public class DictonaryReader {
private String fileName;
private String fileEncoding = "windows-1251";
private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
private Set<String> ingnoredForm = new HashSet<String>();
public DirtonaryReader(String fileName, Set<String> ingnoredForm) {
public DictonaryReader(String fileName, Set<String> ingnoredForm) {
this.fileName = fileName;
this.ingnoredForm = ingnoredForm;
}
public DirtonaryReader(String fileName, String fileEncoding, Set<String> ingnoredForm) {
public DictonaryReader(String fileName, String fileEncoding, Set<String> ingnoredForm) {
this.fileName = fileName;
this.fileEncoding = fileEncoding;
this.ingnoredForm = ingnoredForm;
@ -96,6 +98,7 @@ public class DirtonaryReader {
private void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
String[] fl = line.split("\\*");
// we ignore all forms whose lines split into three parts (see commented-out branch below)
// if (fl.length == 3)
// flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));

View File

@ -1,6 +1,8 @@
package org.apache.lucene.russian.morphology.dictonary;
/**
 * Represents information about how a word form is created from its immutable part.
 */
public class FlexiaModel {
private String code;
private String suffix;

View File

@ -3,16 +3,18 @@ package org.apache.lucene.russian.morphology.dictonary;
import java.util.List;
import java.util.ArrayList;
/**
 * Represents a word and all of its forms.
 */
public class WordCard {
private String canonicalFrom;
private List<String> wordsFroms = new ArrayList<String>();
public WordCard(String canonicalFrom) {
protected WordCard(String canonicalFrom) {
this.canonicalFrom = canonicalFrom;
}
public void addFrom(String word){
protected void addFrom(String word){
wordsFroms.add(word);
}

View File

@ -2,7 +2,10 @@ package org.apache.lucene.russian.morphology.dictonary;
import java.io.IOException;
/**
 * Interface that allows consumers to receive words from
 * {@link org.apache.lucene.russian.morphology.dictonary.DictonaryReader}.
 */
public interface WordProccessor {
public void proccess(WordCard wordCard) throws IOException;

View File

@ -1,5 +1,7 @@
package org.apache.lucene.russian.morphology.evristics;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import java.util.*;
import java.io.*;
@ -8,22 +10,20 @@ public class Evristic {
private TreeMap<Long, Long> encodedSuffixesPairs = new TreeMap<Long, Long>();
public void addEvristic(SuffixEvristic suffixEvristic) {
Long suffix = RussianSuffixDecoderEncoder.encodeLong(suffixEvristic.getFormSuffix());
Long suffix = RussianSuffixDecoderEncoder.encode(suffixEvristic.getFormSuffix());
Long longs = encodedSuffixesPairs.get(suffix);
if (longs == null) {
encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encodeLong(suffixEvristic.getNormalSuffix()));
encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encode(suffixEvristic.getNormalSuffix()));
}
}
public String getNormalForm(String form) {
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
Long suffix = RussianSuffixDecoderEncoder.encodeLong(form.substring(startSymbol));
Long suffix = RussianSuffixDecoderEncoder.encode(form.substring(startSymbol));
Long normalSuffix = encodedSuffixesPairs.get(suffix);
if (normalSuffix != null) {
String nSuffix = RussianSuffixDecoderEncoder.decodeLong(normalSuffix);
String nSuffix = RussianSuffixDecoderEncoder.decode(normalSuffix);
return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
}

View File

@ -1,6 +0,0 @@
package org.apache.lucene.russian.morphology.evristics;
// NOTE(review): empty placeholder class with no members — appears unused; confirm before keeping.
public class LemmasFreq {
}

View File

@ -1,60 +0,0 @@
package org.apache.lucene.russian.morphology.evristics;
/**
 * Packs short Russian suffixes into numbers (base 35) and unpacks them.
 * Characters below the Russian lowercase range are stored as code 33 (dash),
 * and 'ё' (code 34) is collapsed onto 'е' (code 6) during encoding.
 */
public class RussianSuffixDecoderEncoder {
    public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
    public static final int SUFFIX_LENGTH = 7;

    /**
     * Packs a suffix of at most 6 characters into an int.
     */
    static public Integer encode(String string) {
        if (string.length() > 6) throw new RuntimeException("suffix to long");
        int packed = 0;
        for (int pos = 0, len = string.length(); pos < len; pos++) {
            int code = string.charAt(pos) - RUSSIAN_SMALL_LETTER_OFFSET;
            if (code < 0) code = 33;  // dash (or anything below the range)
            if (code == 34) code = 6; // fold 'ё' onto 'е'
            packed = packed * 35 + code;
        }
        return packed;
    }

    /**
     * Unpacks an int produced by {@link #encode(String)}.
     * Note: unlike {@link #decodeLong(Long)}, this variant does not
     * restore the dash character.
     */
    static public String decode(Integer suffixN) {
        StringBuilder out = new StringBuilder();
        int rest = suffixN;
        while (rest > 35) {
            out.append((char) (rest % 35 + RUSSIAN_SMALL_LETTER_OFFSET));
            rest /= 35;
        }
        out.append((char) (rest + RUSSIAN_SMALL_LETTER_OFFSET));
        return out.reverse().toString();
    }

    /**
     * Packs a suffix of at most 12 characters into a long.
     */
    static public Long encodeLong(String string) {
        if (string.length() > 12) throw new RuntimeException("suffix to long");
        long packed = 0L;
        for (int pos = 0, len = string.length(); pos < len; pos++) {
            int code = string.charAt(pos) - RUSSIAN_SMALL_LETTER_OFFSET;
            if (code < 0) code = 33;  // dash (or anything below the range)
            if (code == 34) code = 6; // fold 'ё' onto 'е'
            packed = packed * 35L + code;
        }
        return packed;
    }

    /**
     * Unpacks a long produced by {@link #encodeLong(String)}, restoring dashes.
     */
    static public String decodeLong(Long suffixN) {
        StringBuilder out = new StringBuilder();
        long rest = suffixN;
        while (rest > 35) {
            out.append(digitToChar(rest % 35));
            rest /= 35;
        }
        out.append(digitToChar(rest));
        return out.reverse().toString();
    }

    /** Maps one base-35 digit back to its character, restoring the dash. */
    private static char digitToChar(long digit) {
        long ch = digit + RUSSIAN_SMALL_LETTER_OFFSET;
        if (ch == 33 + RUSSIAN_SMALL_LETTER_OFFSET) ch = 45;
        return (char) ch;
    }
}

View File

@ -2,6 +2,7 @@ package org.apache.lucene.russian.morphology.evristics;
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
import org.apache.lucene.russian.morphology.dictonary.WordCard;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import java.util.Map;
import java.util.HashMap;

View File

@ -1,6 +1,9 @@
package org.apache.lucene.russian.morphology.evristics;
/**
 * Contains frequency information for a suffix heuristic
 * in the dictionary.
 */
public class SuffixCounter implements Comparable{
private SuffixEvristic suffixEvristic;
private Double amnout = 0.0;

View File

@ -1,6 +1,11 @@
package org.apache.lucene.russian.morphology.evristics;
/**
 * Represents the heuristic assumption that the
 * canonical form of a word is determined by its suffix.
 * It holds two suffixes taken from a given position: one from the
 * canonical word form and one from the inflected form.
 */
public class SuffixEvristic {
private String formSuffix;
private String normalSuffix;

View File

@ -1,38 +0,0 @@
package org.apache.lucene;
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
/**
* Unit test for simple App.
*/
/**
 * Minimal JUnit 3 smoke test for the application.
 */
public class AppTest extends TestCase {

    /**
     * Creates the test case.
     *
     * @param testName name of the test case
     */
    public AppTest(String testName) {
        super(testName);
    }

    /**
     * @return the suite of tests being tested
     */
    public static Test suite() {
        return new TestSuite(AppTest.class);
    }

    /**
     * Trivial always-passing check.
     */
    public void testApp() {
        assertTrue(true);
    }
}

View File

@ -0,0 +1,5 @@
package org.apache.lucene.russian.morphology;
/**
 * Test placeholder for {@code RussianSuffixDecoderEncoder}; no test methods yet.
 */
public class RussianSuffixDecoderEncoderTest {
}