first commit

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@2 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
alexander.a.kuznetsov
2009-04-11 19:45:38 +00:00
parent b93b764296
commit 63705d7e3b
35 changed files with 200510 additions and 0 deletions

View File

@ -0,0 +1,13 @@
package org.apache.lucene;
/**
 * Minimal sample application (Maven archetype stub).
 * Prints a greeting to standard output and exits.
 */
public class App
{
    public static void main( String[] args )
    {
        // The archetype's canonical smoke-test output.
        System.out.println( "Hello World!" );
    }
}

View File

@ -0,0 +1,87 @@
package org.apache.lucene.russian.morphology;
import org.apache.lucene.russian.morphology.dictonary.DirtonaryReader;
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
import org.apache.lucene.russian.morphology.dictonary.WordCard;
import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
import org.apache.lucene.russian.morphology.evristics.StatiticsCollectors;
import org.apache.lucene.russian.morphology.evristics.SuffixCounter;
import org.apache.lucene.russian.morphology.evristics.Evristic;
import java.io.*;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Research driver: reads the morphological dictionary, collects statistics on
 * (word-form suffix, canonical-form suffix) pairs, builds a heuristic table
 * from the most frequent pairs, measures how often that heuristic recovers
 * the canonical form, and writes the table to disk.
 */
public class SuffixResearcher {
    /**
     * Runs the whole research pipeline.
     *
     * @param args ignored
     * @throws IOException if the dictionary, ignore-list or output files cannot be read/written
     */
    public static void main(String[] args) throws IOException {
        // Flexia codes listed in this file are excluded from statistics collection.
        IgnoredFormReader formReader = new IgnoredFormReader("igoredFrom.txt");
        Set<String> form = formReader.getIngnoredFroms();
        System.out.println(form);
        // NOTE(review): "morphs.mrd" is presumably an aot.ru-style *.mrd
        // morphology dictionary in windows-1251 — confirm format and location.
        DirtonaryReader dirtonaryReader = new DirtonaryReader("morphs.mrd", form);
        // First pass over the dictionary: count suffix pairs.
        StatiticsCollectors statiticsCollectors = new StatiticsCollectors();
        dirtonaryReader.proccess(statiticsCollectors);
        Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values();
        Object[] objects = counterCollection.toArray();
        // SuffixCounter's compareTo orders by descending frequency,
        // so the most common pairs come first after sorting.
        Arrays.sort(objects);
        System.out.println("Length " + objects.length + " ingored words " + statiticsCollectors.getIgnoredCount());
        // Show the 10 most frequent suffix pairs for manual inspection.
        for(int i = 0; i < 10; i++){
            System.out.println(objects[i]);
        }
        // Build the heuristic from pairs in frequency order; Evristic keeps
        // only the first (i.e. most frequent) mapping per form suffix.
        final Evristic evristic = new Evristic();
        for(int i = 0; i < objects.length; i++){
            evristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic());
        }
        // Second pass: evaluate the heuristic against the dictionary itself.
        final AtomicInteger good = new AtomicInteger(0);
        final AtomicInteger bad = new AtomicInteger(0);
        final FileWriter writer = new FileWriter("incorret.txt");
        dirtonaryReader.proccess(new WordProccessor(){
            public void proccess(WordCard wordCard) throws IOException {
                for(String wordForm:wordCard.getWordsFroms()){
                    String cf = wordCard.getCanonicalFrom();
                    if (evristic.getNormalForm(wordForm).equals(cf)){
                        good.incrementAndGet();
                    } else{
                        // Log each miss: the form, the expected canonical form,
                        // and what the heuristic produced instead.
                        writer.write(wordForm + " c " + cf + " f " + evristic.getNormalForm(wordForm) + "\n");
                        bad.incrementAndGet();
                    }
                }
            }
        });
        writer.close();
        System.out.println("Good " + good + " Bad " + bad);
        // Persist the heuristic table for later use by ArrayEvristics/Evristic readers.
        evristic.writeToFile("evriticsb");
        // Earlier experiment (kept for reference): measured how many distinct
        // normal suffixes each form suffix maps to, i.e. how ambiguous the
        // suffix-replacement heuristic is.
//        Map<String, Set<String>> perehod = new HashMap<String,Set<String>>();
//        for(SuffixCounter suffixCounter:statiticsCollectors.getStatititics().values()){
//            String sf = suffixCounter.getSuffixEvristic().getFormSuffix();
//            Set<String> stringSet = perehod.get(sf);
//            if (stringSet == null){
//                stringSet = new HashSet<String>();
//                perehod.put(sf,stringSet);
//            }
//            stringSet.add(suffixCounter.getSuffixEvristic().getNormalSuffix());
//            //suffix.add(suffixCounter.getSuffixEvristic().getFormSuffix());
//            //System.out.println(suffixCounter.);
//        }
//        System.out.println("Diffirent suffix " + perehod.size());
//        int c = 0;
//        int max_size = 0;
//        int[] size_dist = new int[20];
//        for(int j = 0; j < size_dist.length; j++) size_dist[j] = 0;
//        for(Set<String> set:perehod.values()){
//            size_dist[set.size()] ++;
//            if (set.size() > 1){
//                c++;
//                //System.out.println(set);
//            }
//            if(set.size() > max_size) max_size = set.size();
//        }
//        System.out.println("max size of diffirent suffix " + max_size + " " + c);
//        for(int j = 0; j < size_dist.length; j++) System.out.println("" + j + " " + size_dist[j]);
    }
}

View File

@ -0,0 +1,30 @@
package org.apache.lucene.russian.morphology.analayzer;
import org.apache.lucene.russian.morphology.evristics.ArrayEvristics;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
/**
 * Lucene token filter that replaces each token's text with the canonical
 * (normal) form produced by {@link ArrayEvristics#getCanonicalForm}.
 */
public class RussianMorphlogyFilter extends TokenFilter {
    // Suffix-replacement heuristic table used to normalize each term.
    private ArrayEvristics arrayEvristics;

    protected RussianMorphlogyFilter(TokenStream tokenStream, ArrayEvristics arrayEvristics) {
        super(tokenStream);
        this.arrayEvristics = arrayEvristics;
    }

    /**
     * Produces the next token with its term text replaced by the canonical form.
     *
     * NOTE(review): this does not call input.next(...), so it appears to operate
     * on the reusable token as-is rather than consuming the upstream stream —
     * confirm against the Lucene version in use (pre-2.9 Token API).
     */
    public Token next(final Token reusableToken) throws IOException {
        assert reusableToken != null;
        return createToken(arrayEvristics.getCanonicalForm(reusableToken.term()), reusableToken, reusableToken);
    }

    /**
     * Rewrites {@code reusableToken} in place to carry {@code synonym} as its text.
     * NOTE(review): position increment 0 normally marks a token at the SAME
     * position as the previous one (synonym injection); since the original
     * token is overwritten rather than kept, verify this is intended.
     */
    protected Token createToken(String synonym, Token current, final Token reusableToken) {
        reusableToken.reinit(current, synonym);
        reusableToken.setTermBuffer(synonym);
        reusableToken.setPositionIncrement(0);
        return reusableToken;
    }
}

View File

@ -0,0 +1,104 @@
package org.apache.lucene.russian.morphology.dictonary;
import org.apache.lucene.russian.morphology.dictonary.FlexiaModel;
import com.frielp.morph.automate.WordImpl;
import org.apache.lucene.russian.morphology.evristics.RussianSuffixDecoderEncoder;
import java.util.*;
import java.io.*;
/**
 * Streaming reader for an aot.ru-style *.mrd morphology dictionary.
 *
 * The file is read section by section in a fixed order (each section starts
 * with a line holding its entry count): flexia models, two skipped sections,
 * prefixes, then the word list. Every word is expanded into all its forms via
 * its flexia model and handed to a {@link WordProccessor} callback.
 *
 * NOTE(review): section order/format is assumed from this code only — confirm
 * against the aot.ru mrd format description.
 */
public class DirtonaryReader {
    // Path to the dictionary file.
    private String fileName;
    // Dictionary files default to the Windows Cyrillic encoding.
    private String fileEncoding = "windows-1251";
    // Flexia model lists, indexed by the model number referenced from the word section.
    private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
    // Prefix lists from the prefix section (read but not used by readWords yet).
    private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
    // Flexia codes whose words should be skipped entirely.
    private Set<String> ingnoredForm = new HashSet<String>();

    public DirtonaryReader(String fileName, Set<String> ingnoredForm) {
        this.fileName = fileName;
        this.ingnoredForm = ingnoredForm;
    }

    public DirtonaryReader(String fileName, String fileEncoding, Set<String> ingnoredForm) {
        this.fileName = fileName;
        this.fileEncoding = fileEncoding;
        this.ingnoredForm = ingnoredForm;
    }

    /**
     * Reads the whole dictionary, invoking {@code wordProccessor} once per word card.
     * The section order below mirrors the mrd file layout and must not change.
     *
     * @throws IOException if the file cannot be read
     */
    public void proccess(WordProccessor wordProccessor) throws IOException {
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), fileEncoding));
        readFlexias(bufferedReader);
        sckipBlock(bufferedReader);
        sckipBlock(bufferedReader);
        readPrefix(bufferedReader);
        readWords(bufferedReader,wordProccessor);
    }

    // Reads the word section: each line is "<base> <flexiaModelIndex> ...".
    // Expands each base via its flexia model into a WordCard (canonical form =
    // the model's first flexia) and forwards it to the callback.
    private void readWords(BufferedReader reader,WordProccessor wordProccessor) throws IOException {
        String s = reader.readLine();
        int count = Integer.valueOf(s);
        for (int i = 0; i < count; i++) {
            s = reader.readLine();
            // Progress indicator for the (large) word section.
            if (i % 10000 == 0) System.out.println("Proccess " + i + " word of " + count);
            String[] wd = s.split(" ");
            String word = wd[0].toLowerCase();
            // Entries starting with "-" are skipped (hyphenated/special entries).
            if (word.startsWith("-")) continue;
            // "#" denotes an empty base: all text comes from the flexia model.
            word = "#".equals(word) ? "" : word;
            List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1]));
            // Skip words whose leading flexia code is on the ignore list.
            if (models.size() > 0 && !ingnoredForm.contains(models.get(0).getCode())) {
                WordCard card = new WordCard(cleanString(models.get(0).create(word)));
                for (FlexiaModel fm : models) {
                    card.addFrom(cleanString(fm.create(word)));
                }
                wordProccessor.proccess(card);
            }
        }
    }

    // Maps one Cyrillic character onto another via fixed offsets from
    // RUSSIAN_SMALL_LETTER_OFFSET (1071): code 34 -> code 6.
    // NOTE(review): presumably folds one letter variant (likely "ё" handling)
    // into another — confirm the intended characters.
    private String cleanString(String s){
        return s.replace((char)(34 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET),(char)(6 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET));
    }

    // Skips one counted section: reads the count line, then that many lines.
    private void sckipBlock(BufferedReader reader) throws IOException {
        String s = reader.readLine();
        int count = Integer.valueOf(s);
        for (int i = 0; i < count; i++) {
            s = reader.readLine();
        }
    }

    // Reads the prefix section: one comma-separated, lower-cased list per line.
    private void readPrefix(BufferedReader reader) throws IOException {
        String s = reader.readLine();
        int count = Integer.valueOf(s);
        for (int i = 0; i < count; i++) {
            s = reader.readLine();
            wordPrefixes.add(Arrays.asList(s.toLowerCase().split(",")));
        }
    }

    // Reads the flexia-model section: each line holds "%"-separated flexia
    // descriptions forming one model (list of FlexiaModel).
    private void readFlexias(BufferedReader reader) throws IOException {
        String s = reader.readLine();
        int count = Integer.valueOf(s);
        for (int i = 0; i < count; i++) {
            s = reader.readLine();
            ArrayList<FlexiaModel> flexiaModelArrayList = new ArrayList<FlexiaModel>();
            wordsFlexias.add(flexiaModelArrayList);
            for (String line : s.split("%")) {
                addFlexia(flexiaModelArrayList, line);
            }
        }
    }

    // Parses one "*"-separated flexia description: "suffix*code[*prefix]".
    // Only the two-field form is kept; the three-field (prefixed) form is
    // currently disabled (see commented line).
    private void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
        String[] fl = line.split("\\*");
        //        if (fl.length == 3)
        //            flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
        if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
    }
}

View File

@ -0,0 +1,47 @@
package org.apache.lucene.russian.morphology.dictonary;
/**
 * One inflection rule of a flexia model: applying it to a word base yields
 * {@code prefix + base + suffix}. The code identifies the grammatical form.
 */
public class FlexiaModel {
    private String code;
    private String suffix;
    private String prefix;

    public FlexiaModel(String code, String suffix, String prefix) {
        this.code = code;
        this.suffix = suffix;
        this.prefix = prefix;
    }

    /** Builds the full word form for the given base. */
    public String create(String s) {
        return prefix + s + suffix;
    }

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    public String getSuffix() {
        return suffix;
    }

    public void setSuffix(String suffix) {
        this.suffix = suffix;
    }

    public String getPrefix() {
        return prefix;
    }

    public void setPrefix(String prefix) {
        this.prefix = prefix;
    }

    @Override
    public String toString() {
        return prefix + " " + suffix;
    }
}

View File

@ -0,0 +1,38 @@
package org.apache.lucene.russian.morphology.dictonary;
import java.util.Set;
import java.util.HashSet;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.FileInputStream;
import java.io.IOException;
/**
 * Reads the list of flexia codes to be ignored during dictionary processing.
 *
 * File format: one entry per line; the first space-separated token of each
 * line is the code. Lines starting with "//" are comments and skipped.
 */
public class IgnoredFormReader {
    // Path to the ignore-list file.
    private String fileName;
    // Dictionary-related files default to the Windows Cyrillic encoding.
    private String fileEncoding = "windows-1251";

    public IgnoredFormReader(String fileName) {
        this.fileName = fileName;
    }

    public IgnoredFormReader(String fileName, String fileEncoding) {
        this.fileName = fileName;
        this.fileEncoding = fileEncoding;
    }

    /**
     * Parses the file and returns the set of ignored form codes.
     *
     * @return first token of every non-comment line, trimmed
     * @throws IOException if the file cannot be opened or read
     */
    public Set<String> getIngnoredFroms() throws IOException {
        BufferedReader bufferedReader = new BufferedReader(
                new InputStreamReader(
                        new FileInputStream(fileName), fileEncoding));
        // BUG FIX: the reader was previously never closed, leaking the file
        // handle on every call (and on any read error).
        try {
            HashSet<String> result = new HashSet<String>();
            String s = bufferedReader.readLine();
            while (s != null) {
                if (!s.startsWith("//")) {
                    result.add(s.trim().split(" ")[0]);
                }
                s = bufferedReader.readLine();
            }
            return result;
        } finally {
            bufferedReader.close();
        }
    }
}

View File

@ -0,0 +1,26 @@
package org.apache.lucene.russian.morphology.dictonary;
import java.util.List;
import java.util.ArrayList;
/**
 * A word together with all of its inflected forms: the canonical (dictionary)
 * form plus the list of word forms generated from the flexia model.
 */
public class WordCard {
    private String canonicalFrom;
    private List<String> wordsFroms = new ArrayList<String>();

    public WordCard(String canonicalFrom) {
        this.canonicalFrom = canonicalFrom;
    }

    /** Appends one inflected form to this card. */
    public void addFrom(String word){
        wordsFroms.add(word);
    }

    public String getCanonicalFrom() {
        return canonicalFrom;
    }

    public List<String> getWordsFroms() {
        return wordsFroms;
    }
}

View File

@ -0,0 +1,9 @@
package org.apache.lucene.russian.morphology.dictonary;
import java.io.IOException;
/**
 * Callback interface for streaming dictionary processing: DirtonaryReader
 * invokes it once for every word card parsed from the dictionary.
 */
public interface WordProccessor {
    /**
     * Handles one word card (canonical form plus all inflected forms).
     *
     * @param wordCard the parsed word and its forms
     * @throws IOException implementations may write results to disk
     */
    public void proccess(WordCard wordCard) throws IOException;
}

View File

@ -0,0 +1,37 @@
package org.apache.lucene.russian.morphology.evristics;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.Arrays;
/**
 * Array-backed suffix-replacement heuristic: two parallel arrays map an
 * encoded word-form suffix (key) to the encoded suffix of its normal form
 * (value). Lookup uses binary search, so the key array must be sorted
 * ascending — which holds for files written by Evristic.writeToFile (TreeMap
 * key order).
 */
public class ArrayEvristics {
    // Sorted encoded form suffixes.
    private long[] keys;
    // values[i] is the encoded normal-form suffix for keys[i].
    private long[] values;

    /**
     * Loads the table produced by Evristic.writeToFile: first line is the
     * entry count, then one "key value" pair per line.
     *
     * @throws IOException if the file cannot be read
     */
    public void readFromFile(String fileName) throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(fileName));
        // BUG FIX: the reader was previously never closed.
        try {
            int size = Integer.valueOf(reader.readLine());
            keys = new long[size];
            values = new long[size];
            for (int i = 0; i < size; i++) {
                String[] s = reader.readLine().split(" ");
                keys[i] = Long.valueOf(s[0]);
                values[i] = Long.valueOf(s[1]);
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Returns the canonical form of {@code form} by replacing its trailing
     * SUFFIX_LENGTH characters according to the table; returns the form
     * unchanged when its suffix is not in the table.
     */
    public String getCanonicalForm(String form) {
        int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
        Long suffix = RussianSuffixDecoderEncoder.encodeLong(form.substring(startSymbol));
        int index = Arrays.binarySearch(keys, suffix);
        // BUG FIX: Arrays.binarySearch returns -(insertionPoint) - 1 for ANY
        // missing key, not just -1. The old "index == -1" check let other
        // negative indices through and crashed on values[index].
        if (index < 0) {
            return form;
        } else {
            String nSuffix = RussianSuffixDecoderEncoder.decodeLong(values[index]);
            return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
        }
    }
}

View File

@ -0,0 +1,54 @@
package org.apache.lucene.russian.morphology.evristics;
import java.util.*;
import java.io.*;
/**
 * TreeMap-backed suffix-replacement heuristic mapping an encoded word-form
 * suffix to the encoded suffix of its normal form. When pairs are added in
 * descending frequency order (see SuffixResearcher), each form suffix keeps
 * its most frequent normal suffix.
 */
public class Evristic {
    // Encoded form suffix -> encoded normal-form suffix; TreeMap keeps keys
    // sorted, which writeToFile relies on to emit a binary-searchable file.
    private TreeMap<Long, Long> encodedSuffixesPairs = new TreeMap<Long, Long>();

    /**
     * Registers one suffix pair; first registration for a form suffix wins,
     * later (less frequent) pairs for the same suffix are ignored.
     */
    public void addEvristic(SuffixEvristic suffixEvristic) {
        Long suffix = RussianSuffixDecoderEncoder.encodeLong(suffixEvristic.getFormSuffix());
        Long longs = encodedSuffixesPairs.get(suffix);
        if (longs == null) {
            encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encodeLong(suffixEvristic.getNormalSuffix()));
        }
    }

    /**
     * Returns the normal form of {@code form} by replacing its trailing
     * SUFFIX_LENGTH characters; returns the form unchanged when its suffix is
     * not in the table.
     */
    public String getNormalForm(String form) {
        int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
        Long suffix = RussianSuffixDecoderEncoder.encodeLong(form.substring(startSymbol));
        Long normalSuffix = encodedSuffixesPairs.get(suffix);
        if (normalSuffix != null) {
            String nSuffix = RussianSuffixDecoderEncoder.decodeLong(normalSuffix);
            return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
        }
        return form;
    }

    /**
     * Loads pairs written by {@link #writeToFile}. The leading count line
     * splits into a single token and is skipped by the length check.
     *
     * @throws IOException if the file cannot be read
     */
    public void readFromFile(String file) throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(file));
        try {
            String s = reader.readLine();
            while (s != null) {
                String[] sfns = s.split(" ");
                if (sfns.length == 2) {
                    // BUG FIX: the value was parsed from sfns[0] (the key)
                    // instead of sfns[1], so every loaded mapping sent a
                    // suffix to itself, making the heuristic a no-op.
                    encodedSuffixesPairs.put(Long.valueOf(sfns[0]), Long.valueOf(sfns[1]));
                }
                s = reader.readLine();
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Writes the table: first the entry count, then one "key value" pair per
     * line in ascending key order (TreeMap iteration order).
     *
     * @throws IOException if the file cannot be written
     */
    public void writeToFile(String file) throws IOException {
        FileWriter writer = new FileWriter(file);
        try {
            writer.write(encodedSuffixesPairs.size() + "\n");
            for (Long k : encodedSuffixesPairs.keySet()) {
                writer.write("" + k + " " + encodedSuffixesPairs.get(k) + "\n");
            }
        } finally {
            writer.close();
        }
    }
}

View File

@ -0,0 +1,6 @@
package org.apache.lucene.russian.morphology.evristics;
// NOTE(review): empty placeholder — presumably intended to hold lemma
// frequency statistics; no implementation yet.
public class LemmasFreq {
}

View File

@ -0,0 +1,60 @@
package org.apache.lucene.russian.morphology.evristics;
/**
 * Packs short Russian suffixes into integers using base-35 positional
 * encoding: each character is stored as its offset from
 * RUSSIAN_SMALL_LETTER_OFFSET. Characters below the offset (e.g. '-') encode
 * as 33; code 34 is folded to 6.
 *
 * The int variants hold up to 6 characters, the long variants up to 12.
 * NOTE(review): decode() lacks the 33 -> '-' mapping that decodeLong() has,
 * and the 6-char limit disagrees with SUFFIX_LENGTH = 7; behavior kept as-is.
 */
public class RussianSuffixDecoderEncoder {
    /** Code point just below the Russian lowercase range. */
    public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
    /** Number of trailing characters the heuristics operate on. */
    public static final int SUFFIX_LENGTH = 7;

    /** Encodes a suffix of at most 6 characters into a base-35 int. */
    static public Integer encode(String string) {
        if (string.length() > 6) throw new RuntimeException("suffix to long");
        int result = 0;
        for (char ch : string.toCharArray()) {
            int code = ch - RUSSIAN_SMALL_LETTER_OFFSET;
            if (code < 0) {
                code = 33;
            }
            if (code == 34) code = 6;
            result = result * 35 + code;
        }
        return result;
    }

    /** Decodes a base-35 int back into its suffix string. */
    static public String decode(Integer suffixN) {
        StringBuilder reversed = new StringBuilder();
        int rest = suffixN;
        while (rest > 35) {
            reversed.append((char) (rest % 35 + RUSSIAN_SMALL_LETTER_OFFSET));
            rest /= 35;
        }
        reversed.append((char) (rest + RUSSIAN_SMALL_LETTER_OFFSET));
        return reversed.reverse().toString();
    }

    /** Encodes a suffix of at most 12 characters into a base-35 long. */
    static public Long encodeLong(String string) {
        if (string.length() > 12) throw new RuntimeException("suffix to long");
        long result = 0L;
        for (char ch : string.toCharArray()) {
            int code = ch - RUSSIAN_SMALL_LETTER_OFFSET;
            if (code < 0) {
                code = 33;
            }
            if (code == 34) code = 6;
            result = result * 35L + code;
        }
        return result;
    }

    /** Decodes a base-35 long back into its suffix; code 33 maps to '-'. */
    static public String decodeLong(Long suffixN) {
        StringBuilder reversed = new StringBuilder();
        long rest = suffixN;
        while (rest > 35) {
            long code = rest % 35 + RUSSIAN_SMALL_LETTER_OFFSET;
            if (code == 33 + RUSSIAN_SMALL_LETTER_OFFSET) code = 45;
            reversed.append((char) code);
            rest /= 35;
        }
        long code = rest + RUSSIAN_SMALL_LETTER_OFFSET;
        if (code == 33 + RUSSIAN_SMALL_LETTER_OFFSET) code = 45;
        reversed.append((char) code);
        return reversed.reverse().toString();
    }
}

View File

@ -0,0 +1,51 @@
package org.apache.lucene.russian.morphology.evristics;
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
import org.apache.lucene.russian.morphology.dictonary.WordCard;
import java.util.Map;
import java.util.HashMap;
/**
 * WordProccessor that counts how often each (form suffix, normal suffix)
 * pair occurs across the dictionary. Word cards whose canonical form is
 * shorter than the suffix cut point are counted as ignored.
 */
public class StatiticsCollectors implements WordProccessor{
    // Frequency counter per observed suffix pair.
    Map<SuffixEvristic,SuffixCounter> statititics = new HashMap<SuffixEvristic,SuffixCounter>();
    // Number of forms skipped because no suffix pair could be built.
    private Integer ignoredCount = 0;

    /** Counts one suffix pair for every inflected form of the card. */
    public void proccess(WordCard wordCard) {
        String canonical = wordCard.getCanonicalFrom();
        for (String wordForm : wordCard.getWordsFroms()) {
            SuffixEvristic pair = createEvristic(canonical, wordForm);
            if (pair == null) {
                continue;
            }
            SuffixCounter counter = statititics.get(pair);
            if (counter == null) {
                counter = new SuffixCounter(pair);
                statititics.put(pair, counter);
            }
            counter.incrementAmount();
        }
    }

    public Map<SuffixEvristic, SuffixCounter> getStatititics() {
        return statititics;
    }

    // Builds the (form suffix, canonical suffix) pair for one form, or null
    // when the pair cannot be built (counts toward ignoredCount only in the
    // short-canonical-form case, matching the original behavior).
    private SuffixEvristic createEvristic(String word,String form){
        int cut = 0;
        if (form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH) {
            cut = form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH;
        }
        String formSuffix = form.substring(cut);
        if (word.length() < cut) {
            // Canonical form ends before the cut point: nothing comparable.
            ignoredCount++;
            return null;
        }
        String wordSuffix = word.length() > cut ? word.substring(cut) : "";
        if (wordSuffix.length() > 12) {
            // Too long to encode with encodeLong; log and skip.
            System.out.println(word + " " + form);
            return null;
        }
        return new SuffixEvristic(formSuffix, wordSuffix);
    }

    public Integer getIgnoredCount() {
        return ignoredCount;
    }
}

View File

@ -0,0 +1,41 @@
package org.apache.lucene.russian.morphology.evristics;
/**
 * Counts occurrences of one suffix pair. Natural ordering is by DESCENDING
 * count, so sorting an array of counters puts the most frequent pairs first.
 */
public class SuffixCounter implements Comparable{
    private SuffixEvristic suffixEvristic;
    // Occurrence count; kept as Double so compareTo can use Math.signum.
    private Double amount = 0.0;

    public SuffixCounter(SuffixEvristic suffixEvristic) {
        this.suffixEvristic = suffixEvristic;
    }

    /** Records one more occurrence of this suffix pair. */
    public void incrementAmount(){
        amount++;
    }

    public SuffixEvristic getSuffixEvristic() {
        return suffixEvristic;
    }

    public void setSuffixEvristic(SuffixEvristic suffixEvristic) {
        this.suffixEvristic = suffixEvristic;
    }

    public Double getAmnout() {
        return amount;
    }

    public void setAmnout(Double amnout) {
        this.amount = amnout;
    }

    /** Descending-count order; any non-SuffixCounter sorts after this. */
    public int compareTo(Object o) {
        if (!(o instanceof SuffixCounter)) {
            return -1;
        }
        double diff = ((SuffixCounter) o).amount - amount;
        return (int) Math.round(Math.signum(diff));
    }

    @Override
    public String toString() {
        return "" + amount + " " + suffixEvristic.toString();
    }
}

View File

@ -0,0 +1,56 @@
package org.apache.lucene.russian.morphology.evristics;
public class SuffixEvristic {
private String formSuffix;
private String normalSuffix;
public SuffixEvristic(String formSuffix, String normalSuffix) {
this.formSuffix = formSuffix;
this.normalSuffix = normalSuffix;
}
public String getFormSuffix() {
return formSuffix;
}
public void setFormSuffix(String formSuffix) {
this.formSuffix = formSuffix;
}
public String getNormalSuffix() {
return normalSuffix;
}
public void setNormalSuffix(String normalSuffix) {
this.normalSuffix = normalSuffix;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
SuffixEvristic that = (SuffixEvristic) o;
if (!formSuffix.equals(that.formSuffix)) return false;
if (!normalSuffix.equals(that.normalSuffix)) return false;
return true;
}
@Override
public int hashCode() {
int result = formSuffix.hashCode();
result = 31 * result + normalSuffix.hashCode();
return result;
}
@Override
public String toString() {
return "SuffixEvristic{" +
"formSuffix='" + formSuffix + '\'' +
", normalSuffix='" + normalSuffix + '\'' +
'}';
}
}

View File

@ -0,0 +1,38 @@
package org.apache.lucene;
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
/**
 * Unit test for simple App (JUnit 3 archetype stub).
 */
public class AppTest
    extends TestCase
{
    /**
     * Create the test case
     *
     * @param testName name of the test case
     */
    public AppTest( String testName )
    {
        super( testName );
    }

    /**
     * @return the suite of tests being tested
     */
    public static Test suite()
    {
        return new TestSuite( AppTest.class );
    }

    /**
     * Placeholder test — always passes; no real assertions yet.
     */
    public void testApp()
    {
        assertTrue( true );
    }
}