git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@3 d817d54c-26ab-11de-abc9-2f7d1455ff7a

2009-04-11 20:33:25 +00:00 · 2009-04-11 20:33:25 +00:00 · b334960f5d
commit b334960f5d
parent 63705d7e3b
17 changed files with 113 additions and 161 deletions
--- a/pom.xml
+++ b/pom.xml
@ -28,4 +28,18 @@
            <version>2.4.1</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.5</source>
                    <target>1.5</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
 </project>
--- a/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java
@ -0,0 +1,45 @@
 package org.apache.lucene.russian.morphology;
 /**
 * Helper that packs the suffix of a Russian word into a single long
 * value and unpacks it again.
 * <p>
 * The suffix is read as a base-35 number with one digit per character:
 * a small Russian letter gets its UTF-16 code minus
 * {@link #RUSSIAN_SMALL_LETTER_OFFSET} (1..32) and the dash gets 33.
 * The suffix is assumed to contain only small Russian letters and the dash;
 * any character whose code is below the offset is encoded as a dash.
 * The letters IE (U+0435) and IO (U+0451) are treated as the same letter,
 * so IO is always decoded back as IE.
 */
 public class RussianSuffixDecoderEncoder {
    public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
    public static final int SUFFIX_LENGTH = 7;
    // Number of distinct digit values; every character occupies one base-35 digit.
    private static final long RADIX = 35L;
    // 35^12 still fits into a signed 64-bit long, 35^13 does not.
    private static final int MAX_ENCODED_LENGTH = 12;
    private static final int EE_CHAR = 34;
    private static final int E_CHAR = 6;
    private static final int DASH_CHAR = 45;
    private static final int DASH_CODE = 33;

    /**
     * Encodes a suffix as a base-35 number, first character most significant.
     *
     * @param string suffix to encode, at most 12 characters long
     * @return the encoded value; the empty string is encoded as 0
     * @throws RuntimeException if the suffix is longer than 12 characters
     */
    static public Long encode(String string) {
        if (string.length() > MAX_ENCODED_LENGTH) throw new RuntimeException("suffix to long");
        long result = 0L;
        for (int i = 0; i < string.length(); i++) {
            int code = string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET;
            if (code < 0) {
                // Everything below the Russian letter range is stored as a dash.
                code = DASH_CODE;
            } else if (code == EE_CHAR) {
                // IO is folded into IE.
                code = E_CHAR;
            }
            result = result * RADIX + code;
        }
        return Long.valueOf(result);
    }

    /**
     * Decodes a value produced by {@link #encode(String)} back into a suffix.
     * Digits are extracted until the value is exhausted, so 0 (the encoding
     * of the empty suffix) decodes to the empty string.
     *
     * @param suffixN encoded suffix
     * @return the decoded suffix; empty when {@code suffixN} is not positive
     */
    static public String decode(Long suffixN) {
        long rest = suffixN.longValue();
        StringBuilder reversed = new StringBuilder();
        while (rest > 0) {
            long digit = rest % RADIX;
            if (digit == DASH_CODE) {
                reversed.append((char) DASH_CHAR);
            } else {
                reversed.append((char) (digit + RUSSIAN_SMALL_LETTER_OFFSET));
            }
            rest /= RADIX;
        }
        // Digits come out least significant first; flip them into reading order.
        return reversed.reverse().toString();
    }
 }
--- a/src/main/java/org/apache/lucene/russian/morphology/SuffixResearcher.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/SuffixResearcher.java
@ -1,6 +1,6 @@
 package org.apache.lucene.russian.morphology;
-import org.apache.lucene.russian.morphology.dictonary.DirtonaryReader;
+import org.apache.lucene.russian.morphology.dictonary.DictonaryReader;
 import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
 import org.apache.lucene.russian.morphology.dictonary.WordCard;
 import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
@ -17,10 +17,10 @@ public class SuffixResearcher {
    public static void main(String[] args) throws IOException {
        IgnoredFormReader formReader = new IgnoredFormReader("igoredFrom.txt");
        Set<String> form = formReader.getIngnoredFroms();
-        System.out.println(form);
+
-        DirtonaryReader dirtonaryReader = new DirtonaryReader("morphs.mrd", form);
+        DictonaryReader dictonaryReader = new DictonaryReader("morphs.mrd", form);
        StatiticsCollectors statiticsCollectors = new StatiticsCollectors();
-        dirtonaryReader.proccess(statiticsCollectors);
+        dictonaryReader.proccess(statiticsCollectors);
        Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values();
        Object[] objects = counterCollection.toArray();
        Arrays.sort(objects);
@ -33,10 +33,11 @@ public class SuffixResearcher {
        for(int i = 0; i < objects.length; i++){
            evristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic());
        }
        final AtomicInteger good = new AtomicInteger(0);
        final AtomicInteger bad = new AtomicInteger(0);
        final FileWriter writer = new FileWriter("incorret.txt");
-        dirtonaryReader.proccess(new WordProccessor(){
+        dictonaryReader.proccess(new WordProccessor(){
            public void proccess(WordCard wordCard) throws IOException {
                for(String wordForm:wordCard.getWordsFroms()){
                    String cf = wordCard.getCanonicalFrom();
@ -54,34 +55,5 @@ public class SuffixResearcher {
        System.out.println("Good " + good + " Bad " + bad);
        evristic.writeToFile("evriticsb");
 //        Map<String, Set<String>> perehod = new HashMap<String,Set<String>>();
 //        for(SuffixCounter suffixCounter:statiticsCollectors.getStatititics().values()){
 //            String sf = suffixCounter.getSuffixEvristic().getFormSuffix();
 //            Set<String> stringSet = perehod.get(sf);
 //            if (stringSet == null){
 //                stringSet = new HashSet<String>();
 //                perehod.put(sf,stringSet);
 //            }
 //            stringSet.add(suffixCounter.getSuffixEvristic().getNormalSuffix());
 //            //suffix.add(suffixCounter.getSuffixEvristic().getFormSuffix());
 //            //System.out.println(suffixCounter.);
 //        }
 //        System.out.println("Diffirent suffix " + perehod.size());
 //        int c = 0;
 //        int max_size = 0;
 //        int[] size_dist = new int[20];
 //        for(int j = 0; j < size_dist.length; j++) size_dist[j] = 0;
 //        for(Set<String> set:perehod.values()){
 //            size_dist[set.size()] ++;
 //            if (set.size() > 1){
 //                c++;
 //                //System.out.println(set);
 //            }
 //            if(set.size() > max_size) max_size = set.size();
 //        }
 //        System.out.println("max size of diffirent suffix " + max_size + " " + c);
 //        for(int j = 0; j < size_dist.length; j++) System.out.println("" + j + " " + size_dist[j]);
    }
 }
--- a/src/main/java/org/apache/lucene/russian/morphology/analayzer/ArrayEvristics.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/analayzer/ArrayEvristics.java
@ -1,4 +1,6 @@
-package org.apache.lucene.russian.morphology.evristics;
+package org.apache.lucene.russian.morphology.analayzer;
 import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
 import java.io.FileReader;
 import java.io.BufferedReader;
@ -24,13 +26,13 @@ public class ArrayEvristics {
    /**
     * Returns the canonical form of a word by replacing its trailing suffix
     * (at most RussianSuffixDecoderEncoder.SUFFIX_LENGTH characters).
     * The encoded suffix is looked up in keys; on a hit the entry of values
     * at the same index is decoded and appended to the untouched word start.
     * When the suffix is not found the word is returned unchanged.
     * NOTE(review): keys is presumably sorted ascending, as binarySearch
     * requires - confirm where the arrays are loaded.
     *
     * @param form word form to normalise
     * @return the word with its suffix replaced, or form itself on a miss
     */
    public String getCanonicalForm(String form) {
        int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
-        Long suffix = RussianSuffixDecoderEncoder.encodeLong(form.substring(startSymbol));
+        Long suffix = RussianSuffixDecoderEncoder.encode(form.substring(startSymbol));
        int index = Arrays.binarySearch(keys,suffix);
        // binarySearch reports a miss as -(insertionPoint) - 1, i.e. any
        // negative value, so testing for -1 alone would index values[] with
        // a negative number for most unknown suffixes.
        if(index < 0){
            return form;
        }else{
-            String nSuffix = RussianSuffixDecoderEncoder.decodeLong(values[index]);
+            String nSuffix = RussianSuffixDecoderEncoder.decode(values[index]);
            return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
        }
    }
--- a/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java
@ -1,6 +1,5 @@
 package org.apache.lucene.russian.morphology.analayzer;
 import org.apache.lucene.russian.morphology.evristics.ArrayEvristics;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
--- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java
@ -1,26 +1,28 @@
 package org.apache.lucene.russian.morphology.dictonary;
-import org.apache.lucene.russian.morphology.dictonary.FlexiaModel;
+import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
 import com.frielp.morph.automate.WordImpl;
 import org.apache.lucene.russian.morphology.evristics.RussianSuffixDecoderEncoder;
 import java.util.*;
 import java.io.*;
-public class DirtonaryReader {
+/**
 * This class contains the logic for reading the
 * dictionary and producing each word with all its forms.
 */
 public class DictonaryReader {
    private String fileName;
    private String fileEncoding = "windows-1251";
    private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
    private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
    private Set<String> ingnoredForm =  new HashSet<String>();
-    public DirtonaryReader(String fileName, Set<String> ingnoredForm) {
+    public DictonaryReader(String fileName, Set<String> ingnoredForm) {
        this.fileName = fileName;
        this.ingnoredForm = ingnoredForm;
    }
-    public DirtonaryReader(String fileName, String fileEncoding, Set<String> ingnoredForm) {
+    public DictonaryReader(String fileName, String fileEncoding, Set<String> ingnoredForm) {
        this.fileName = fileName;
        this.fileEncoding = fileEncoding;
        this.ingnoredForm = ingnoredForm;
@ -96,6 +98,7 @@ public class DirtonaryReader {
    private void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
        String[] fl = line.split("\\*");
        // we inored all forms thats
      //  if (fl.length == 3)
      //      flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
        if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
--- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/FlexiaModel.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/FlexiaModel.java
@ -1,6 +1,8 @@
 package org.apache.lucene.russian.morphology.dictonary;
-
+/**
 * Represents information about how a word form is created from its immutable part.
 */
 public class FlexiaModel {
    private String code;
    private String suffix;
--- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordCard.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordCard.java
@ -3,16 +3,18 @@ package org.apache.lucene.russian.morphology.dictonary;
 import java.util.List;
 import java.util.ArrayList;
-
+/**
 * Represents a word and all its forms.
 */
 public class WordCard {
    private String canonicalFrom;
    private List<String> wordsFroms = new ArrayList<String>();
-    public WordCard(String canonicalFrom) {
+    protected WordCard(String canonicalFrom) {
        this.canonicalFrom = canonicalFrom;
    }
-    public void addFrom(String word){
+    protected void addFrom(String word){
        wordsFroms.add(word);
    }
--- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordProccessor.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordProccessor.java
@ -2,7 +2,10 @@ package org.apache.lucene.russian.morphology.dictonary;
 import java.io.IOException;
-
+/**
 * Interface that allows getting information from
 * {@link org.apache.lucene.russian.morphology.dictonary.DictonaryReader}.
 */
 public interface WordProccessor {
    public void proccess(WordCard wordCard) throws IOException;
--- a/src/main/java/org/apache/lucene/russian/morphology/evristics/Evristic.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/evristics/Evristic.java
@ -1,5 +1,7 @@
 package org.apache.lucene.russian.morphology.evristics;
 import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
 import java.util.*;
 import java.io.*;
@ -8,22 +10,20 @@ public class Evristic {
    private TreeMap<Long, Long> encodedSuffixesPairs = new TreeMap<Long, Long>();
    public void addEvristic(SuffixEvristic suffixEvristic) {
-        Long suffix = RussianSuffixDecoderEncoder.encodeLong(suffixEvristic.getFormSuffix());
+        Long suffix = RussianSuffixDecoderEncoder.encode(suffixEvristic.getFormSuffix());
        Long longs = encodedSuffixesPairs.get(suffix);
        if (longs == null) {
-            encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encodeLong(suffixEvristic.getNormalSuffix()));
+            encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encode(suffixEvristic.getNormalSuffix()));
        }
    }
    public String getNormalForm(String form) {
        int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
-        Long suffix = RussianSuffixDecoderEncoder.encodeLong(form.substring(startSymbol));
+        Long suffix = RussianSuffixDecoderEncoder.encode(form.substring(startSymbol));
        Long normalSuffix = encodedSuffixesPairs.get(suffix);
        if (normalSuffix != null) {
-            String nSuffix = RussianSuffixDecoderEncoder.decodeLong(normalSuffix);
+            String nSuffix = RussianSuffixDecoderEncoder.decode(normalSuffix);
            return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
        }
--- a/src/main/java/org/apache/lucene/russian/morphology/evristics/LemmasFreq.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/evristics/LemmasFreq.java
@ -1,6 +0,0 @@
 package org.apache.lucene.russian.morphology.evristics;
 public class LemmasFreq {
 }
--- a/src/main/java/org/apache/lucene/russian/morphology/evristics/RussianSuffixDecoderEncoder.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/evristics/RussianSuffixDecoderEncoder.java
@ -1,60 +0,0 @@
 package org.apache.lucene.russian.morphology.evristics;
 public class RussianSuffixDecoderEncoder {
    public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
    public static final int SUFFIX_LENGTH = 7;
    static public Integer encode(String string) {
        if (string.length() > 6) throw new RuntimeException("suffix to long");
        int result = 0;
        for (int i = 0; i < string.length(); i++) {
            int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET;
            if (c < 0) {
                c = 33;
            }
            if (c == 34) c = 6;
            result = result * 35 + c;
        }
        return result;
    }
    static public String decode(Integer suffixN) {
        String result = "";
        while (suffixN > 35) {
            result = (char) (suffixN % 35 + RUSSIAN_SMALL_LETTER_OFFSET) + result;
            suffixN /= 35;
        }
        result = (char) (suffixN + RUSSIAN_SMALL_LETTER_OFFSET) + result;
        return result;
    }
    static public Long encodeLong(String string) {
        if (string.length() > 12) throw new RuntimeException("suffix to long");
        long result = 0L;
        for (int i = 0; i < string.length(); i++) {
            int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET;
            if (c < 0) {
                c = 33;
            }
            if (c == 34) c = 6;
            result = result * 35L + c;
        }
        return result;
    }
    static public String decodeLong(Long suffixN) {
        String result = "";
        while (suffixN > 35) {
            long c = suffixN % 35 + RUSSIAN_SMALL_LETTER_OFFSET;
            if (c == 33 + RUSSIAN_SMALL_LETTER_OFFSET) c = 45;
            result = (char) c + result;
            suffixN /= 35;
        }
        long c = suffixN + RUSSIAN_SMALL_LETTER_OFFSET;
        if (c == 33 + RUSSIAN_SMALL_LETTER_OFFSET) c = 45;
        result = (char) c + result;
        return result;
    }
 }
--- a/src/main/java/org/apache/lucene/russian/morphology/evristics/StatiticsCollectors.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/evristics/StatiticsCollectors.java
@ -2,6 +2,7 @@ package org.apache.lucene.russian.morphology.evristics;
 import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
 import org.apache.lucene.russian.morphology.dictonary.WordCard;
 import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
 import java.util.Map;
 import java.util.HashMap;
--- a/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixCounter.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixCounter.java
@ -1,6 +1,9 @@
 package org.apache.lucene.russian.morphology.evristics;
-
+/**
 * Contains information about the frequency of a suffix heuristic
 * in the dictionary.
 */
 public class SuffixCounter implements Comparable{
    private SuffixEvristic suffixEvristic;
    private Double amnout = 0.0;
--- a/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixEvristic.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixEvristic.java
@ -1,6 +1,11 @@
 package org.apache.lucene.russian.morphology.evristics;
-
+/**
 * Represents a heuristic which assumes that the
 * canonical form of a word is determined by the word's suffix.
 * It contains two suffixes taken from a given position:
 * one of the canonical word form and one of the word form itself.
 */
 public class SuffixEvristic {
    private String formSuffix;
    private String normalSuffix;
--- a/src/test/java/org/apache/lucene/AppTest.java
+++ b/src/test/java/org/apache/lucene/AppTest.java
@ -1,38 +0,0 @@
 package org.apache.lucene;
 import junit.framework.Test;
 import junit.framework.TestCase;
 import junit.framework.TestSuite;
 /**
 * Unit test for simple App.
 */
 public class AppTest 
    extends TestCase
 {
    /**
     * Create the test case
     *
     * @param testName name of the test case
     */
    public AppTest( String testName )
    {
        super( testName );
    }
    /**
     * @return the suite of tests being tested
     */
    public static Test suite()
    {
        return new TestSuite( AppTest.class );
    }
    /**
     * Rigourous Test :-)
     */
    public void testApp()
    {
        assertTrue( true );
    }
 }
--- a/src/test/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoderTest.java
+++ b/src/test/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoderTest.java
@ -0,0 +1,5 @@
 package org.apache.lucene.russian.morphology;
 public class RussianSuffixDecoderEncoderTest {
 }