git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@3 d817d54c-26ab-11de-abc9-2f7d1455ff7a

This commit is contained in:
alexander.a.kuznetsov 2009-04-11 20:33:25 +00:00
parent 63705d7e3b
commit b334960f5d
17 changed files with 113 additions and 161 deletions

14
pom.xml
View File

@ -28,4 +28,18 @@
<version>2.4.1</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.5</source>
<target>1.5</target>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,45 @@
package org.apache.lucene.russian.morphology;
/**
 * Helper class that encodes a suffix of a Russian word
 * into a long value and decodes it back.
 * Assumes the suffix contains only lowercase Russian letters and the dash.
 * Also assumes the letters е and ё coincide.
 */
/**
 * Utility class that encodes a suffix of a Russian word into a single
 * long value (base-35 packing) and decodes such a value back to a string.
 * Assumes the suffix contains only lowercase Russian letters and the dash;
 * the letters е and ё are treated as the same character.
 */
public class RussianSuffixDecoderEncoder {
    /** Unicode offset just below 'а' (U+0430), so 'а' maps to code 1. */
    public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
    /** Number of trailing characters treated as the suffix by callers. */
    public static final int SUFFIX_LENGTH = 7;
    /** Code of 'ё' after subtracting the offset. */
    private static final int EE_CHAR = 34;
    /** Code of 'е' after subtracting the offset. */
    private static final int E_CHAR = 6;
    /** The dash character '-'. */
    private static final int DASH_CHAR = 45;
    /** Code used to represent the dash inside the packed value. */
    private static final int DASH_CODE = 33;
    /** Radix of the packing; one more than the largest letter code. */
    private static final int RADIX = 35;
    /** Longest suffix that fits into a long without overflow (35^12 < 2^63). */
    private static final int MAX_ENCODED_LENGTH = 12;

    /**
     * Encodes the given suffix into a long.
     *
     * @param string suffix of lowercase Russian letters and dashes
     * @return packed base-35 representation of the suffix
     * @throws IllegalArgumentException if the suffix has more than 12 characters
     */
    static public Long encode(String string) {
        if (string.length() > MAX_ENCODED_LENGTH) {
            // Fixed message typo ("to long") and narrowed RuntimeException to
            // IllegalArgumentException (still unchecked, so callers are unaffected).
            throw new IllegalArgumentException("suffix too long: " + string);
        }
        long result = 0L;
        for (int i = 0; i < string.length(); i++) {
            int c = string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET;
            if (c < 0) {
                // Anything below the Russian lowercase range (i.e. the dash)
                // is stored under the reserved dash code.
                c = DASH_CODE;
            }
            if (c == EE_CHAR) c = E_CHAR; // fold 'ё' onto 'е'
            result = result * RADIX + c;
        }
        return result;
    }

    /**
     * Decodes a value produced by {@link #encode(String)} back into a string.
     *
     * @param suffixN packed suffix value
     * @return the decoded suffix ('ё' comes back as 'е')
     */
    static public String decode(Long suffixN) {
        StringBuilder result = new StringBuilder();
        long rest = suffixN;
        while (rest > RADIX) {
            result.insert(0, digitToChar(rest % RADIX));
            rest /= RADIX;
        }
        result.insert(0, digitToChar(rest));
        return result.toString();
    }

    /** Maps a single base-35 digit back to its character, restoring the dash. */
    private static char digitToChar(long digit) {
        long c = digit + RUSSIAN_SMALL_LETTER_OFFSET;
        if (c == DASH_CODE + RUSSIAN_SMALL_LETTER_OFFSET) c = DASH_CHAR;
        return (char) c;
    }
}

View File

@ -1,6 +1,6 @@
package org.apache.lucene.russian.morphology;
import org.apache.lucene.russian.morphology.dictonary.DirtonaryReader;
import org.apache.lucene.russian.morphology.dictonary.DictonaryReader;
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
import org.apache.lucene.russian.morphology.dictonary.WordCard;
import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
@ -17,10 +17,10 @@ public class SuffixResearcher {
public static void main(String[] args) throws IOException {
IgnoredFormReader formReader = new IgnoredFormReader("igoredFrom.txt");
Set<String> form = formReader.getIngnoredFroms();
System.out.println(form);
DirtonaryReader dirtonaryReader = new DirtonaryReader("morphs.mrd", form);
DictonaryReader dictonaryReader = new DictonaryReader("morphs.mrd", form);
StatiticsCollectors statiticsCollectors = new StatiticsCollectors();
dirtonaryReader.proccess(statiticsCollectors);
dictonaryReader.proccess(statiticsCollectors);
Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values();
Object[] objects = counterCollection.toArray();
Arrays.sort(objects);
@ -33,10 +33,11 @@ public class SuffixResearcher {
for(int i = 0; i < objects.length; i++){
evristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic());
}
final AtomicInteger good = new AtomicInteger(0);
final AtomicInteger bad = new AtomicInteger(0);
final FileWriter writer = new FileWriter("incorret.txt");
dirtonaryReader.proccess(new WordProccessor(){
dictonaryReader.proccess(new WordProccessor(){
public void proccess(WordCard wordCard) throws IOException {
for(String wordForm:wordCard.getWordsFroms()){
String cf = wordCard.getCanonicalFrom();
@ -54,34 +55,5 @@ public class SuffixResearcher {
System.out.println("Good " + good + " Bad " + bad);
evristic.writeToFile("evriticsb");
// Map<String, Set<String>> perehod = new HashMap<String,Set<String>>();
// for(SuffixCounter suffixCounter:statiticsCollectors.getStatititics().values()){
// String sf = suffixCounter.getSuffixEvristic().getFormSuffix();
// Set<String> stringSet = perehod.get(sf);
// if (stringSet == null){
// stringSet = new HashSet<String>();
// perehod.put(sf,stringSet);
// }
// stringSet.add(suffixCounter.getSuffixEvristic().getNormalSuffix());
// //suffix.add(suffixCounter.getSuffixEvristic().getFormSuffix());
// //System.out.println(suffixCounter.);
// }
// System.out.println("Diffirent suffix " + perehod.size());
// int c = 0;
// int max_size = 0;
// int[] size_dist = new int[20];
// for(int j = 0; j < size_dist.length; j++) size_dist[j] = 0;
// for(Set<String> set:perehod.values()){
// size_dist[set.size()] ++;
// if (set.size() > 1){
// c++;
// //System.out.println(set);
// }
// if(set.size() > max_size) max_size = set.size();
// }
// System.out.println("max size of diffirent suffix " + max_size + " " + c);
// for(int j = 0; j < size_dist.length; j++) System.out.println("" + j + " " + size_dist[j]);
}
}

View File

@ -1,4 +1,6 @@
package org.apache.lucene.russian.morphology.evristics;
package org.apache.lucene.russian.morphology.analayzer;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import java.io.FileReader;
import java.io.BufferedReader;
@ -24,13 +26,13 @@ public class ArrayEvristics {
public String getCanonicalForm(String form) {
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
Long suffix = RussianSuffixDecoderEncoder.encodeLong(form.substring(startSymbol));
Long suffix = RussianSuffixDecoderEncoder.encode(form.substring(startSymbol));
int index = Arrays.binarySearch(keys,suffix);
if(index == -1){
return form;
}else{
String nSuffix = RussianSuffixDecoderEncoder.decodeLong(values[index]);
String nSuffix = RussianSuffixDecoderEncoder.decode(values[index]);
return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
}
}

View File

@ -1,6 +1,5 @@
package org.apache.lucene.russian.morphology.analayzer;
import org.apache.lucene.russian.morphology.evristics.ArrayEvristics;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

View File

@ -1,26 +1,28 @@
package org.apache.lucene.russian.morphology.dictonary;
import org.apache.lucene.russian.morphology.dictonary.FlexiaModel;
import com.frielp.morph.automate.WordImpl;
import org.apache.lucene.russian.morphology.evristics.RussianSuffixDecoderEncoder;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import java.util.*;
import java.io.*;
public class DirtonaryReader {
/**
 * This class contains the logic for reading the
 * dictionary and producing each word with all of its forms.
 */
public class DictonaryReader {
private String fileName;
private String fileEncoding = "windows-1251";
private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
private Set<String> ingnoredForm = new HashSet<String>();
public DirtonaryReader(String fileName, Set<String> ingnoredForm) {
public DictonaryReader(String fileName, Set<String> ingnoredForm) {
this.fileName = fileName;
this.ingnoredForm = ingnoredForm;
}
public DirtonaryReader(String fileName, String fileEncoding, Set<String> ingnoredForm) {
public DictonaryReader(String fileName, String fileEncoding, Set<String> ingnoredForm) {
this.fileName = fileName;
this.fileEncoding = fileEncoding;
this.ingnoredForm = ingnoredForm;
@ -96,6 +98,7 @@ public class DirtonaryReader {
private void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
String[] fl = line.split("\\*");
// we ignore all forms whose lines split into three parts (see commented-out branch below)
// if (fl.length == 3)
// flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));

View File

@ -1,6 +1,8 @@
package org.apache.lucene.russian.morphology.dictonary;
/**
 * Represents information about how a word form is created from its immutable part.
 */
public class FlexiaModel {
private String code;
private String suffix;

View File

@ -3,16 +3,18 @@ package org.apache.lucene.russian.morphology.dictonary;
import java.util.List;
import java.util.ArrayList;
/**
 * Represents a word and all of its forms.
 */
public class WordCard {
private String canonicalFrom;
private List<String> wordsFroms = new ArrayList<String>();
public WordCard(String canonicalFrom) {
protected WordCard(String canonicalFrom) {
this.canonicalFrom = canonicalFrom;
}
public void addFrom(String word){
protected void addFrom(String word){
wordsFroms.add(word);
}

View File

@ -2,7 +2,10 @@ package org.apache.lucene.russian.morphology.dictonary;
import java.io.IOException;
/**
 * Interface that allows consumers to receive words from
 * {@link org.apache.lucene.russian.morphology.dictonary.DictonaryReader}.
 */
public interface WordProccessor {
public void proccess(WordCard wordCard) throws IOException;

View File

@ -1,5 +1,7 @@
package org.apache.lucene.russian.morphology.evristics;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import java.util.*;
import java.io.*;
@ -8,22 +10,20 @@ public class Evristic {
private TreeMap<Long, Long> encodedSuffixesPairs = new TreeMap<Long, Long>();
public void addEvristic(SuffixEvristic suffixEvristic) {
Long suffix = RussianSuffixDecoderEncoder.encodeLong(suffixEvristic.getFormSuffix());
Long suffix = RussianSuffixDecoderEncoder.encode(suffixEvristic.getFormSuffix());
Long longs = encodedSuffixesPairs.get(suffix);
if (longs == null) {
encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encodeLong(suffixEvristic.getNormalSuffix()));
encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encode(suffixEvristic.getNormalSuffix()));
}
}
public String getNormalForm(String form) {
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
Long suffix = RussianSuffixDecoderEncoder.encodeLong(form.substring(startSymbol));
Long suffix = RussianSuffixDecoderEncoder.encode(form.substring(startSymbol));
Long normalSuffix = encodedSuffixesPairs.get(suffix);
if (normalSuffix != null) {
String nSuffix = RussianSuffixDecoderEncoder.decodeLong(normalSuffix);
String nSuffix = RussianSuffixDecoderEncoder.decode(normalSuffix);
return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
}

View File

@ -1,6 +0,0 @@
package org.apache.lucene.russian.morphology.evristics;
// NOTE(review): empty placeholder class with no members — appears unused; confirm before keeping.
public class LemmasFreq {
}

View File

@ -1,60 +0,0 @@
package org.apache.lucene.russian.morphology.evristics;
/**
 * Packs short Russian suffixes into numbers (base 35) and unpacks them.
 * Characters below the Russian lowercase range are stored as code 33 (dash),
 * and 'ё' (code 34) is collapsed onto 'е' (code 6) during encoding.
 */
public class RussianSuffixDecoderEncoder {
    public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
    public static final int SUFFIX_LENGTH = 7;

    /**
     * Packs a suffix of at most 6 characters into an int.
     */
    static public Integer encode(String string) {
        if (string.length() > 6) throw new RuntimeException("suffix to long");
        int packed = 0;
        for (int pos = 0, len = string.length(); pos < len; pos++) {
            int code = string.charAt(pos) - RUSSIAN_SMALL_LETTER_OFFSET;
            if (code < 0) code = 33;  // dash (or anything below the range)
            if (code == 34) code = 6; // fold 'ё' onto 'е'
            packed = packed * 35 + code;
        }
        return packed;
    }

    /**
     * Unpacks an int produced by {@link #encode(String)}.
     * Note: unlike {@link #decodeLong(Long)}, this variant does not
     * restore the dash character.
     */
    static public String decode(Integer suffixN) {
        StringBuilder out = new StringBuilder();
        int rest = suffixN;
        while (rest > 35) {
            out.append((char) (rest % 35 + RUSSIAN_SMALL_LETTER_OFFSET));
            rest /= 35;
        }
        out.append((char) (rest + RUSSIAN_SMALL_LETTER_OFFSET));
        return out.reverse().toString();
    }

    /**
     * Packs a suffix of at most 12 characters into a long.
     */
    static public Long encodeLong(String string) {
        if (string.length() > 12) throw new RuntimeException("suffix to long");
        long packed = 0L;
        for (int pos = 0, len = string.length(); pos < len; pos++) {
            int code = string.charAt(pos) - RUSSIAN_SMALL_LETTER_OFFSET;
            if (code < 0) code = 33;  // dash (or anything below the range)
            if (code == 34) code = 6; // fold 'ё' onto 'е'
            packed = packed * 35L + code;
        }
        return packed;
    }

    /**
     * Unpacks a long produced by {@link #encodeLong(String)}, restoring dashes.
     */
    static public String decodeLong(Long suffixN) {
        StringBuilder out = new StringBuilder();
        long rest = suffixN;
        while (rest > 35) {
            out.append(digitToChar(rest % 35));
            rest /= 35;
        }
        out.append(digitToChar(rest));
        return out.reverse().toString();
    }

    /** Maps one base-35 digit back to its character, restoring the dash. */
    private static char digitToChar(long digit) {
        long ch = digit + RUSSIAN_SMALL_LETTER_OFFSET;
        if (ch == 33 + RUSSIAN_SMALL_LETTER_OFFSET) ch = 45;
        return (char) ch;
    }
}

View File

@ -2,6 +2,7 @@ package org.apache.lucene.russian.morphology.evristics;
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
import org.apache.lucene.russian.morphology.dictonary.WordCard;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import java.util.Map;
import java.util.HashMap;

View File

@ -1,6 +1,9 @@
package org.apache.lucene.russian.morphology.evristics;
/**
 * Contains frequency information for a suffix heuristic
 * in the dictionary.
 */
public class SuffixCounter implements Comparable{
private SuffixEvristic suffixEvristic;
private Double amnout = 0.0;

View File

@ -1,6 +1,11 @@
package org.apache.lucene.russian.morphology.evristics;
/**
 * Represents the heuristic assumption that the
 * canonical form of a word is determined by its suffix.
 * It holds two suffixes taken from a given position: one from the
 * canonical word form and one from the inflected form.
 */
public class SuffixEvristic {
private String formSuffix;
private String normalSuffix;

View File

@ -1,38 +0,0 @@
package org.apache.lucene;
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
/**
* Unit test for simple App.
*/
/**
 * Minimal JUnit 3 smoke test for the application.
 */
public class AppTest extends TestCase {

    /**
     * Creates the test case.
     *
     * @param testName name of the test case
     */
    public AppTest(String testName) {
        super(testName);
    }

    /**
     * @return the suite of tests being tested
     */
    public static Test suite() {
        return new TestSuite(AppTest.class);
    }

    /**
     * Trivial always-passing check.
     */
    public void testApp() {
        assertTrue(true);
    }
}

View File

@ -0,0 +1,5 @@
package org.apache.lucene.russian.morphology;
/**
 * Test placeholder for {@code RussianSuffixDecoderEncoder}; no test methods yet.
 */
public class RussianSuffixDecoderEncoderTest {
}