diff --git a/pom.xml b/pom.xml
index 04468d5..4a32899 100644
--- a/pom.xml
+++ b/pom.xml
@@ -28,4 +28,18 @@
2.4.1
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+
+ 1.5
+ 1.5
+
+
+
+
+
diff --git a/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java b/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java
new file mode 100644
index 0000000..2f9ca5e
--- /dev/null
+++ b/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java
@@ -0,0 +1,45 @@
+package org.apache.lucene.russian.morphology;
+
+/**
+ * Helper that encodes the suffix of a Russian word as a base-35 long value
+ * and decodes such a value back into a string.
+ * The suffix is assumed to contain only lowercase Russian letters and the dash.
+ * The letters е (U+0435) and ё (U+0451) are treated as one letter: ё is encoded as е.
+ */
+public class RussianSuffixDecoderEncoder {
+ public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071; // subtracted from each char code in encode() and added back in decode(); 'а' (1072) becomes digit 1
+ public static final int SUFFIX_LENGTH = 7; // not referenced inside this class
+ private static final int EE_CHAR = 34; // digit of ё: 1105 - 1071
+ private static final int E_CHAR = 6; // digit of е: 1077 - 1071
+ private static final int DASH_CHAR = 45; // character code of '-'
+ private static final int DASH_CODE = 33; // digit that stands for the dash
+
+
+ static public Long encode(String string) { // packs the characters of string into one long, first character as the most significant base-35 digit
+ if (string.length() > 12) throw new RuntimeException("suffix to long"); // 12 base-35 digits still fit in a signed 64-bit long
+ long result = 0L;
+ for (int i = 0; i < string.length(); i++) {
+ int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET;
+ if (c < 0) { // every character whose code is below the offset (not only '-') is encoded as the dash
+ c = DASH_CODE;
+ }
+ if (c == EE_CHAR) c = E_CHAR; // ё is stored as е
+ result = result * 35L + c; // NOTE(review): characters with codes above 1105 give c >= 35 and are not rejected here
+ }
+ return result;
+ }
+
+ static public String decode(Long suffixN) { // turns a value produced by encode() back into text; ё is not restored, it comes back as е
+ String result = "";
+ while (suffixN > 35) { // NOTE(review): a value of exactly 35 is not split into digits; it needs a 0 digit, which encode() does not emit for the assumed alphabet
+ long c = suffixN % 35 + RUSSIAN_SMALL_LETTER_OFFSET; // least significant digit mapped back to a char code
+ if (c == DASH_CODE + RUSSIAN_SMALL_LETTER_OFFSET) c = DASH_CHAR; // digit 33 is turned back into '-'
+ result = (char) c + result; // prepend, because digits are extracted last-character-first
+ suffixN /= 35;
+ }
+ long c = suffixN + RUSSIAN_SMALL_LETTER_OFFSET; // remaining value is the first character; NOTE(review): 0, which encode("") returns, decodes to the character with code 1071 instead of an empty string
+ if (c == DASH_CODE + RUSSIAN_SMALL_LETTER_OFFSET) c = DASH_CHAR;
+ result = (char) c + result;
+ return result;
+ }
+}
diff --git a/src/main/java/org/apache/lucene/russian/morphology/SuffixResearcher.java b/src/main/java/org/apache/lucene/russian/morphology/SuffixResearcher.java
index 2367751..259cc77 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/SuffixResearcher.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/SuffixResearcher.java
@@ -1,6 +1,6 @@
package org.apache.lucene.russian.morphology;
-import org.apache.lucene.russian.morphology.dictonary.DirtonaryReader;
+import org.apache.lucene.russian.morphology.dictonary.DictonaryReader;
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
import org.apache.lucene.russian.morphology.dictonary.WordCard;
import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
@@ -17,10 +17,10 @@ public class SuffixResearcher {
public static void main(String[] args) throws IOException {
IgnoredFormReader formReader = new IgnoredFormReader("igoredFrom.txt");
Set form = formReader.getIngnoredFroms();
- System.out.println(form);
- DirtonaryReader dirtonaryReader = new DirtonaryReader("morphs.mrd", form);
+
+ DictonaryReader dictonaryReader = new DictonaryReader("morphs.mrd", form);
StatiticsCollectors statiticsCollectors = new StatiticsCollectors();
- dirtonaryReader.proccess(statiticsCollectors);
+ dictonaryReader.proccess(statiticsCollectors);
Collection counterCollection = statiticsCollectors.getStatititics().values();
Object[] objects = counterCollection.toArray();
Arrays.sort(objects);
@@ -33,10 +33,11 @@ public class SuffixResearcher {
for(int i = 0; i < objects.length; i++){
evristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic());
}
+
final AtomicInteger good = new AtomicInteger(0);
final AtomicInteger bad = new AtomicInteger(0);
final FileWriter writer = new FileWriter("incorret.txt");
- dirtonaryReader.proccess(new WordProccessor(){
+ dictonaryReader.proccess(new WordProccessor(){
public void proccess(WordCard wordCard) throws IOException {
for(String wordForm:wordCard.getWordsFroms()){
String cf = wordCard.getCanonicalFrom();
@@ -54,34 +55,5 @@ public class SuffixResearcher {
System.out.println("Good " + good + " Bad " + bad);
evristic.writeToFile("evriticsb");
-
-
-// Map> perehod = new HashMap>();
-// for(SuffixCounter suffixCounter:statiticsCollectors.getStatititics().values()){
-// String sf = suffixCounter.getSuffixEvristic().getFormSuffix();
-// Set stringSet = perehod.get(sf);
-// if (stringSet == null){
-// stringSet = new HashSet();
-// perehod.put(sf,stringSet);
-// }
-// stringSet.add(suffixCounter.getSuffixEvristic().getNormalSuffix());
-// //suffix.add(suffixCounter.getSuffixEvristic().getFormSuffix());
-// //System.out.println(suffixCounter.);
-// }
-// System.out.println("Diffirent suffix " + perehod.size());
-// int c = 0;
-// int max_size = 0;
-// int[] size_dist = new int[20];
-// for(int j = 0; j < size_dist.length; j++) size_dist[j] = 0;
-// for(Set set:perehod.values()){
-// size_dist[set.size()] ++;
-// if (set.size() > 1){
-// c++;
-// //System.out.println(set);
-// }
-// if(set.size() > max_size) max_size = set.size();
-// }
-// System.out.println("max size of diffirent suffix " + max_size + " " + c);
-// for(int j = 0; j < size_dist.length; j++) System.out.println("" + j + " " + size_dist[j]);
}
}
diff --git a/src/main/java/org/apache/lucene/russian/morphology/evristics/ArrayEvristics.java b/src/main/java/org/apache/lucene/russian/morphology/analayzer/ArrayEvristics.java
similarity index 76%
rename from src/main/java/org/apache/lucene/russian/morphology/evristics/ArrayEvristics.java
rename to src/main/java/org/apache/lucene/russian/morphology/analayzer/ArrayEvristics.java
index 1d529af..c91046f 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/evristics/ArrayEvristics.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/analayzer/ArrayEvristics.java
@@ -1,4 +1,6 @@
-package org.apache.lucene.russian.morphology.evristics;
+package org.apache.lucene.russian.morphology.analayzer;
+
+import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import java.io.FileReader;
import java.io.BufferedReader;
@@ -24,13 +26,13 @@ public class ArrayEvristics {
public String getCanonicalForm(String form) {
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
- Long suffix = RussianSuffixDecoderEncoder.encodeLong(form.substring(startSymbol));
+ Long suffix = RussianSuffixDecoderEncoder.encode(form.substring(startSymbol)); // NOTE(review): the lookup below treats only -1 as "not found", but Arrays.binarySearch returns -(insertionPoint) - 1 for a missing key, so other misses reach values[index] with a negative index; consider testing index < 0
int index = Arrays.binarySearch(keys,suffix);
if(index == -1){
return form;
}else{
- String nSuffix = RussianSuffixDecoderEncoder.decodeLong(values[index]);
+ String nSuffix = RussianSuffixDecoderEncoder.decode(values[index]); // decoded replacement suffix; appended to the untouched head of the word on the next line
return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
}
}
diff --git a/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java b/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java
index a7a2399..972467b 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java
@@ -1,6 +1,5 @@
package org.apache.lucene.russian.morphology.analayzer;
-import org.apache.lucene.russian.morphology.evristics.ArrayEvristics;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
diff --git a/src/main/java/org/apache/lucene/russian/morphology/dictonary/DirtonaryReader.java b/src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java
similarity index 91%
rename from src/main/java/org/apache/lucene/russian/morphology/dictonary/DirtonaryReader.java
rename to src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java
index ce00320..bbf5679 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/DirtonaryReader.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java
@@ -1,26 +1,28 @@
package org.apache.lucene.russian.morphology.dictonary;
-import org.apache.lucene.russian.morphology.dictonary.FlexiaModel;
-import com.frielp.morph.automate.WordImpl;
-import org.apache.lucene.russian.morphology.evristics.RussianSuffixDecoderEncoder;
+import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import java.util.*;
import java.io.*;
-public class DirtonaryReader {
+/**
+ * This class contains the logic for reading the
+ * dictionary and producing each word with all of its forms.
+ */
+public class DictonaryReader {
private String fileName;
private String fileEncoding = "windows-1251";
private List> wordsFlexias = new ArrayList>();
private List> wordPrefixes = new ArrayList>();
private Set ingnoredForm = new HashSet();
- public DirtonaryReader(String fileName, Set ingnoredForm) {
+ public DictonaryReader(String fileName, Set ingnoredForm) {
this.fileName = fileName;
this.ingnoredForm = ingnoredForm;
}
- public DirtonaryReader(String fileName, String fileEncoding, Set ingnoredForm) {
+ public DictonaryReader(String fileName, String fileEncoding, Set ingnoredForm) {
this.fileName = fileName;
this.fileEncoding = fileEncoding;
this.ingnoredForm = ingnoredForm;
@@ -96,6 +98,7 @@ public class DirtonaryReader {
private void addFlexia(ArrayList flexiaModelArrayList, String line) {
String[] fl = line.split("\\*");
+ // forms with three fields (fl.length == 3) are ignored; see the disabled code below
// if (fl.length == 3)
// flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
diff --git a/src/main/java/org/apache/lucene/russian/morphology/dictonary/FlexiaModel.java b/src/main/java/org/apache/lucene/russian/morphology/dictonary/FlexiaModel.java
index 613f24c..e983311 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/FlexiaModel.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/FlexiaModel.java
@@ -1,6 +1,8 @@
package org.apache.lucene.russian.morphology.dictonary;
-
+/**
+ * Represents information about how a word form is created from its immutable part.
+ */
public class FlexiaModel {
private String code;
private String suffix;
diff --git a/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordCard.java b/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordCard.java
index 5ff29e4..a37b107 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordCard.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordCard.java
@@ -3,16 +3,18 @@ package org.apache.lucene.russian.morphology.dictonary;
import java.util.List;
import java.util.ArrayList;
-
+/**
+ * Represents a word and all of its forms.
+ */
public class WordCard {
private String canonicalFrom;
private List wordsFroms = new ArrayList();
- public WordCard(String canonicalFrom) {
+ protected WordCard(String canonicalFrom) {
this.canonicalFrom = canonicalFrom;
}
- public void addFrom(String word){
+ protected void addFrom(String word){
wordsFroms.add(word);
}
diff --git a/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordProccessor.java b/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordProccessor.java
index 5108e3f..3f93f43 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordProccessor.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordProccessor.java
@@ -2,7 +2,10 @@ package org.apache.lucene.russian.morphology.dictonary;
import java.io.IOException;
-
+/**
+ * Interface that allows getting information from
+ * {@link org.apache.lucene.russian.morphology.dictonary.DictonaryReader}.
+ */
public interface WordProccessor {
public void proccess(WordCard wordCard) throws IOException;
diff --git a/src/main/java/org/apache/lucene/russian/morphology/evristics/Evristic.java b/src/main/java/org/apache/lucene/russian/morphology/evristics/Evristic.java
index f195c32..79ce5f4 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/evristics/Evristic.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/evristics/Evristic.java
@@ -1,5 +1,7 @@
package org.apache.lucene.russian.morphology.evristics;
+import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
+
import java.util.*;
import java.io.*;
@@ -8,22 +10,20 @@ public class Evristic {
private TreeMap encodedSuffixesPairs = new TreeMap();
public void addEvristic(SuffixEvristic suffixEvristic) {
- Long suffix = RussianSuffixDecoderEncoder.encodeLong(suffixEvristic.getFormSuffix());
+ Long suffix = RussianSuffixDecoderEncoder.encode(suffixEvristic.getFormSuffix()); // map key: the encoded form suffix
Long longs = encodedSuffixesPairs.get(suffix);
if (longs == null) {
- encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encodeLong(suffixEvristic.getNormalSuffix()));
+ encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encode(suffixEvristic.getNormalSuffix())); // only the first normal suffix registered for a form suffix is kept; later ones are ignored
}
-
-
}
public String getNormalForm(String form) {
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
- Long suffix = RussianSuffixDecoderEncoder.encodeLong(form.substring(startSymbol));
+ Long suffix = RussianSuffixDecoderEncoder.encode(form.substring(startSymbol));
Long normalSuffix = encodedSuffixesPairs.get(suffix);
if (normalSuffix != null) {
- String nSuffix = RussianSuffixDecoderEncoder.decodeLong(normalSuffix);
+ String nSuffix = RussianSuffixDecoderEncoder.decode(normalSuffix);
return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
}
diff --git a/src/main/java/org/apache/lucene/russian/morphology/evristics/LemmasFreq.java b/src/main/java/org/apache/lucene/russian/morphology/evristics/LemmasFreq.java
deleted file mode 100644
index 24784df..0000000
--- a/src/main/java/org/apache/lucene/russian/morphology/evristics/LemmasFreq.java
+++ /dev/null
@@ -1,6 +0,0 @@
-package org.apache.lucene.russian.morphology.evristics;
-
-
-public class LemmasFreq {
-
-}
diff --git a/src/main/java/org/apache/lucene/russian/morphology/evristics/RussianSuffixDecoderEncoder.java b/src/main/java/org/apache/lucene/russian/morphology/evristics/RussianSuffixDecoderEncoder.java
deleted file mode 100644
index cd837c1..0000000
--- a/src/main/java/org/apache/lucene/russian/morphology/evristics/RussianSuffixDecoderEncoder.java
+++ /dev/null
@@ -1,60 +0,0 @@
-package org.apache.lucene.russian.morphology.evristics;
-
-
-public class RussianSuffixDecoderEncoder {
- public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
- public static final int SUFFIX_LENGTH = 7;
-
-
- static public Integer encode(String string) {
- if (string.length() > 6) throw new RuntimeException("suffix to long");
- int result = 0;
- for (int i = 0; i < string.length(); i++) {
- int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET;
- if (c < 0) {
- c = 33;
- }
- if (c == 34) c = 6;
- result = result * 35 + c;
- }
- return result;
- }
-
- static public String decode(Integer suffixN) {
- String result = "";
- while (suffixN > 35) {
- result = (char) (suffixN % 35 + RUSSIAN_SMALL_LETTER_OFFSET) + result;
- suffixN /= 35;
- }
- result = (char) (suffixN + RUSSIAN_SMALL_LETTER_OFFSET) + result;
- return result;
- }
-
- static public Long encodeLong(String string) {
- if (string.length() > 12) throw new RuntimeException("suffix to long");
- long result = 0L;
- for (int i = 0; i < string.length(); i++) {
- int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET;
- if (c < 0) {
- c = 33;
- }
- if (c == 34) c = 6;
- result = result * 35L + c;
- }
- return result;
- }
-
- static public String decodeLong(Long suffixN) {
- String result = "";
- while (suffixN > 35) {
- long c = suffixN % 35 + RUSSIAN_SMALL_LETTER_OFFSET;
- if (c == 33 + RUSSIAN_SMALL_LETTER_OFFSET) c = 45;
- result = (char) c + result;
- suffixN /= 35;
- }
- long c = suffixN + RUSSIAN_SMALL_LETTER_OFFSET;
- if (c == 33 + RUSSIAN_SMALL_LETTER_OFFSET) c = 45;
- result = (char) c + result;
- return result;
- }
-}
diff --git a/src/main/java/org/apache/lucene/russian/morphology/evristics/StatiticsCollectors.java b/src/main/java/org/apache/lucene/russian/morphology/evristics/StatiticsCollectors.java
index a93249e..e3271ff 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/evristics/StatiticsCollectors.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/evristics/StatiticsCollectors.java
@@ -2,6 +2,7 @@ package org.apache.lucene.russian.morphology.evristics;
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
import org.apache.lucene.russian.morphology.dictonary.WordCard;
+import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import java.util.Map;
import java.util.HashMap;
diff --git a/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixCounter.java b/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixCounter.java
index 34dd9b0..65462a8 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixCounter.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixCounter.java
@@ -1,6 +1,9 @@
package org.apache.lucene.russian.morphology.evristics;
-
+/**
+ * Contains information about the frequency of a suffix heuristic
+ * in the dictionary.
+ */
public class SuffixCounter implements Comparable{
private SuffixEvristic suffixEvristic;
private Double amnout = 0.0;
diff --git a/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixEvristic.java b/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixEvristic.java
index a7a23aa..ab5e1df 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixEvristic.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixEvristic.java
@@ -1,6 +1,11 @@
package org.apache.lucene.russian.morphology.evristics;
-
+/**
+ * Represents a heuristic which assumes that the
+ * canonical form of a word is defined by the word's suffix.
+ * It contains two suffixes taken from a given position: that of
+ * the canonical word form and that of the word form.
+ */
public class SuffixEvristic {
private String formSuffix;
private String normalSuffix;
diff --git a/src/test/java/org/apache/lucene/AppTest.java b/src/test/java/org/apache/lucene/AppTest.java
deleted file mode 100644
index 570f436..0000000
--- a/src/test/java/org/apache/lucene/AppTest.java
+++ /dev/null
@@ -1,38 +0,0 @@
-package org.apache.lucene;
-
-import junit.framework.Test;
-import junit.framework.TestCase;
-import junit.framework.TestSuite;
-
-/**
- * Unit test for simple App.
- */
-public class AppTest
- extends TestCase
-{
- /**
- * Create the test case
- *
- * @param testName name of the test case
- */
- public AppTest( String testName )
- {
- super( testName );
- }
-
- /**
- * @return the suite of tests being tested
- */
- public static Test suite()
- {
- return new TestSuite( AppTest.class );
- }
-
- /**
- * Rigourous Test :-)
- */
- public void testApp()
- {
- assertTrue( true );
- }
-}
diff --git a/src/test/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoderTest.java b/src/test/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoderTest.java
new file mode 100644
index 0000000..c343d34
--- /dev/null
+++ b/src/test/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoderTest.java
@@ -0,0 +1,5 @@
+package org.apache.lucene.russian.morphology;
+
+
+public class RussianSuffixDecoderEncoderTest { // placeholder: no test cases are defined in this class yet
+}