diff --git a/pom.xml b/pom.xml index 04468d5..4a32899 100644 --- a/pom.xml +++ b/pom.xml @@ -28,4 +28,18 @@ 2.4.1 + + + + + org.apache.maven.plugins + maven-compiler-plugin + + 1.5 + 1.5 + + + + + diff --git a/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java b/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java new file mode 100644 index 0000000..2f9ca5e --- /dev/null +++ b/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java @@ -0,0 +1,45 @@ +package org.apache.lucene.russian.morphology; + +/** + * This helper class allow encode suffix of russian word + * to long value and decode from it. + * Assumed that suffix contains only small russian letters and dash. + * Also assumed that letter å and ¸ coinsed. + */ +public class RussianSuffixDecoderEncoder { + public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071; + public static final int SUFFIX_LENGTH = 7; + private static final int EE_CHAR = 34; + private static final int E_CHAR = 6; + private static final int DASH_CHAR = 45; + private static final int DASH_CODE = 33; + + + static public Long encode(String string) { + if (string.length() > 12) throw new RuntimeException("suffix to long"); + long result = 0L; + for (int i = 0; i < string.length(); i++) { + int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET; + if (c < 0) { + c = DASH_CODE; + } + if (c == EE_CHAR) c = E_CHAR; + result = result * 35L + c; + } + return result; + } + + static public String decode(Long suffixN) { + String result = ""; + while (suffixN > 35) { + long c = suffixN % 35 + RUSSIAN_SMALL_LETTER_OFFSET; + if (c == DASH_CODE + RUSSIAN_SMALL_LETTER_OFFSET) c = DASH_CHAR; + result = (char) c + result; + suffixN /= 35; + } + long c = suffixN + RUSSIAN_SMALL_LETTER_OFFSET; + if (c == DASH_CODE + RUSSIAN_SMALL_LETTER_OFFSET) c = DASH_CHAR; + result = (char) c + result; + return result; + } +} diff --git a/src/main/java/org/apache/lucene/russian/morphology/SuffixResearcher.java b/src/main/java/org/apache/lucene/russian/morphology/SuffixResearcher.java index 2367751..259cc77 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/SuffixResearcher.java +++ b/src/main/java/org/apache/lucene/russian/morphology/SuffixResearcher.java @@ -1,6 +1,6 @@ package org.apache.lucene.russian.morphology; -import org.apache.lucene.russian.morphology.dictonary.DirtonaryReader; +import org.apache.lucene.russian.morphology.dictonary.DictonaryReader; import org.apache.lucene.russian.morphology.dictonary.WordProccessor; import org.apache.lucene.russian.morphology.dictonary.WordCard; import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader; @@ -17,10 +17,10 @@ public class SuffixResearcher { public static void main(String[] args) throws IOException { IgnoredFormReader formReader = new IgnoredFormReader("igoredFrom.txt"); Set form = formReader.getIngnoredFroms(); - System.out.println(form); - DirtonaryReader dirtonaryReader = new DirtonaryReader("morphs.mrd", form); + + DictonaryReader dictonaryReader = new DictonaryReader("morphs.mrd", form); StatiticsCollectors statiticsCollectors = new StatiticsCollectors(); - dirtonaryReader.proccess(statiticsCollectors); + dictonaryReader.proccess(statiticsCollectors); Collection counterCollection = statiticsCollectors.getStatititics().values(); Object[] objects = counterCollection.toArray(); Arrays.sort(objects); @@ -33,10 +33,11 @@ public class SuffixResearcher { for(int i = 0; i < objects.length; i++){ evristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic()); } + final AtomicInteger good = new AtomicInteger(0); final AtomicInteger bad = new AtomicInteger(0); final FileWriter writer = new FileWriter("incorret.txt"); - dirtonaryReader.proccess(new WordProccessor(){ + dictonaryReader.proccess(new WordProccessor(){ public void proccess(WordCard wordCard) throws IOException { for(String wordForm:wordCard.getWordsFroms()){ String cf = wordCard.getCanonicalFrom(); @@ -54,34 +55,5 @@ public class SuffixResearcher { System.out.println("Good " + good + " Bad " + bad); evristic.writeToFile("evriticsb"); - - -// Map> perehod = new HashMap>(); -// for(SuffixCounter suffixCounter:statiticsCollectors.getStatititics().values()){ -// String sf = suffixCounter.getSuffixEvristic().getFormSuffix(); -// Set stringSet = perehod.get(sf); -// if (stringSet == null){ -// stringSet = new HashSet(); -// perehod.put(sf,stringSet); -// } -// stringSet.add(suffixCounter.getSuffixEvristic().getNormalSuffix()); -// //suffix.add(suffixCounter.getSuffixEvristic().getFormSuffix()); -// //System.out.println(suffixCounter.); -// } -// System.out.println("Diffirent suffix " + perehod.size()); -// int c = 0; -// int max_size = 0; -// int[] size_dist = new int[20]; -// for(int j = 0; j < size_dist.length; j++) size_dist[j] = 0; -// for(Set set:perehod.values()){ -// size_dist[set.size()] ++; -// if (set.size() > 1){ -// c++; -// //System.out.println(set); -// } -// if(set.size() > max_size) max_size = set.size(); -// } -// System.out.println("max size of diffirent suffix " + max_size + " " + c); -// for(int j = 0; j < size_dist.length; j++) System.out.println("" + j + " " + size_dist[j]); } } diff --git a/src/main/java/org/apache/lucene/russian/morphology/evristics/ArrayEvristics.java b/src/main/java/org/apache/lucene/russian/morphology/analayzer/ArrayEvristics.java similarity index 76% rename from src/main/java/org/apache/lucene/russian/morphology/evristics/ArrayEvristics.java rename to src/main/java/org/apache/lucene/russian/morphology/analayzer/ArrayEvristics.java index 1d529af..c91046f 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/evristics/ArrayEvristics.java +++ b/src/main/java/org/apache/lucene/russian/morphology/analayzer/ArrayEvristics.java @@ -1,4 +1,6 @@ -package org.apache.lucene.russian.morphology.evristics; +package org.apache.lucene.russian.morphology.analayzer; + +import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder; import java.io.FileReader; import java.io.BufferedReader; @@ -24,13 +26,13 @@ public class ArrayEvristics { public String getCanonicalForm(String form) { int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0; - Long suffix = RussianSuffixDecoderEncoder.encodeLong(form.substring(startSymbol)); + Long suffix = RussianSuffixDecoderEncoder.encode(form.substring(startSymbol)); int index = Arrays.binarySearch(keys,suffix); if(index == -1){ return form; }else{ - String nSuffix = RussianSuffixDecoderEncoder.decodeLong(values[index]); + String nSuffix = RussianSuffixDecoderEncoder.decode(values[index]); return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix; } } diff --git a/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java b/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java index a7a2399..972467b 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java +++ b/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java @@ -1,6 +1,5 @@ package org.apache.lucene.russian.morphology.analayzer; -import org.apache.lucene.russian.morphology.evristics.ArrayEvristics; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; diff --git a/src/main/java/org/apache/lucene/russian/morphology/dictonary/DirtonaryReader.java b/src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java similarity index 91% rename from src/main/java/org/apache/lucene/russian/morphology/dictonary/DirtonaryReader.java rename to src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java index ce00320..bbf5679 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/DirtonaryReader.java +++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java @@ -1,26 +1,28 @@ package org.apache.lucene.russian.morphology.dictonary; -import org.apache.lucene.russian.morphology.dictonary.FlexiaModel; -import com.frielp.morph.automate.WordImpl; -import org.apache.lucene.russian.morphology.evristics.RussianSuffixDecoderEncoder; +import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder; import java.util.*; import java.io.*; -public class DirtonaryReader { +/** + * This class contain logic how read + * dictonary and produce word with it all forms. + */ +public class DictonaryReader { private String fileName; private String fileEncoding = "windows-1251"; private List> wordsFlexias = new ArrayList>(); private List> wordPrefixes = new ArrayList>(); private Set ingnoredForm = new HashSet(); - public DirtonaryReader(String fileName, Set ingnoredForm) { + public DictonaryReader(String fileName, Set ingnoredForm) { this.fileName = fileName; this.ingnoredForm = ingnoredForm; } - public DirtonaryReader(String fileName, String fileEncoding, Set ingnoredForm) { + public DictonaryReader(String fileName, String fileEncoding, Set ingnoredForm) { this.fileName = fileName; this.fileEncoding = fileEncoding; this.ingnoredForm = ingnoredForm; @@ -96,6 +98,7 @@ public class DirtonaryReader { private void addFlexia(ArrayList flexiaModelArrayList, String line) { String[] fl = line.split("\\*"); + // we inored all forms thats // if (fl.length == 3) // flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase())); if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), "")); diff --git a/src/main/java/org/apache/lucene/russian/morphology/dictonary/FlexiaModel.java b/src/main/java/org/apache/lucene/russian/morphology/dictonary/FlexiaModel.java index 613f24c..e983311 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/FlexiaModel.java +++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/FlexiaModel.java @@ -1,6 +1,8 @@ package org.apache.lucene.russian.morphology.dictonary; - +/** + * Represent inofrmation of how word form created form it imutible part. + */ public class FlexiaModel { private String code; private String suffix; diff --git a/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordCard.java b/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordCard.java index 5ff29e4..a37b107 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordCard.java +++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordCard.java @@ -3,16 +3,18 @@ package org.apache.lucene.russian.morphology.dictonary; import java.util.List; import java.util.ArrayList; - +/** + * Represent word and all it forms. + */ public class WordCard { private String canonicalFrom; private List wordsFroms = new ArrayList(); - public WordCard(String canonicalFrom) { + protected WordCard(String canonicalFrom) { this.canonicalFrom = canonicalFrom; } - public void addFrom(String word){ + protected void addFrom(String word){ wordsFroms.add(word); } diff --git a/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordProccessor.java b/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordProccessor.java index 5108e3f..3f93f43 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordProccessor.java +++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordProccessor.java @@ -2,7 +2,10 @@ package org.apache.lucene.russian.morphology.dictonary; import java.io.IOException; - +/** + * Interface allows get information from + * {@org.apache.lucene.russian.morphology.dictonary.DirtonaryReader}. + */ public interface WordProccessor { public void proccess(WordCard wordCard) throws IOException; diff --git a/src/main/java/org/apache/lucene/russian/morphology/evristics/Evristic.java b/src/main/java/org/apache/lucene/russian/morphology/evristics/Evristic.java index f195c32..79ce5f4 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/evristics/Evristic.java +++ b/src/main/java/org/apache/lucene/russian/morphology/evristics/Evristic.java @@ -1,5 +1,7 @@ package org.apache.lucene.russian.morphology.evristics; +import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder; + import java.util.*; import java.io.*; @@ -8,22 +10,20 @@ public class Evristic { private TreeMap encodedSuffixesPairs = new TreeMap(); public void addEvristic(SuffixEvristic suffixEvristic) { - Long suffix = RussianSuffixDecoderEncoder.encodeLong(suffixEvristic.getFormSuffix()); + Long suffix = RussianSuffixDecoderEncoder.encode(suffixEvristic.getFormSuffix()); Long longs = encodedSuffixesPairs.get(suffix); if (longs == null) { - encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encodeLong(suffixEvristic.getNormalSuffix())); + encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encode(suffixEvristic.getNormalSuffix())); } - - } public String getNormalForm(String form) { int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0; - Long suffix = RussianSuffixDecoderEncoder.encodeLong(form.substring(startSymbol)); + Long suffix = RussianSuffixDecoderEncoder.encode(form.substring(startSymbol)); Long normalSuffix = encodedSuffixesPairs.get(suffix); if (normalSuffix != null) { - String nSuffix = RussianSuffixDecoderEncoder.decodeLong(normalSuffix); + String nSuffix = RussianSuffixDecoderEncoder.decode(normalSuffix); return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix; } diff --git a/src/main/java/org/apache/lucene/russian/morphology/evristics/LemmasFreq.java b/src/main/java/org/apache/lucene/russian/morphology/evristics/LemmasFreq.java deleted file mode 100644 index 24784df..0000000 --- a/src/main/java/org/apache/lucene/russian/morphology/evristics/LemmasFreq.java +++ /dev/null @@ -1,6 +0,0 @@ -package org.apache.lucene.russian.morphology.evristics; - - -public class LemmasFreq { - -} diff --git a/src/main/java/org/apache/lucene/russian/morphology/evristics/RussianSuffixDecoderEncoder.java b/src/main/java/org/apache/lucene/russian/morphology/evristics/RussianSuffixDecoderEncoder.java deleted file mode 100644 index cd837c1..0000000 --- a/src/main/java/org/apache/lucene/russian/morphology/evristics/RussianSuffixDecoderEncoder.java +++ /dev/null @@ -1,60 +0,0 @@ -package org.apache.lucene.russian.morphology.evristics; - - -public class RussianSuffixDecoderEncoder { - public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071; - public static final int SUFFIX_LENGTH = 7; - - - static public Integer encode(String string) { - if (string.length() > 6) throw new RuntimeException("suffix to long"); - int result = 0; - for (int i = 0; i < string.length(); i++) { - int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET; - if (c < 0) { - c = 33; - } - if (c == 34) c = 6; - result = result * 35 + c; - } - return result; - } - - static public String decode(Integer suffixN) { - String result = ""; - while (suffixN > 35) { - result = (char) (suffixN % 35 + RUSSIAN_SMALL_LETTER_OFFSET) + result; - suffixN /= 35; - } - result = (char) (suffixN + RUSSIAN_SMALL_LETTER_OFFSET) + result; - return result; - } - - static public Long encodeLong(String string) { - if (string.length() > 12) throw new RuntimeException("suffix to long"); - long result = 0L; - for (int i = 0; i < string.length(); i++) { - int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET; - if (c < 0) { - c = 33; - } - if (c == 34) c = 6; - result = result * 35L + c; - } - return result; - } - - static public String decodeLong(Long suffixN) { - String result = ""; - while (suffixN > 35) { - long c = suffixN % 35 + RUSSIAN_SMALL_LETTER_OFFSET; - if (c == 33 + RUSSIAN_SMALL_LETTER_OFFSET) c = 45; - result = (char) c + result; - suffixN /= 35; - } - long c = suffixN + RUSSIAN_SMALL_LETTER_OFFSET; - if (c == 33 + RUSSIAN_SMALL_LETTER_OFFSET) c = 45; - result = (char) c + result; - return result; - } -} diff --git a/src/main/java/org/apache/lucene/russian/morphology/evristics/StatiticsCollectors.java b/src/main/java/org/apache/lucene/russian/morphology/evristics/StatiticsCollectors.java index a93249e..e3271ff 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/evristics/StatiticsCollectors.java +++ b/src/main/java/org/apache/lucene/russian/morphology/evristics/StatiticsCollectors.java @@ -2,6 +2,7 @@ package org.apache.lucene.russian.morphology.evristics; import org.apache.lucene.russian.morphology.dictonary.WordProccessor; import org.apache.lucene.russian.morphology.dictonary.WordCard; +import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder; import java.util.Map; import java.util.HashMap; diff --git a/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixCounter.java b/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixCounter.java index 34dd9b0..65462a8 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixCounter.java +++ b/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixCounter.java @@ -1,6 +1,9 @@ package org.apache.lucene.russian.morphology.evristics; - +/** + * Conains information of freqency of suffix evristic + * in dictionary. + */ public class SuffixCounter implements Comparable{ private SuffixEvristic suffixEvristic; private Double amnout = 0.0; diff --git a/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixEvristic.java b/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixEvristic.java index a7a23aa..ab5e1df 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixEvristic.java +++ b/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixEvristic.java @@ -1,6 +1,11 @@ package org.apache.lucene.russian.morphology.evristics; - +/** + * Represent evristic that assume that + * canonical from of word is defined by word suffix. + * It contains to suffixes from given position of + * canonical word form and for form. + */ public class SuffixEvristic { private String formSuffix; private String normalSuffix; diff --git a/src/test/java/org/apache/lucene/AppTest.java b/src/test/java/org/apache/lucene/AppTest.java deleted file mode 100644 index 570f436..0000000 --- a/src/test/java/org/apache/lucene/AppTest.java +++ /dev/null @@ -1,38 +0,0 @@ -package org.apache.lucene; - -import junit.framework.Test; -import junit.framework.TestCase; -import junit.framework.TestSuite; - -/** - * Unit test for simple App. - */ -public class AppTest - extends TestCase -{ - /** - * Create the test case - * - * @param testName name of the test case - */ - public AppTest( String testName ) - { - super( testName ); - } - - /** - * @return the suite of tests being tested - */ - public static Test suite() - { - return new TestSuite( AppTest.class ); - } - - /** - * Rigourous Test :-) - */ - public void testApp() - { - assertTrue( true ); - } -} diff --git a/src/test/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoderTest.java b/src/test/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoderTest.java new file mode 100644 index 0000000..c343d34 --- /dev/null +++ b/src/test/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoderTest.java @@ -0,0 +1,5 @@ +package org.apache.lucene.russian.morphology; + + +public class RussianSuffixDecoderEncoderTest { +}