diff --git a/morph/src/main/java/org/apache/lucene/morphology/LuceneMorph.java b/morph/src/main/java/org/apache/lucene/morphology/LuceneMorph.java index 067d536..493ade3 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/LuceneMorph.java +++ b/morph/src/main/java/org/apache/lucene/morphology/LuceneMorph.java @@ -23,10 +23,9 @@ import java.util.List; public class LuceneMorph extends Morph { - LetterDecoderEncoder decoderEncoder; - public LuceneMorph(String fileName) throws IOException { - super(fileName); + public LuceneMorph(String fileName,LetterDecoderEncoder decoderEncoder) throws IOException { + super(fileName,decoderEncoder); } @Override diff --git a/morph/src/main/java/org/apache/lucene/morphology/Morph.java b/morph/src/main/java/org/apache/lucene/morphology/Morph.java index b40d933..3718792 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/Morph.java +++ b/morph/src/main/java/org/apache/lucene/morphology/Morph.java @@ -29,11 +29,12 @@ public class Morph { protected short[] rulesId; protected Heuristic[][] rules; protected String[] grammaInfo; - LetterDecoderEncoder decoderEncoder; + protected LetterDecoderEncoder decoderEncoder; - public Morph(String fileName) throws IOException { + public Morph(String fileName,LetterDecoderEncoder decoderEncoder) throws IOException { readFromFile(fileName); + this.decoderEncoder = decoderEncoder; } public Morph(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) { diff --git a/morph/src/main/java/org/apache/lucene/morphology/analayzer/RussianMorphlogyAnalayzer.java b/morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyAnalayzer.java similarity index 76% rename from morph/src/main/java/org/apache/lucene/morphology/analayzer/RussianMorphlogyAnalayzer.java rename to morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyAnalayzer.java index 67c12ec..296d8cc 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/analayzer/RussianMorphlogyAnalayzer.java +++ b/morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyAnalayzer.java @@ -22,21 +22,22 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.morphology.LuceneMorph; +import org.apache.lucene.morphology.LetterDecoderEncoder; import java.io.IOException; import java.io.Reader; -public class RussianMorphlogyAnalayzer extends Analyzer { +public class MorphlogyAnalayzer extends Analyzer { private LuceneMorph luceneMorph; - public RussianMorphlogyAnalayzer() throws IOException { - luceneMorph = new LuceneMorph("sep.txt"); + public MorphlogyAnalayzer(String pathToMorph,LetterDecoderEncoder letterDecoderEncoder) throws IOException { + luceneMorph = new LuceneMorph("sep.txt",letterDecoderEncoder); } public TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new StandardTokenizer(reader); result = new StandardFilter(result); result = new LowerCaseFilter(result); - return new RussianMorphlogyFilter(result, luceneMorph); + return new MorphlogyFilter(result, luceneMorph); } } diff --git a/morph/src/main/java/org/apache/lucene/morphology/analayzer/RussianMorphlogyFilter.java b/morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyFilter.java similarity index 91% rename from morph/src/main/java/org/apache/lucene/morphology/analayzer/RussianMorphlogyFilter.java rename to morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyFilter.java index ca3a1a0..4f75ad5 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/analayzer/RussianMorphlogyFilter.java +++ b/morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyFilter.java @@ -26,10 +26,10 @@ import java.util.ArrayList; import java.util.List; -public class RussianMorphlogyFilter extends TokenFilter { +public class MorphlogyFilter extends TokenFilter { private LuceneMorph luceneMorph; - public RussianMorphlogyFilter(TokenStream tokenStream, LuceneMorph luceneMorph) { + public MorphlogyFilter(TokenStream tokenStream, LuceneMorph luceneMorph) { super(tokenStream); this.luceneMorph = luceneMorph; } @@ -54,6 +54,7 @@ public class RussianMorphlogyFilter extends TokenFilter { Token nextToken = input.next(reusableToken); if (nextToken == null) return null; // EOS; iterator exhausted Character testC = nextToken.term().charAt(0); + //todo check here for decoder endocoder if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC) { return nextToken; } diff --git a/russian/src/main/java/org/apache/lucene/morphology/russian/HeuristicBuilder.java b/russian/src/main/java/org/apache/lucene/morphology/russian/HeuristicBuilder.java index 9ac57d2..74839bb 100644 --- a/russian/src/main/java/org/apache/lucene/morphology/russian/HeuristicBuilder.java +++ b/russian/src/main/java/org/apache/lucene/morphology/russian/HeuristicBuilder.java @@ -31,7 +31,7 @@ public class HeuristicBuilder { GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab"); DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form); - RussianSuffixDecoderEncoder decoderEncoder = new RussianSuffixDecoderEncoder(); + RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder(); StatiticsCollector statiticsCollector = new StatiticsCollector(grammaInfo, decoderEncoder); dictonaryReader.proccess(statiticsCollector); statiticsCollector.saveHeuristic(); diff --git a/russian/src/main/java/org/apache/lucene/morphology/russian/RussianSuffixDecoderEncoder.java b/russian/src/main/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoder.java similarity index 92% rename from russian/src/main/java/org/apache/lucene/morphology/russian/RussianSuffixDecoderEncoder.java rename to russian/src/main/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoder.java index d2f3947..2fd7f9b 100644 --- a/russian/src/main/java/org/apache/lucene/morphology/russian/RussianSuffixDecoderEncoder.java +++ b/russian/src/main/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoder.java @@ -28,7 +28,7 @@ import java.util.ArrayList; * Assumed that suffix contains only small russian letters and dash. * Also assumed that letter � and � coinsed. */ -public class RussianSuffixDecoderEncoder implements LetterDecoderEncoder { +public class RussianLetterDecoderEncoder implements LetterDecoderEncoder { public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071; static public int SUFFIX_LENGTH = 6; public static final int EE_CHAR = 34; @@ -107,6 +107,6 @@ public class RussianSuffixDecoderEncoder implements LetterDecoderEncoder { } public String cleanString(String s) { - return s.replace((char) (34 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (6 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET)); + return s.replace((char) (34 + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (6 + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET)); } } diff --git a/russian/src/main/java/org/apache/lucene/morphology/russian/Test.java b/russian/src/main/java/org/apache/lucene/morphology/russian/Test.java index ad972a3..a28c7a5 100644 --- a/russian/src/main/java/org/apache/lucene/morphology/russian/Test.java +++ b/russian/src/main/java/org/apache/lucene/morphology/russian/Test.java @@ -35,7 +35,7 @@ public class Test { public static void main(String[] args) throws IOException, ClassNotFoundException { // - Morph splitter = new Morph("sep.txt"); + Morph splitter = new Morph("sep.txt",new RussianLetterDecoderEncoder()); TreeSet shorts = new TreeSet(); int count = 0; TreeMap rulesStat = new TreeMap(); @@ -57,7 +57,6 @@ public class Test { System.out.println(count); System.out.println(rulesStat); System.gc(); - System.out.println("Ready"); System.in.read(); } } diff --git a/russian/src/test/java/org/apache/lucene/morphology/russian/AnalayzerTest.java b/russian/src/test/java/org/apache/lucene/morphology/russian/AnalayzerTest.java new file mode 100644 index 0000000..8fee84e --- /dev/null +++ b/russian/src/test/java/org/apache/lucene/morphology/russian/AnalayzerTest.java @@ -0,0 +1,11 @@ +package org.apache.lucene.morphology.russian; + +/** + * Created by IntelliJ IDEA. + * User: akuznetsov + * Date: 03/10/2009 + * Time: 3:52:43 PM + * To change this template use File | Settings | File Templates. + */ +public class AnalayzerTest { +} diff --git a/russian/src/test/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoderTest.java b/russian/src/test/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoderTest.java new file mode 100644 index 0000000..0479c46 --- /dev/null +++ b/russian/src/test/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoderTest.java @@ -0,0 +1,30 @@ +package org.apache.lucene.morphology.russian; + +import junit.framework.TestCase; + + +public class RussianLetterDecoderEncoderTest extends TestCase { + public void testEncode() { + // Add your code here + } + + public void testEncodeToArray() { + // Add your code here + } + + public void testDecodeArray() { + // Add your code here + } + + public void testDecode() { + // Add your code here + } + + public void testCheckCharacter() { + // Add your code here + } + + public void testCleanString() { + // Add your code here + } +} diff --git a/russian/src/test/java/org/apache/lucene/morphology/russian/RussianMorphTest.java b/russian/src/test/java/org/apache/lucene/morphology/russian/RussianMorphTest.java new file mode 100644 index 0000000..8aad6d8 --- /dev/null +++ b/russian/src/test/java/org/apache/lucene/morphology/russian/RussianMorphTest.java @@ -0,0 +1,11 @@ +package org.apache.lucene.morphology.russian; + +/** + * Created by IntelliJ IDEA. + * User: akuznetsov + * Date: 03/10/2009 + * Time: 3:52:18 PM + * To change this template use File | Settings | File Templates. + */ +public class RussianMorphTest { +}