From fed6cd480a8590343470ac147d6e6726835b8312 Mon Sep 17 00:00:00 2001 From: "alexander.a.kuznetsov" Date: Sat, 3 Oct 2009 10:58:54 +0000 Subject: [PATCH] working on new version git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@51 d817d54c-26ab-11de-abc9-2f7d1455ff7a --- .../apache/lucene/morphology/LuceneMorph.java | 5 ++-- .../org/apache/lucene/morphology/Morph.java | 5 ++-- ...Analayzer.java => MorphlogyAnalayzer.java} | 9 +++--- ...phlogyFilter.java => MorphlogyFilter.java} | 5 ++-- .../morphology/russian/HeuristicBuilder.java | 2 +- ....java => RussianLetterDecoderEncoder.java} | 4 +-- .../lucene/morphology/russian/Test.java | 3 +- .../morphology/russian/AnalayzerTest.java | 11 +++++++ .../RussianLetterDecoderEncoderTest.java | 30 +++++++++++++++++++ .../morphology/russian/RussianMorphTest.java | 11 +++++++ 10 files changed, 69 insertions(+), 16 deletions(-) rename morph/src/main/java/org/apache/lucene/morphology/analayzer/{RussianMorphlogyAnalayzer.java => MorphlogyAnalayzer.java} (76%) rename morph/src/main/java/org/apache/lucene/morphology/analayzer/{RussianMorphlogyFilter.java => MorphlogyFilter.java} (91%) rename russian/src/main/java/org/apache/lucene/morphology/russian/{RussianSuffixDecoderEncoder.java => RussianLetterDecoderEncoder.java} (92%) create mode 100644 russian/src/test/java/org/apache/lucene/morphology/russian/AnalayzerTest.java create mode 100644 russian/src/test/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoderTest.java create mode 100644 russian/src/test/java/org/apache/lucene/morphology/russian/RussianMorphTest.java diff --git a/morph/src/main/java/org/apache/lucene/morphology/LuceneMorph.java b/morph/src/main/java/org/apache/lucene/morphology/LuceneMorph.java index 067d536..493ade3 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/LuceneMorph.java +++ b/morph/src/main/java/org/apache/lucene/morphology/LuceneMorph.java @@ -23,10 +23,9 @@ import java.util.List; public class LuceneMorph extends 
Morph { - LetterDecoderEncoder decoderEncoder; - public LuceneMorph(String fileName) throws IOException { - super(fileName); + public LuceneMorph(String fileName,LetterDecoderEncoder decoderEncoder) throws IOException { + super(fileName,decoderEncoder); } @Override diff --git a/morph/src/main/java/org/apache/lucene/morphology/Morph.java b/morph/src/main/java/org/apache/lucene/morphology/Morph.java index b40d933..3718792 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/Morph.java +++ b/morph/src/main/java/org/apache/lucene/morphology/Morph.java @@ -29,11 +29,12 @@ public class Morph { protected short[] rulesId; protected Heuristic[][] rules; protected String[] grammaInfo; - LetterDecoderEncoder decoderEncoder; + protected LetterDecoderEncoder decoderEncoder; - public Morph(String fileName) throws IOException { + public Morph(String fileName,LetterDecoderEncoder decoderEncoder) throws IOException { readFromFile(fileName); + this.decoderEncoder = decoderEncoder; } public Morph(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) { diff --git a/morph/src/main/java/org/apache/lucene/morphology/analayzer/RussianMorphlogyAnalayzer.java b/morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyAnalayzer.java similarity index 76% rename from morph/src/main/java/org/apache/lucene/morphology/analayzer/RussianMorphlogyAnalayzer.java rename to morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyAnalayzer.java index 67c12ec..296d8cc 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/analayzer/RussianMorphlogyAnalayzer.java +++ b/morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyAnalayzer.java @@ -22,21 +22,22 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.morphology.LuceneMorph; +import org.apache.lucene.morphology.LetterDecoderEncoder; 
import java.io.IOException; import java.io.Reader; -public class RussianMorphlogyAnalayzer extends Analyzer { +public class MorphlogyAnalayzer extends Analyzer { private LuceneMorph luceneMorph; - public RussianMorphlogyAnalayzer() throws IOException { - luceneMorph = new LuceneMorph("sep.txt"); + public MorphlogyAnalayzer(String pathToMorph,LetterDecoderEncoder letterDecoderEncoder) throws IOException { + luceneMorph = new LuceneMorph(pathToMorph,letterDecoderEncoder); /* read the morphology data from the caller-supplied path instead of a fixed file name */ } public TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new StandardTokenizer(reader); result = new StandardFilter(result); result = new LowerCaseFilter(result); - return new RussianMorphlogyFilter(result, luceneMorph); + return new MorphlogyFilter(result, luceneMorph); } } diff --git a/morph/src/main/java/org/apache/lucene/morphology/analayzer/RussianMorphlogyFilter.java b/morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyFilter.java similarity index 91% rename from morph/src/main/java/org/apache/lucene/morphology/analayzer/RussianMorphlogyFilter.java rename to morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyFilter.java index ca3a1a0..4f75ad5 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/analayzer/RussianMorphlogyFilter.java +++ b/morph/src/main/java/org/apache/lucene/morphology/analayzer/MorphlogyFilter.java @@ -26,10 +26,10 @@ import java.util.ArrayList; import java.util.List; -public class RussianMorphlogyFilter extends TokenFilter { +public class MorphlogyFilter extends TokenFilter { private LuceneMorph luceneMorph; - public RussianMorphlogyFilter(TokenStream tokenStream, LuceneMorph luceneMorph) { + public MorphlogyFilter(TokenStream tokenStream, LuceneMorph luceneMorph) { super(tokenStream); this.luceneMorph = luceneMorph; } @@ -54,6 +54,7 @@ public class RussianMorphlogyFilter extends TokenFilter { Token nextToken = input.next(reusableToken); if (nextToken == null) return null; // EOS; iterator exhausted 
Character testC = nextToken.term().charAt(0); + //todo check here for decoder encoder if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC) { return nextToken; } diff --git a/russian/src/main/java/org/apache/lucene/morphology/russian/HeuristicBuilder.java b/russian/src/main/java/org/apache/lucene/morphology/russian/HeuristicBuilder.java index 9ac57d2..74839bb 100644 --- a/russian/src/main/java/org/apache/lucene/morphology/russian/HeuristicBuilder.java +++ b/russian/src/main/java/org/apache/lucene/morphology/russian/HeuristicBuilder.java @@ -31,7 +31,7 @@ public class HeuristicBuilder { GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab"); DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form); - RussianSuffixDecoderEncoder decoderEncoder = new RussianSuffixDecoderEncoder(); + RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder(); StatiticsCollector statiticsCollector = new StatiticsCollector(grammaInfo, decoderEncoder); dictonaryReader.proccess(statiticsCollector); statiticsCollector.saveHeuristic(); diff --git a/russian/src/main/java/org/apache/lucene/morphology/russian/RussianSuffixDecoderEncoder.java b/russian/src/main/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoder.java similarity index 92% rename from russian/src/main/java/org/apache/lucene/morphology/russian/RussianSuffixDecoderEncoder.java rename to russian/src/main/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoder.java index d2f3947..2fd7f9b 100644 --- a/russian/src/main/java/org/apache/lucene/morphology/russian/RussianSuffixDecoderEncoder.java +++ b/russian/src/main/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoder.java @@ -28,7 +28,7 @@ import java.util.ArrayList; * Assumed that suffix contains only small russian letters and dash. * Also assumed that letter е and ё coinsed. 
*/ -public class RussianSuffixDecoderEncoder implements LetterDecoderEncoder { +public class RussianLetterDecoderEncoder implements LetterDecoderEncoder { public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071; static public int SUFFIX_LENGTH = 6; public static final int EE_CHAR = 34; @@ -107,6 +107,6 @@ public class RussianSuffixDecoderEncoder implements LetterDecoderEncoder { } public String cleanString(String s) { - return s.replace((char) (34 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (6 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET)); + return s.replace((char) (34 + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (6 + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET)); } } diff --git a/russian/src/main/java/org/apache/lucene/morphology/russian/Test.java b/russian/src/main/java/org/apache/lucene/morphology/russian/Test.java index ad972a3..a28c7a5 100644 --- a/russian/src/main/java/org/apache/lucene/morphology/russian/Test.java +++ b/russian/src/main/java/org/apache/lucene/morphology/russian/Test.java @@ -35,7 +35,7 @@ public class Test { public static void main(String[] args) throws IOException, ClassNotFoundException { // - Morph splitter = new Morph("sep.txt"); + Morph splitter = new Morph("sep.txt",new RussianLetterDecoderEncoder()); TreeSet shorts = new TreeSet(); int count = 0; TreeMap rulesStat = new TreeMap(); @@ -57,7 +57,6 @@ public class Test { System.out.println(count); System.out.println(rulesStat); System.gc(); - System.out.println("Ready"); System.in.read(); } } diff --git a/russian/src/test/java/org/apache/lucene/morphology/russian/AnalayzerTest.java b/russian/src/test/java/org/apache/lucene/morphology/russian/AnalayzerTest.java new file mode 100644 index 0000000..8fee84e --- /dev/null +++ b/russian/src/test/java/org/apache/lucene/morphology/russian/AnalayzerTest.java @@ -0,0 +1,11 @@ +package org.apache.lucene.morphology.russian; + +/** + * Created by IntelliJ IDEA. 
+ * User: akuznetsov + * Date: 03/10/2009 + * Time: 3:52:43 PM + * To change this template use File | Settings | File Templates. + */ +public class AnalayzerTest { +} diff --git a/russian/src/test/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoderTest.java b/russian/src/test/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoderTest.java new file mode 100644 index 0000000..0479c46 --- /dev/null +++ b/russian/src/test/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoderTest.java @@ -0,0 +1,30 @@ +package org.apache.lucene.morphology.russian; + +import junit.framework.TestCase; + + +public class RussianLetterDecoderEncoderTest extends TestCase { + public void testEncode() { + // Add your code here + } + + public void testEncodeToArray() { + // Add your code here + } + + public void testDecodeArray() { + // Add your code here + } + + public void testDecode() { + // Add your code here + } + + public void testCheckCharacter() { + // Add your code here + } + + public void testCleanString() { + // Add your code here + } +} diff --git a/russian/src/test/java/org/apache/lucene/morphology/russian/RussianMorphTest.java b/russian/src/test/java/org/apache/lucene/morphology/russian/RussianMorphTest.java new file mode 100644 index 0000000..8aad6d8 --- /dev/null +++ b/russian/src/test/java/org/apache/lucene/morphology/russian/RussianMorphTest.java @@ -0,0 +1,11 @@ +package org.apache.lucene.morphology.russian; + +/** + * Created by IntelliJ IDEA. + * User: akuznetsov + * Date: 03/10/2009 + * Time: 3:52:18 PM + * To change this template use File | Settings | File Templates. + */ +public class RussianMorphTest { +}