working on new version
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@51 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
parent
ca1a9be6b7
commit
fed6cd480a
@ -23,10 +23,9 @@ import java.util.List;
|
|||||||
|
|
||||||
|
|
||||||
public class LuceneMorph extends Morph {
|
public class LuceneMorph extends Morph {
|
||||||
LetterDecoderEncoder decoderEncoder;
|
|
||||||
|
|
||||||
public LuceneMorph(String fileName) throws IOException {
|
public LuceneMorph(String fileName,LetterDecoderEncoder decoderEncoder) throws IOException {
|
||||||
super(fileName);
|
super(fileName,decoderEncoder);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -29,11 +29,12 @@ public class Morph {
|
|||||||
protected short[] rulesId;
|
protected short[] rulesId;
|
||||||
protected Heuristic[][] rules;
|
protected Heuristic[][] rules;
|
||||||
protected String[] grammaInfo;
|
protected String[] grammaInfo;
|
||||||
LetterDecoderEncoder decoderEncoder;
|
protected LetterDecoderEncoder decoderEncoder;
|
||||||
|
|
||||||
|
|
||||||
public Morph(String fileName) throws IOException {
|
public Morph(String fileName,LetterDecoderEncoder decoderEncoder) throws IOException {
|
||||||
readFromFile(fileName);
|
readFromFile(fileName);
|
||||||
|
this.decoderEncoder = decoderEncoder;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Morph(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) {
|
public Morph(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) {
|
||||||
|
@ -22,21 +22,22 @@ import org.apache.lucene.analysis.TokenStream;
|
|||||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
import org.apache.lucene.morphology.LuceneMorph;
|
import org.apache.lucene.morphology.LuceneMorph;
|
||||||
|
import org.apache.lucene.morphology.LetterDecoderEncoder;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
|
||||||
public class RussianMorphlogyAnalayzer extends Analyzer {
|
public class MorphlogyAnalayzer extends Analyzer {
|
||||||
private LuceneMorph luceneMorph;
|
private LuceneMorph luceneMorph;
|
||||||
|
|
||||||
public RussianMorphlogyAnalayzer() throws IOException {
|
public MorphlogyAnalayzer(String pathToMorph,LetterDecoderEncoder letterDecoderEncoder) throws IOException {
|
||||||
luceneMorph = new LuceneMorph("sep.txt");
|
luceneMorph = new LuceneMorph("sep.txt",letterDecoderEncoder);
|
||||||
}
|
}
|
||||||
|
|
||||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||||
TokenStream result = new StandardTokenizer(reader);
|
TokenStream result = new StandardTokenizer(reader);
|
||||||
result = new StandardFilter(result);
|
result = new StandardFilter(result);
|
||||||
result = new LowerCaseFilter(result);
|
result = new LowerCaseFilter(result);
|
||||||
return new RussianMorphlogyFilter(result, luceneMorph);
|
return new MorphlogyFilter(result, luceneMorph);
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -26,10 +26,10 @@ import java.util.ArrayList;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
|
||||||
public class RussianMorphlogyFilter extends TokenFilter {
|
public class MorphlogyFilter extends TokenFilter {
|
||||||
private LuceneMorph luceneMorph;
|
private LuceneMorph luceneMorph;
|
||||||
|
|
||||||
public RussianMorphlogyFilter(TokenStream tokenStream, LuceneMorph luceneMorph) {
|
public MorphlogyFilter(TokenStream tokenStream, LuceneMorph luceneMorph) {
|
||||||
super(tokenStream);
|
super(tokenStream);
|
||||||
this.luceneMorph = luceneMorph;
|
this.luceneMorph = luceneMorph;
|
||||||
}
|
}
|
||||||
@ -54,6 +54,7 @@ public class RussianMorphlogyFilter extends TokenFilter {
|
|||||||
Token nextToken = input.next(reusableToken);
|
Token nextToken = input.next(reusableToken);
|
||||||
if (nextToken == null) return null; // EOS; iterator exhausted
|
if (nextToken == null) return null; // EOS; iterator exhausted
|
||||||
Character testC = nextToken.term().charAt(0);
|
Character testC = nextToken.term().charAt(0);
|
||||||
|
//todo check here for decoder encoder
|
||||||
if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC) {
|
if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC) {
|
||||||
return nextToken;
|
return nextToken;
|
||||||
}
|
}
|
@ -31,7 +31,7 @@ public class HeuristicBuilder {
|
|||||||
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
|
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
|
||||||
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form);
|
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form);
|
||||||
|
|
||||||
RussianSuffixDecoderEncoder decoderEncoder = new RussianSuffixDecoderEncoder();
|
RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
|
||||||
StatiticsCollector statiticsCollector = new StatiticsCollector(grammaInfo, decoderEncoder);
|
StatiticsCollector statiticsCollector = new StatiticsCollector(grammaInfo, decoderEncoder);
|
||||||
dictonaryReader.proccess(statiticsCollector);
|
dictonaryReader.proccess(statiticsCollector);
|
||||||
statiticsCollector.saveHeuristic();
|
statiticsCollector.saveHeuristic();
|
||||||
|
@ -28,7 +28,7 @@ import java.util.ArrayList;
|
|||||||
* It is assumed that the suffix contains only lowercase Russian letters and the dash.
|
* It is assumed that the suffix contains only lowercase Russian letters and the dash.
|
||||||
* Also assumed that letter е and ё coincide.
|
* Also assumed that letter е and ё coincide.
|
||||||
*/
|
*/
|
||||||
public class RussianSuffixDecoderEncoder implements LetterDecoderEncoder {
|
public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
|
||||||
public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
|
public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
|
||||||
static public int SUFFIX_LENGTH = 6;
|
static public int SUFFIX_LENGTH = 6;
|
||||||
public static final int EE_CHAR = 34;
|
public static final int EE_CHAR = 34;
|
||||||
@ -107,6 +107,6 @@ public class RussianSuffixDecoderEncoder implements LetterDecoderEncoder {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public String cleanString(String s) {
|
public String cleanString(String s) {
|
||||||
return s.replace((char) (34 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (6 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET));
|
return s.replace((char) (34 + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (6 + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET));
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -35,7 +35,7 @@ public class Test {
|
|||||||
|
|
||||||
public static void main(String[] args) throws IOException, ClassNotFoundException {
|
public static void main(String[] args) throws IOException, ClassNotFoundException {
|
||||||
//
|
//
|
||||||
Morph splitter = new Morph("sep.txt");
|
Morph splitter = new Morph("sep.txt",new RussianLetterDecoderEncoder());
|
||||||
TreeSet<Short> shorts = new TreeSet<Short>();
|
TreeSet<Short> shorts = new TreeSet<Short>();
|
||||||
int count = 0;
|
int count = 0;
|
||||||
TreeMap<Integer, Integer> rulesStat = new TreeMap<Integer, Integer>();
|
TreeMap<Integer, Integer> rulesStat = new TreeMap<Integer, Integer>();
|
||||||
@ -57,7 +57,6 @@ public class Test {
|
|||||||
System.out.println(count);
|
System.out.println(count);
|
||||||
System.out.println(rulesStat);
|
System.out.println(rulesStat);
|
||||||
System.gc();
|
System.gc();
|
||||||
System.out.println("Ready");
|
|
||||||
System.in.read();
|
System.in.read();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,11 @@
|
|||||||
|
package org.apache.lucene.morphology.russian;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: akuznetsov
|
||||||
|
* Date: 03/10/2009
|
||||||
|
* Time: 3:52:43 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class AnalayzerTest {
|
||||||
|
}
|
@ -0,0 +1,30 @@
|
|||||||
|
package org.apache.lucene.morphology.russian;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
|
||||||
|
public class RussianLetterDecoderEncoderTest extends TestCase {
|
||||||
|
public void testEncode() {
|
||||||
|
// Add your code here
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testEncodeToArray() {
|
||||||
|
// Add your code here
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testDecodeArray() {
|
||||||
|
// Add your code here
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testDecode() {
|
||||||
|
// Add your code here
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testCheckCharacter() {
|
||||||
|
// Add your code here
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testCleanString() {
|
||||||
|
// Add your code here
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,11 @@
|
|||||||
|
package org.apache.lucene.morphology.russian;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: akuznetsov
|
||||||
|
* Date: 03/10/2009
|
||||||
|
* Time: 3:52:18 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class RussianMorphTest {
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user