working on new version
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@51 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
@ -23,10 +23,9 @@ import java.util.List;
|
||||
|
||||
|
||||
public class LuceneMorph extends Morph {
|
||||
LetterDecoderEncoder decoderEncoder;
|
||||
|
||||
public LuceneMorph(String fileName) throws IOException {
|
||||
super(fileName);
|
||||
public LuceneMorph(String fileName,LetterDecoderEncoder decoderEncoder) throws IOException {
|
||||
super(fileName,decoderEncoder);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -29,11 +29,12 @@ public class Morph {
|
||||
protected short[] rulesId;
|
||||
protected Heuristic[][] rules;
|
||||
protected String[] grammaInfo;
|
||||
LetterDecoderEncoder decoderEncoder;
|
||||
protected LetterDecoderEncoder decoderEncoder;
|
||||
|
||||
|
||||
public Morph(String fileName) throws IOException {
|
||||
public Morph(String fileName,LetterDecoderEncoder decoderEncoder) throws IOException {
|
||||
readFromFile(fileName);
|
||||
this.decoderEncoder = decoderEncoder;
|
||||
}
|
||||
|
||||
public Morph(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) {
|
||||
|
@ -22,21 +22,22 @@ import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.morphology.LuceneMorph;
|
||||
import org.apache.lucene.morphology.LetterDecoderEncoder;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
public class RussianMorphlogyAnalayzer extends Analyzer {
|
||||
public class MorphlogyAnalayzer extends Analyzer {
|
||||
private LuceneMorph luceneMorph;
|
||||
|
||||
public RussianMorphlogyAnalayzer() throws IOException {
|
||||
luceneMorph = new LuceneMorph("sep.txt");
|
||||
public MorphlogyAnalayzer(String pathToMorph,LetterDecoderEncoder letterDecoderEncoder) throws IOException {
|
||||
luceneMorph = new LuceneMorph("sep.txt",letterDecoderEncoder);
|
||||
}
|
||||
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream result = new StandardTokenizer(reader);
|
||||
result = new StandardFilter(result);
|
||||
result = new LowerCaseFilter(result);
|
||||
return new RussianMorphlogyFilter(result, luceneMorph);
|
||||
return new MorphlogyFilter(result, luceneMorph);
|
||||
}
|
||||
}
|
@ -26,10 +26,10 @@ import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class RussianMorphlogyFilter extends TokenFilter {
|
||||
public class MorphlogyFilter extends TokenFilter {
|
||||
private LuceneMorph luceneMorph;
|
||||
|
||||
public RussianMorphlogyFilter(TokenStream tokenStream, LuceneMorph luceneMorph) {
|
||||
public MorphlogyFilter(TokenStream tokenStream, LuceneMorph luceneMorph) {
|
||||
super(tokenStream);
|
||||
this.luceneMorph = luceneMorph;
|
||||
}
|
||||
@ -54,6 +54,7 @@ public class RussianMorphlogyFilter extends TokenFilter {
|
||||
Token nextToken = input.next(reusableToken);
|
||||
if (nextToken == null) return null; // EOS; iterator exhausted
|
||||
Character testC = nextToken.term().charAt(0);
|
||||
//todo check here for decoder encoder
|
||||
if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC) {
|
||||
return nextToken;
|
||||
}
|
Reference in New Issue
Block a user