Adding support of lucene 5.0.0

parent 2b5509c825
commit d300938502

.gitignore (vendored, new file, 3 lines added)
@@ -0,0 +1,3 @@
+target
+.idea
+*.iml
@@ -19,7 +19,6 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.morphology.english.EnglishAnalyzer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.morphology.russian.RussianAnalyzer;
 import org.junit.Test;
 
@@ -66,6 +65,7 @@ public class AnalyzersTest {
         InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
 
         TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader);
+        tokenStream.reset();
         HashSet<String> result = new HashSet<String>();
         while (tokenStream.incrementToken()) {
             CharTermAttribute attribute1 = tokenStream.getAttribute(CharTermAttribute.class);
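Background for the added reset() call: since Lucene 4 the TokenStream consumer contract is enforced, so reset() must be called before the first incrementToken(), and end()/close() afterwards. A minimal consume loop under that contract (a sketch for illustration, not the project's test code):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class TokenStreamContractSketch {
    // Collects the terms an analyzer produces for the given text, following the
    // Lucene 4+ consumer contract: reset -> incrementToken* -> end -> close.
    static List<String> tokens(Analyzer analyzer, String field, String text) throws IOException {
        List<String> terms = new ArrayList<>();
        try (TokenStream stream = analyzer.tokenStream(field, text)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                      // mandatory before incrementToken()
            while (stream.incrementToken()) {
                terms.add(term.toString());
            }
            stream.end();                        // flush trailing state (offsets etc.)
        }                                        // try-with-resources closes the stream
        return terms;
    }
}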
@@ -25,6 +25,7 @@ import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 
 import static org.hamcrest.CoreMatchers.equalTo;
@@ -43,6 +44,8 @@ public class LuceneMorphTest {
     @Test
     public void russianMorphologyShouldGetCorrectNormalForm() throws IOException {
         LuceneMorphology luceneMorph = new RussianLuceneMorphology();
+        List<String> v = luceneMorph.getMorphInfo("вина");
+        System.out.println(v);
         String pathToTestData = "/russian/russian-morphology-test.txt";
         testMorphology(luceneMorph, pathToTestData);
     }
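The two added lines exercise the morphology API directly. For reference, LuceneMorphology (here the RussianLuceneMorphology subclass, assumed to live in org.apache.lucene.morphology.russian alongside RussianAnalyzer above) exposes getMorphInfo(String) for grammatical annotations and getNormalForms(String) for lemmas; a rough usage sketch:

import org.apache.lucene.morphology.LuceneMorphology;
import org.apache.lucene.morphology.russian.RussianLuceneMorphology;

import java.io.IOException;
import java.util.List;

public class MorphologyUsageSketch {
    public static void main(String[] args) throws IOException {
        LuceneMorphology luceneMorph = new RussianLuceneMorphology();
        // Lemmas (normal forms) of an inflected word.
        List<String> normalForms = luceneMorph.getNormalForms("вина");
        // Grammatical info strings, as printed by the test above.
        List<String> morphInfo = luceneMorph.getMorphInfo("вина");
        System.out.println(normalForms);
        System.out.println(morphInfo);
    }
}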
@@ -17,12 +17,17 @@
 package org.apache.lucene.morphology.analyzer;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
+import org.apache.lucene.analysis.payloads.PayloadEncoder;
+import org.apache.lucene.analysis.payloads.PayloadHelper;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.morphology.LetterDecoderEncoder;
 import org.apache.lucene.morphology.LuceneMorphology;
+import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.Version;
 
 import java.io.IOException;
@@ -44,15 +49,39 @@ public class MorphologyAnalyzer extends Analyzer {
         luceneMorph = new LuceneMorphology(inputStream, letterDecoderEncoder);
     }
 
-    final public TokenStream tokenStream(String fieldName, Reader reader) {
-        TokenStream result = new StandardTokenizer(Version.LUCENE_35, reader);
-        result = new StandardFilter(Version.LUCENE_35,result);
-        result = new LowerCaseFilter(Version.LUCENE_35,result);
-        return new MorphologyFilter(result, luceneMorph);
-    }
 
     @Override
-    final public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
-        return super.reusableTokenStream(fieldName, reader);
+    protected TokenStreamComponents createComponents(String s) {
+
+        StandardTokenizer src = new StandardTokenizer();
+        final PayloadEncoder encoder = new PayloadEncoder() {
+            @Override
+            public BytesRef encode(char[] buffer) {
+                final Float payload = Float.valueOf(new String(buffer));
+                System.out.println(payload);
+                final byte[] bytes = PayloadHelper.encodeFloat(payload);
+                return new BytesRef(bytes, 0, bytes.length);
+            }
+
+            @Override
+            public BytesRef encode(char[] buffer, int offset, int length) {
+
+                final Float payload = Float.valueOf(new String(buffer, offset, length));
+                System.out.println(payload);
+                final byte[] bytes = PayloadHelper.encodeFloat(payload);
+
+                return new BytesRef(bytes, 0, bytes.length);
+            }
+        };
+        TokenFilter filter = new StandardFilter(src);
+        filter = new LowerCaseFilter(filter);
+        filter = new MorphologyFilter(filter, luceneMorph);
+
+        return new TokenStreamComponents(src, filter) {
+            @Override
+            protected void setReader(final Reader reader) throws IOException {
+                super.setReader(reader);
+            }
+        };
     }
 }
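This hunk is the heart of the migration. In Lucene 5, Analyzer.tokenStream(...) is final and reusableTokenStream(...) no longer exists; an analyzer overrides createComponents(...) instead and returns a TokenStreamComponents pair of (source tokenizer, end of the filter chain). Tokenizers and filters no longer take Version or Reader constructor arguments (LowerCaseFilter also moved to org.apache.lucene.analysis.core, hence the import change above), and Lucene supplies the Reader later through setReader. Note that the PayloadEncoder built in this hunk is not wired into the returned chain. Reduced to the essentials, the new pattern looks roughly like this sketch (illustrative, not the project's class):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

// Minimal Lucene 5 analyzer skeleton: build the chain once in createComponents,
// let Lucene reuse it per thread and feed the Reader in via setReader.
public class LowercasingAnalyzerSketch extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        StandardTokenizer source = new StandardTokenizer();   // no Version/Reader arguments in 5.x
        TokenStream chain = new StandardFilter(source);       // stack filters on top of the tokenizer
        chain = new LowerCaseFilter(chain);
        return new TokenStreamComponents(source, chain);      // (tokenizer, end of filter chain)
    }
}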
@@ -19,7 +19,6 @@ package org.apache.lucene.morphology.analyzer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.morphology.LuceneMorphology;
 
 import java.io.IOException;
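TermAttribute (deprecated since Lucene 3.1) was removed in Lucene 4, which is why its import is dropped here and in the test above; CharTermAttribute is the replacement for reading and rewriting term text. A generic filter skeleton built on it (an illustrative sketch, not the actual MorphologyFilter) looks roughly like:

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;
import java.util.Locale;

// Upper-cases each term in place, just to show the CharTermAttribute read/modify pattern.
public final class UpperCaseSketchFilter extends TokenFilter {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    public UpperCaseSketchFilter(TokenStream input) {
        super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false;                  // no more tokens from the upstream stream
        }
        String upper = termAtt.toString().toUpperCase(Locale.ROOT);
        termAtt.setEmpty().append(upper);  // overwrite the term text
        return true;
    }
}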
pom.xml (9 lines changed)
@@ -49,8 +49,15 @@
         <dependency>
             <groupId>org.apache.lucene</groupId>
             <artifactId>lucene-core</artifactId>
-            <version>3.5.0</version>
+            <version>5.0.0</version>
         </dependency>
+
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-analyzers-common</artifactId>
+            <version>5.0.0</version>
+        </dependency>
+
     </dependencies>
 
     <repositories>
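On the pom change: from Lucene 4 onward the concrete analysis components used above (StandardTokenizer/StandardFilter, the analysis.core and analysis.payloads packages, the per-language analyzers) ship in the separate lucene-analyzers-common artifact, while lucene-core keeps only the analysis framework, so both dependencies are needed. A tiny classpath smoke check (illustrative only, not part of the commit):

import org.apache.lucene.analysis.core.WhitespaceTokenizer;           // from lucene-analyzers-common
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;  // from lucene-core
import org.apache.lucene.util.Version;                                // from lucene-core

// Compiles and runs only when both lucene-core and lucene-analyzers-common 5.x are on the classpath.
public class LuceneClasspathCheck {
    public static void main(String[] args) {
        System.out.println(Version.LATEST);                        // Lucene version actually resolved
        System.out.println(WhitespaceTokenizer.class.getName());   // class from lucene-analyzers-common
        System.out.println(CharTermAttribute.class.getName());     // class from lucene-core
    }
}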