Adding support for Lucene 5.0.0
parent 2b5509c825
commit d300938502
.gitignore (vendored, new file, 3 additions)
@@ -0,0 +1,3 @@
+target
+.idea
+*.iml
@@ -19,7 +19,6 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.morphology.english.EnglishAnalyzer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.morphology.russian.RussianAnalyzer;
 import org.junit.Test;

@@ -66,6 +65,7 @@ public class AnalyzersTest {
         InputStreamReader reader = new InputStreamReader(stream, "UTF-8");

         TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader);
+        tokenStream.reset();
         HashSet<String> result = new HashSet<String>();
         while (tokenStream.incrementToken()) {
             CharTermAttribute attribute1 = tokenStream.getAttribute(CharTermAttribute.class);
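Note on the added tokenStream.reset() call: since Lucene 4.x the TokenStream consumer contract requires reset() before the first incrementToken(), followed by end() and close(). A minimal sketch of that workflow is below; the helper class, field name, and sample text are illustrative only and not part of this commit.

    import java.io.IOException;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    // Illustrative helper (not from the commit): prints every term an analyzer emits for a text.
    class TokenStreamContractSketch {
        static void printTerms(Analyzer analyzer, String text) throws IOException {
            try (TokenStream ts = analyzer.tokenStream("text", text)) {
                CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
                ts.reset();                     // required before the first incrementToken()
                while (ts.incrementToken()) {
                    System.out.println(term.toString());
                }
                ts.end();                       // record the final state (offsets, etc.)
            }                                   // close() happens via try-with-resources
        }
    }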
@@ -25,6 +25,7 @@ import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.util.Arrays;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;

 import static org.hamcrest.CoreMatchers.equalTo;
@@ -43,6 +44,8 @@ public class LuceneMorphTest {
     @Test
     public void russianMorphologyShouldGetCorrectNormalForm() throws IOException {
         LuceneMorphology luceneMorph = new RussianLuceneMorphology();
+        List<String> v = luceneMorph.getMorphInfo("вина");
+        System.out.println(v);
         String pathToTestData = "/russian/russian-morphology-test.txt";
         testMorphology(luceneMorph, pathToTestData);
     }
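For context, the two debug lines added above query LuceneMorphology directly. A small lookup sketch follows; getMorphInfo appears in this commit, while getNormalForms is assumed from the library's public API (the test name refers to normal forms), so treat that call as an assumption rather than a confirmed signature.

    import java.io.IOException;
    import java.util.List;

    import org.apache.lucene.morphology.LuceneMorphology;
    import org.apache.lucene.morphology.russian.RussianLuceneMorphology;

    class MorphologyLookupSketch {
        public static void main(String[] args) throws IOException {
            LuceneMorphology luceneMorph = new RussianLuceneMorphology();

            // Every grammatical parse of the surface form "вина", as printed by the test above.
            List<String> morphInfo = luceneMorph.getMorphInfo("вина");
            System.out.println(morphInfo);

            // Base (dictionary) forms of the same word; assumed API, see the note above.
            List<String> normalForms = luceneMorph.getNormalForms("вина");
            System.out.println(normalForms);
        }
    }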
@@ -17,12 +17,17 @@
 package org.apache.lucene.morphology.analyzer;

 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.LowerCaseFilter;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
+import org.apache.lucene.analysis.payloads.PayloadEncoder;
+import org.apache.lucene.analysis.payloads.PayloadHelper;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.morphology.LetterDecoderEncoder;
 import org.apache.lucene.morphology.LuceneMorphology;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.Version;

 import java.io.IOException;
@@ -44,15 +49,39 @@ public class MorphologyAnalyzer extends Analyzer {
         luceneMorph = new LuceneMorphology(inputStream, letterDecoderEncoder);
     }

-    final public TokenStream tokenStream(String fieldName, Reader reader) {
-        TokenStream result = new StandardTokenizer(Version.LUCENE_35, reader);
-        result = new StandardFilter(Version.LUCENE_35,result);
-        result = new LowerCaseFilter(Version.LUCENE_35,result);
-        return new MorphologyFilter(result, luceneMorph);
-    }
-
-    @Override
-    final public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
-        return super.reusableTokenStream(fieldName, reader);
+
+    @Override
+    protected TokenStreamComponents createComponents(String s) {
+
+        StandardTokenizer src = new StandardTokenizer();
+        final PayloadEncoder encoder = new PayloadEncoder() {
+            @Override
+            public BytesRef encode(char[] buffer) {
+                final Float payload = Float.valueOf(new String(buffer));
+                System.out.println(payload);
+                final byte[] bytes = PayloadHelper.encodeFloat(payload);
+                return new BytesRef(bytes, 0, bytes.length);
+            }
+
+            @Override
+            public BytesRef encode(char[] buffer, int offset, int length) {
+
+                final Float payload = Float.valueOf(new String(buffer, offset, length));
+                System.out.println(payload);
+                final byte[] bytes = PayloadHelper.encodeFloat(payload);
+
+                return new BytesRef(bytes, 0, bytes.length);
+            }
+        };
+        TokenFilter filter = new StandardFilter(src);
+        filter = new LowerCaseFilter(filter);
+        filter = new MorphologyFilter(filter, luceneMorph);
+
+        return new TokenStreamComponents(src, filter) {
+            @Override
+            protected void setReader(final Reader reader) throws IOException {
+                super.setReader(reader);
+            }
+        };
     }
 }
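With createComponents in place, the analyzer behaves like any other Lucene 5 Analyzer and can be passed straight to an IndexWriterConfig. A rough indexing sketch under that assumption is below; it uses the RussianAnalyzer subclass imported by the tests and assumes it still has a no-argument constructor. The field name and document text are placeholders.

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.morphology.russian.RussianAnalyzer;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.RAMDirectory;

    class MorphologyIndexingSketch {
        public static void main(String[] args) throws Exception {
            // In Lucene 5 the config no longer takes a Version argument; the analyzer alone is enough.
            IndexWriterConfig config = new IndexWriterConfig(new RussianAnalyzer());

            try (Directory dir = new RAMDirectory();
                 IndexWriter writer = new IndexWriter(dir, config)) {
                Document doc = new Document();
                doc.add(new TextField("body", "пример текста для индексации", Field.Store.YES));
                writer.addDocument(doc);    // terms pass through the morphology analysis chain
            }
        }
    }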
@@ -19,7 +19,6 @@ package org.apache.lucene.morphology.analyzer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.morphology.LuceneMorphology;

 import java.io.IOException;
pom.xml (9 changed lines)
@@ -49,8 +49,15 @@
         <dependency>
             <groupId>org.apache.lucene</groupId>
             <artifactId>lucene-core</artifactId>
-            <version>3.5.0</version>
+            <version>5.0.0</version>
         </dependency>
+
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-analyzers-common</artifactId>
+            <version>5.0.0</version>
+        </dependency>
+
     </dependencies>

     <repositories>