moving to lucene 3.0.0
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@92 d817d54c-26ab-11de-abc9-2f7d1455ff7a
parent 36012f2943
commit ebc367f16c
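The change drops the Lucene 2.4 Token next(Token) loop in favour of the 3.0 attribute-based TokenStream API used throughout the hunks below. A minimal consumption sketch, not part of the patch (class and method names are illustrative; any Analyzer, such as the project's MorphlogyAnalayzer, can be passed in):

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class Lucene30ConsumptionSketch {
    // Prints every term the analyzer produces for the given text.
    // In Lucene 3.0 a stream is advanced with incrementToken() and the
    // current term is read from its TermAttribute.
    static void printTerms(Analyzer analyzer, String text) throws IOException {
        Reader reader = new StringReader(text);
        TokenStream stream = analyzer.tokenStream("field", reader); // field name is arbitrary here; the tests pass null
        TermAttribute term = stream.getAttribute(TermAttribute.class);
        while (stream.incrementToken()) {
            System.out.println(term.term());
        }
        stream.close();
    }
}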
@@ -15,8 +15,8 @@
  */
 package org.apache.lucene.morphology.english;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import static org.hamcrest.Matchers.equalTo;
 import static org.junit.Assert.assertThat;
 import org.junit.Test;
@@ -43,19 +43,12 @@ public class EnglishAnalayzerTest {
         stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/englsih-analayzer-data.txt");
 
         InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
-        final Token reusableToken = new Token();
-
-        Token nextToken;
-        TokenStream in = morphlogyAnalayzer.tokenStream(null, reader);
+        TokenStream tokenStream = morphlogyAnalayzer.tokenStream(null, reader);
         HashSet<String> result = new HashSet<String>();
-        for (; ;) {
-            nextToken = in.next(reusableToken);
-
-            if (nextToken == null) {
-                break;
-            }
-
-            result.add(nextToken.term());
+        while (tokenStream.incrementToken()) {
+            TermAttribute attribute1 = tokenStream.getAttribute(TermAttribute.class);
+            result.add(attribute1.term());
         }
 
         stream.close();
@@ -1,5 +1,6 @@
 <?xml version="1.0"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
     <parent>
         <artifactId>morphology</artifactId>
         <groupId>org.apache.lucene.morphology</groupId>
@@ -13,11 +14,6 @@
     <url>http://maven.apache.org</url>
 
     <dependencies>
-        <dependency>
-            <groupId>junit</groupId>
-            <artifactId>junit</artifactId>
-            <version>3.8.1</version>
-            <scope>test</scope>
-        </dependency>
 
     </dependencies>
 </project>
@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.morphology.LetterDecoderEncoder;
 import org.apache.lucene.morphology.LuceneMorphology;
+import org.apache.lucene.util.Version;
 
 import java.io.IOException;
 import java.io.InputStream;
@@ -44,7 +45,7 @@ public class MorphlogyAnalayzer extends Analyzer {
     }
 
     public TokenStream tokenStream(String fieldName, Reader reader) {
-        TokenStream result = new StandardTokenizer(reader);
+        TokenStream result = new StandardTokenizer(Version.LUCENE_30, reader);
         result = new StandardFilter(result);
         result = new LowerCaseFilter(result);
         return new MorphlogyFilter(result, luceneMorph);
@@ -16,67 +16,43 @@
 
 package org.apache.lucene.morphology.analayzer;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.morphology.LuceneMorphology;
 
 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
+import java.util.Iterator;
 
 
 public class MorphlogyFilter extends TokenFilter {
     private LuceneMorphology luceneMorph;
+    private Iterator<String> iterator;
+    private TermAttribute termAtt;
 
     public MorphlogyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) {
         super(tokenStream);
         this.luceneMorph = luceneMorph;
+        termAtt = addAttribute(TermAttribute.class);
     }
 
-
-    private List<String> stack = new ArrayList<String>();
-    private int index = 0;
-    private Token current = null;
-
-    /**
-     * Returns the next token in the stream, or null at EOS.
-     */
-    public Token next(final Token reusableToken) throws IOException {
-        assert reusableToken != null;
-        while (index < stack.size()) { // pop from stack
-            Token nextToken = createToken(stack.get(index++), current, reusableToken);
-            if (nextToken != null) {
-                return nextToken;
-            }
-        }
-
-        Token nextToken = input.next(reusableToken);
-        if (nextToken == null) return null; // EOS; iterator exhausted
-        if (!luceneMorph.checkString(nextToken.term())) {
-            return nextToken;
-        }
-        stack = luceneMorph.getNormalForms(nextToken.term());
-        index = 0;
-        current = (Token) nextToken.clone();
-        nextToken = createToken(stack.get(index++), current, reusableToken);
-        return nextToken;
-    }
-
-    /**
-     * Creates and returns a token for the given synonym of the current input
-     * token; Override for custom (stateless or stateful) behavior, if desired.
-     *
-     * @param synonym a synonym for the current token's term
-     * @param current the current token from the underlying child stream
-     * @param reusableToken the token to reuse
-     * @return a new token, or null to indicate that the given synonym should be
-     *         ignored
-     */
-    protected Token createToken(String synonym, Token current, final Token reusableToken) {
-        reusableToken.reinit(current, synonym);
-        reusableToken.setTermBuffer(synonym);
-        reusableToken.setPositionIncrement(0);
-        return reusableToken;
-    }
+    public boolean incrementToken() throws IOException {
+        while (iterator == null || !iterator.hasNext()) {
+            boolean b = input.incrementToken();
+            if (!b) {
+                return false;
+            }
+            String s = termAtt.term();
+            if (luceneMorph.checkString(s)) {
+                iterator = luceneMorph.getNormalForms(termAtt.term()).iterator();
+            } else {
+                return true;
+            }
+        }
+        String s = iterator.next();
+        termAtt.setTermBuffer(s);
+        return true;
+    }
 }
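The rewritten filter keeps an Iterator over the normal forms of the current input term and emits one form per incrementToken() call, instead of popping a stack of cloned Tokens. A small usage sketch of the new chain, illustrative only: it mirrors MorphlogyAnalayzer.tokenStream() and assumes a ready LuceneMorphology instance, whose construction is not part of this diff.

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.morphology.LuceneMorphology;
import org.apache.lucene.morphology.analayzer.MorphlogyFilter;
import org.apache.lucene.util.Version;

public class MorphlogyFilterSketch {
    // Builds the same tokenizer/filter chain as MorphlogyAnalayzer and prints
    // every emitted term; for words the morphology dictionary recognises,
    // MorphlogyFilter emits their normal forms.
    static void demo(LuceneMorphology luceneMorph, String text) throws Exception {
        Reader reader = new StringReader(text);
        TokenStream stream = new StandardTokenizer(Version.LUCENE_30, reader);
        stream = new StandardFilter(stream);
        stream = new LowerCaseFilter(stream);
        stream = new MorphlogyFilter(stream, luceneMorph);
        while (stream.incrementToken()) {
            System.out.println(stream.getAttribute(TermAttribute.class).term());
        }
        stream.close();
    }
}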
pom.xml
@@ -1,5 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
     <modelVersion>4.0.0</modelVersion>
     <groupId>org.apache.lucene.morphology</groupId>
     <artifactId>morphology</artifactId>
@@ -49,7 +50,7 @@
         <dependency>
             <groupId>org.apache.lucene</groupId>
             <artifactId>lucene-core</artifactId>
-            <version>2.4.1</version>
+            <version>3.0.0</version>
         </dependency>
     </dependencies>
 
@@ -15,8 +15,8 @@
  */
 package org.apache.lucene.morphology.russian;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import static org.hamcrest.Matchers.equalTo;
 import static org.junit.Assert.assertThat;
 import org.junit.Test;
@@ -43,21 +43,12 @@ public class RussianAnalayzerTest {
         stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/russian-analayzer-data.txt");
 
         InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
-        final Token reusableToken = new Token();
-
-        Token nextToken;
-        TokenStream in = morphlogyAnalayzer.tokenStream(null, reader);
+        TokenStream tokenStream = morphlogyAnalayzer.tokenStream(null, reader);
         HashSet<String> result = new HashSet<String>();
-        for (; ;) {
-            nextToken = in.next(reusableToken);
-
-            if (nextToken == null) {
-                break;
-            }
-
-            result.add(nextToken.term());
-            //
-
+        while (tokenStream.incrementToken()) {
+            TermAttribute attribute1 = tokenStream.getAttribute(TermAttribute.class);
+            result.add(attribute1.term());
         }
 
         stream.close();