moving to lucene 3.0.0

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@92 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
alexander.a.kuznetsov 2010-02-22 13:49:47 +00:00
parent 36012f2943
commit ebc367f16c
6 changed files with 37 additions and 79 deletions

View File

@ -15,8 +15,8 @@
*/ */
package org.apache.lucene.morphology.english; package org.apache.lucene.morphology.english;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.equalTo;
import static org.junit.Assert.assertThat; import static org.junit.Assert.assertThat;
import org.junit.Test; import org.junit.Test;
@ -43,19 +43,12 @@ public class EnglishAnalayzerTest {
stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/englsih-analayzer-data.txt"); stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/englsih-analayzer-data.txt");
InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
final Token reusableToken = new Token();
Token nextToken; TokenStream tokenStream = morphlogyAnalayzer.tokenStream(null, reader);
TokenStream in = morphlogyAnalayzer.tokenStream(null, reader);
HashSet<String> result = new HashSet<String>(); HashSet<String> result = new HashSet<String>();
for (; ;) { while (tokenStream.incrementToken()) {
nextToken = in.next(reusableToken); TermAttribute attribute1 = tokenStream.getAttribute(TermAttribute.class);
result.add(attribute1.term());
if (nextToken == null) {
break;
}
result.add(nextToken.term());
} }
stream.close(); stream.close();

View File

@ -1,5 +1,6 @@
<?xml version="1.0"?> <?xml version="1.0"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<parent> <parent>
<artifactId>morphology</artifactId> <artifactId>morphology</artifactId>
<groupId>org.apache.lucene.morphology</groupId> <groupId>org.apache.lucene.morphology</groupId>
@ -13,11 +14,6 @@
<url>http://maven.apache.org</url> <url>http://maven.apache.org</url>
<dependencies> <dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
</dependencies> </dependencies>
</project> </project>

View File

@ -23,6 +23,7 @@ import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.morphology.LetterDecoderEncoder; import org.apache.lucene.morphology.LetterDecoderEncoder;
import org.apache.lucene.morphology.LuceneMorphology; import org.apache.lucene.morphology.LuceneMorphology;
import org.apache.lucene.util.Version;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
@ -44,7 +45,7 @@ public class MorphlogyAnalayzer extends Analyzer {
} }
public TokenStream tokenStream(String fieldName, Reader reader) { public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(reader); TokenStream result = new StandardTokenizer(Version.LUCENE_30, reader);
result = new StandardFilter(result); result = new StandardFilter(result);
result = new LowerCaseFilter(result); result = new LowerCaseFilter(result);
return new MorphlogyFilter(result, luceneMorph); return new MorphlogyFilter(result, luceneMorph);

View File

@ -16,67 +16,43 @@
package org.apache.lucene.morphology.analayzer; package org.apache.lucene.morphology.analayzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.morphology.LuceneMorphology; import org.apache.lucene.morphology.LuceneMorphology;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.Iterator;
import java.util.List;
public class MorphlogyFilter extends TokenFilter { public class MorphlogyFilter extends TokenFilter {
private LuceneMorphology luceneMorph; private LuceneMorphology luceneMorph;
private Iterator<String> iterator;
private TermAttribute termAtt;
public MorphlogyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) { public MorphlogyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) {
super(tokenStream); super(tokenStream);
this.luceneMorph = luceneMorph; this.luceneMorph = luceneMorph;
termAtt = addAttribute(TermAttribute.class);
} }
private List<String> stack = new ArrayList<String>(); public boolean incrementToken() throws IOException {
private int index = 0; while (iterator == null || !iterator.hasNext()) {
private Token current = null; boolean b = input.incrementToken();
if (!b) {
/** return false;
* Returns the next token in the stream, or null at EOS. }
*/ String s = termAtt.term();
public Token next(final Token reusableToken) throws IOException { if (luceneMorph.checkString(s)) {
assert reusableToken != null; iterator = luceneMorph.getNormalForms(termAtt.term()).iterator();
while (index < stack.size()) { // pop from stack } else {
Token nextToken = createToken(stack.get(index++), current, reusableToken); return true;
if (nextToken != null) {
return nextToken;
} }
} }
String s = iterator.next();
Token nextToken = input.next(reusableToken); termAtt.setTermBuffer(s);
if (nextToken == null) return null; // EOS; iterator exhausted return true;
if (!luceneMorph.checkString(nextToken.term())) {
return nextToken;
}
stack = luceneMorph.getNormalForms(nextToken.term());
index = 0;
current = (Token) nextToken.clone();
nextToken = createToken(stack.get(index++), current, reusableToken);
return nextToken;
} }
/**
* Creates and returns a token for the given synonym of the current input
* token; Override for custom (stateless or stateful) behavior, if desired.
*
* @param synonym a synonym for the current token's term
* @param current the current token from the underlying child stream
* @param reusableToken the token to reuse
* @return a new token, or null to indicate that the given synonym should be
* ignored
*/
protected Token createToken(String synonym, Token current, final Token reusableToken) {
reusableToken.reinit(current, synonym);
reusableToken.setTermBuffer(synonym);
reusableToken.setPositionIncrement(0);
return reusableToken;
}
} }

View File

@ -1,5 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>org.apache.lucene.morphology</groupId> <groupId>org.apache.lucene.morphology</groupId>
<artifactId>morphology</artifactId> <artifactId>morphology</artifactId>
@ -49,7 +50,7 @@
<dependency> <dependency>
<groupId>org.apache.lucene</groupId> <groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId> <artifactId>lucene-core</artifactId>
<version>2.4.1</version> <version>3.0.0</version>
</dependency> </dependency>
</dependencies> </dependencies>

View File

@ -15,8 +15,8 @@
*/ */
package org.apache.lucene.morphology.russian; package org.apache.lucene.morphology.russian;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.equalTo;
import static org.junit.Assert.assertThat; import static org.junit.Assert.assertThat;
import org.junit.Test; import org.junit.Test;
@ -43,21 +43,12 @@ public class RussianAnalayzerTest {
stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/russian-analayzer-data.txt"); stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/russian-analayzer-data.txt");
InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
final Token reusableToken = new Token();
Token nextToken; TokenStream tokenStream = morphlogyAnalayzer.tokenStream(null, reader);
TokenStream in = morphlogyAnalayzer.tokenStream(null, reader);
HashSet<String> result = new HashSet<String>(); HashSet<String> result = new HashSet<String>();
for (; ;) { while (tokenStream.incrementToken()) {
nextToken = in.next(reusableToken); TermAttribute attribute1 = tokenStream.getAttribute(TermAttribute.class);
result.add(attribute1.term());
if (nextToken == null) {
break;
}
result.add(nextToken.term());
//
} }
stream.close(); stream.close();