moving to lucene 3.0.0
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@92 d817d54c-26ab-11de-abc9-2f7d1455ff7a
parent 36012f2943
commit ebc367f16c
@@ -15,8 +15,8 @@
  */
 package org.apache.lucene.morphology.english;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import static org.hamcrest.Matchers.equalTo;
 import static org.junit.Assert.assertThat;
 import org.junit.Test;

@@ -43,19 +43,12 @@ public class EnglishAnalayzerTest {
         stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/englsih-analayzer-data.txt");
 
         InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
-        final Token reusableToken = new Token();
-
-        Token nextToken;
-        TokenStream in = morphlogyAnalayzer.tokenStream(null, reader);
+        TokenStream tokenStream = morphlogyAnalayzer.tokenStream(null, reader);
         HashSet<String> result = new HashSet<String>();
-        for (; ;) {
-            nextToken = in.next(reusableToken);
-            if (nextToken == null) {
-                break;
-            }
-
-            result.add(nextToken.term());
+        while (tokenStream.incrementToken()) {
+            TermAttribute attribute1 = tokenStream.getAttribute(TermAttribute.class);
+            result.add(attribute1.term());
         }
 
         stream.close();

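Both analyzer tests now consume tokens through the Lucene 3.0 attribute API (incrementToken() plus TermAttribute) instead of the removed Token next(Token) call. The following is a minimal, self-contained sketch of that consumption pattern; it uses a plain WhitespaceAnalyzer and a literal input string purely for illustration, not the project's MorphlogyAnalayzer or its test data files.

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;

public class TokenStreamConsumerSketch {
    public static void main(String[] args) throws IOException {
        // Any Analyzer works here; WhitespaceAnalyzer keeps the sketch self-contained.
        TokenStream stream = new WhitespaceAnalyzer()
                .tokenStream("text", new StringReader("one two two three"));

        // Lucene 3.0 style: the term is read through a TermAttribute that is
        // reused across incrementToken() calls, instead of pulling Token objects.
        TermAttribute term = stream.addAttribute(TermAttribute.class);
        Set<String> result = new HashSet<String>();
        while (stream.incrementToken()) {
            result.add(term.term());
        }
        System.out.println(result); // one, two, three in hash order
    }
}

The rewritten EnglishAnalayzerTest and RussianAnalayzerTest loops above and below boil down to this same idiom.
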
@@ -1,5 +1,6 @@
 <?xml version="1.0"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
     <parent>
         <artifactId>morphology</artifactId>
         <groupId>org.apache.lucene.morphology</groupId>

@@ -13,11 +14,6 @@
     <url>http://maven.apache.org</url>
 
     <dependencies>
-        <dependency>
-            <groupId>junit</groupId>
-            <artifactId>junit</artifactId>
-            <version>3.8.1</version>
-            <scope>test</scope>
-        </dependency>
     </dependencies>
 </project>

@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.morphology.LetterDecoderEncoder;
 import org.apache.lucene.morphology.LuceneMorphology;
+import org.apache.lucene.util.Version;
 
 import java.io.IOException;
 import java.io.InputStream;

@@ -44,7 +45,7 @@ public class MorphlogyAnalayzer extends Analyzer {
     }
 
     public TokenStream tokenStream(String fieldName, Reader reader) {
-        TokenStream result = new StandardTokenizer(reader);
+        TokenStream result = new StandardTokenizer(Version.LUCENE_30, reader);
         result = new StandardFilter(result);
         result = new LowerCaseFilter(result);
         return new MorphlogyFilter(result, luceneMorph);

@@ -16,67 +16,43 @@
 
 package org.apache.lucene.morphology.analayzer;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.morphology.LuceneMorphology;
 
 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
+import java.util.Iterator;
 
 
 public class MorphlogyFilter extends TokenFilter {
     private LuceneMorphology luceneMorph;
+    private Iterator<String> iterator;
+    private TermAttribute termAtt;
 
     public MorphlogyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) {
         super(tokenStream);
         this.luceneMorph = luceneMorph;
+        termAtt = addAttribute(TermAttribute.class);
     }
 
 
-    private List<String> stack = new ArrayList<String>();
-    private int index = 0;
-    private Token current = null;
-
-    /**
-     * Returns the next token in the stream, or null at EOS.
-     */
-    public Token next(final Token reusableToken) throws IOException {
-        assert reusableToken != null;
-        while (index < stack.size()) { // pop from stack
-            Token nextToken = createToken(stack.get(index++), current, reusableToken);
-            if (nextToken != null) {
-                return nextToken;
-            }
-        }
-
-        Token nextToken = input.next(reusableToken);
-        if (nextToken == null) return null; // EOS; iterator exhausted
-        if (!luceneMorph.checkString(nextToken.term())) {
-            return nextToken;
-        }
-        stack = luceneMorph.getNormalForms(nextToken.term());
-        index = 0;
-        current = (Token) nextToken.clone();
-        nextToken = createToken(stack.get(index++), current, reusableToken);
-        return nextToken;
-    }
-
-    /**
-     * Creates and returns a token for the given synonym of the current input
-     * token; Override for custom (stateless or stateful) behavior, if desired.
-     *
-     * @param synonym a synonym for the current token's term
-     * @param current the current token from the underlying child stream
-     * @param reusableToken the token to reuse
-     * @return a new token, or null to indicate that the given synonym should be
-     *         ignored
-     */
-    protected Token createToken(String synonym, Token current, final Token reusableToken) {
-        reusableToken.reinit(current, synonym);
-        reusableToken.setTermBuffer(synonym);
-        reusableToken.setPositionIncrement(0);
-        return reusableToken;
-    }
+    public boolean incrementToken() throws IOException {
+        while (iterator == null || !iterator.hasNext()) {
+            boolean b = input.incrementToken();
+            if (!b) {
+                return false;
+            }
+            String s = termAtt.term();
+            if (luceneMorph.checkString(s)) {
+                iterator = luceneMorph.getNormalForms(termAtt.term()).iterator();
+            } else {
+                return true;
+            }
+        }
+        String s = iterator.next();
+        termAtt.setTermBuffer(s);
+        return true;
+    }
 }

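The filter itself drops the removed Token next(Token) / createToken() machinery in favour of the Lucene 3.0 pattern: register a TermAttribute once in the constructor and override incrementToken(), emitting one token per normal form. A reduced sketch of that pattern follows; the expand() helper is a hypothetical stand-in for LuceneMorphology.getNormalForms(), used only to keep the example self-contained.

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

// Sketch of a Lucene 3.0 TokenFilter that replaces each incoming term with a
// list of expansions, returning one expansion per incrementToken() call.
public class ExpandingFilterSketch extends TokenFilter {
    private final TermAttribute termAtt = addAttribute(TermAttribute.class);
    private Iterator<String> expansions;

    public ExpandingFilterSketch(TokenStream input) {
        super(input);
    }

    // Hypothetical lookup; the real MorphlogyFilter asks LuceneMorphology for normal forms.
    protected List<String> expand(String term) {
        return Arrays.asList(term.toLowerCase());
    }

    @Override
    public boolean incrementToken() throws IOException {
        while (expansions == null || !expansions.hasNext()) {
            if (!input.incrementToken()) {
                return false;                     // underlying stream is exhausted
            }
            expansions = expand(termAtt.term()).iterator();
        }
        termAtt.setTermBuffer(expansions.next()); // overwrite the shared term buffer
        return true;
    }
}

Wiring is unchanged from MorphlogyAnalayzer above: wrap the tokenizer chain with the filter and return it from tokenStream().
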
pom.xml
@@ -1,5 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
     <modelVersion>4.0.0</modelVersion>
     <groupId>org.apache.lucene.morphology</groupId>
     <artifactId>morphology</artifactId>

@@ -49,7 +50,7 @@
         <dependency>
             <groupId>org.apache.lucene</groupId>
             <artifactId>lucene-core</artifactId>
-            <version>2.4.1</version>
+            <version>3.0.0</version>
         </dependency>
     </dependencies>
 

@@ -15,8 +15,8 @@
  */
 package org.apache.lucene.morphology.russian;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import static org.hamcrest.Matchers.equalTo;
 import static org.junit.Assert.assertThat;
 import org.junit.Test;

@@ -43,21 +43,12 @@ public class RussianAnalayzerTest {
         stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/russian-analayzer-data.txt");
 
         InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
-        final Token reusableToken = new Token();
-
-        Token nextToken;
-        TokenStream in = morphlogyAnalayzer.tokenStream(null, reader);
+        TokenStream tokenStream = morphlogyAnalayzer.tokenStream(null, reader);
         HashSet<String> result = new HashSet<String>();
-        for (; ;) {
-            nextToken = in.next(reusableToken);
-            if (nextToken == null) {
-                break;
-            }
-
-            result.add(nextToken.term());
-            //
-
+        while (tokenStream.incrementToken()) {
+            TermAttribute attribute1 = tokenStream.getAttribute(TermAttribute.class);
+            result.add(attribute1.term());
         }
 
         stream.close();