Adding a new Lucene analyzer model
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@48 d817d54c-26ab-11de-abc9-2f7d1455ff7a
parent 394fb6a621
commit 786ce92ae0
@@ -38,59 +38,5 @@ public class HeuristicBuilder {
         dictonaryReader.proccess(statiticsCollector);
         statiticsCollector.saveHeuristic();
-
-
-//        StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read());
-//        dictonaryReader.proccess(statiticsCollectors);
-//        Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values();
-//        Object[] objects = counterCollection.toArray();
-//        Arrays.sort(objects);
-//        System.out.println("Length " + objects.length + " ingored words " + statiticsCollectors.getIgnoredCount());
-//        for (int i = 0; i < 10; i++) {
-//            System.out.println(objects[i]);
-//        }
-//
-//        final HeuristicBySuffixLegth heuristic = new HeuristicBySuffixLegth();
-//        for (int i = 0; i < objects.length; i++) {
-//            heuristic.addHeuristic(((SuffixCounter) objects[i]).getSuffixHeuristic());
-//        }
-//
-//        System.out.println("Single suffix " + heuristic.getSingleSuffixes().size());
-//        System.out.println("diffiren morgh " + heuristic.getWordWithMorphology().size());
-//        System.out.println("Ononims " + heuristic.getOnonyms().size());
-//        final Map<Long, Set<SimpleSuffixHeuristic>> map = heuristic.getUnkowns();
-//        System.out.println("Unknow suffix " + map.size());
-//        int cont = 0;
-//        for (Set<SimpleSuffixHeuristic> st : map.values()) {
-//
-//            if (cont > 50) break;
-//            if (st.size() < 3) {
-//                System.out.println(st);
-//                cont++;
-//            }
-//        }
-//        //final RussianSuffixDecoderEncoder decoderEncoder = new RussianSuffixDecoderEncoder(6);
-//        final AtomicLong c = new AtomicLong(0L);
-//        final AtomicLong all = new AtomicLong(0L);
-//        dictonaryReader.proccess(
-//                new WordProccessor() {
-//                    public void proccess(WordCard wordCard) throws IOException {
-//                        for (FlexiaModel fm : wordCard.getWordsFroms()) {
-//                            String form = fm.create(wordCard.getBase());
-//                            if(form.startsWith("прик") && form.endsWith("ья")) System.out.println(form);
-//
-//
-//                            int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
-//                            String formSuffix = form.substring(startSymbol);
-//                            Long aLong = RussianSuffixDecoderEncoder.encode(formSuffix);
-//                            all.incrementAndGet();
-//                            if (map.containsKey(aLong)) c.incrementAndGet();
-//                        }
-//                    }
-//                }
-//        );
-//
-//
-//        System.out.println("Ankown words " + all.longValue());
-//        System.out.println("Ankown words " + c.longValue());
     }
 }
@@ -21,21 +21,22 @@ import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.russian.morphology.informations.LuceneMorph;
 
 import java.io.IOException;
 import java.io.Reader;
 
 public class RussianMorphlogyAnalayzer extends Analyzer {
-    private SuffixHeuristic suffixHeuristic;
+    private LuceneMorph luceneMorph;
 
     public RussianMorphlogyAnalayzer() throws IOException {
-        suffixHeuristic = new SuffixHeuristic();
+        luceneMorph = new LuceneMorph("sep.txt");
    }
 
     public TokenStream tokenStream(String fieldName, Reader reader) {
         TokenStream result = new StandardTokenizer(reader);
         result = new StandardFilter(result);
         result = new LowerCaseFilter(result);
-        return new RussianMorphlogyFilter(result, suffixHeuristic);
+        return new RussianMorphlogyFilter(result, luceneMorph);
     }
 }
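For orientation, a minimal usage sketch of the new analyzer under the pre-2.9 next(Token) API used throughout this commit. The input string is taken from the test fixture at the end of this commit, and the hard-coded "sep.txt" model file is assumed to exist in the working directory; the class and method names below that are not in the diff are hypothetical.

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.russian.morphology.analayzer.RussianMorphlogyAnalayzer;

import java.io.IOException;
import java.io.StringReader;

public class AnalyzerUsageSketch {
    public static void main(String[] args) throws IOException {
        // The constructor loads the "sep.txt" model hard-coded above.
        RussianMorphlogyAnalayzer analyzer = new RussianMorphlogyAnalayzer();
        TokenStream stream = analyzer.tokenStream(null, new StringReader("вина твоя"));
        final Token reusableToken = new Token();
        Token nextToken;
        while ((nextToken = stream.next(reusableToken)) != null) {
            // A Cyrillic token surfaces once per morphological variant.
            System.out.println(nextToken.term());
        }
    }
}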
@@ -19,30 +19,61 @@ package org.apache.lucene.russian.morphology.analayzer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.russian.morphology.informations.LuceneMorph;
 
 import java.io.IOException;
+import java.util.List;
+import java.util.ArrayList;
 
 
 public class RussianMorphlogyFilter extends TokenFilter {
-    private SuffixHeuristic suffixHeuristic;
+    private LuceneMorph luceneMorph;
 
-    public RussianMorphlogyFilter(TokenStream tokenStream, SuffixHeuristic suffixHeuristic) {
+    public RussianMorphlogyFilter(TokenStream tokenStream, LuceneMorph luceneMorph) {
         super(tokenStream);
-        this.suffixHeuristic = suffixHeuristic;
+        this.luceneMorph = luceneMorph;
     }
 
 
+    private List<String> stack = new ArrayList<String>();
+    private int index = 0;
+    private Token current = null;
+
+    /**
+     * Returns the next token in the stream, or null at EOS.
+     */
     public Token next(final Token reusableToken) throws IOException {
+        assert reusableToken != null;
+        while (index < stack.size()) { // pop from stack
+            Token nextToken = createToken(stack.get(index++), current, reusableToken);
+            if (nextToken != null) {
+                return nextToken;
+            }
+        }
+
         Token nextToken = input.next(reusableToken);
-        if (nextToken == null || nextToken.term().length() == 0) return nextToken;
-        String word = nextToken.term();
-        Character testC = word.charAt(0);
+        if (nextToken == null) return null; // EOS; iterator exhausted
+        Character testC = nextToken.term().charAt(0);
         if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC) {
             return nextToken;
         }
-        Token current = (Token) nextToken.clone();
-        return createToken(suffixHeuristic.getCanonicalForm(word), current, reusableToken);
+        stack = luceneMorph.getMorhInfo(nextToken.term());
+        index = 0;
+        current = (Token) nextToken.clone();
+        nextToken = createToken(stack.get(index++), current, reusableToken);
+        return nextToken;
     }
 
+    /**
+     * Creates and returns a token for the given synonym of the current input
+     * token; Override for custom (stateless or stateful) behavior, if desired.
+     *
+     * @param synonym       a synonym for the current token's term
+     * @param current       the current token from the underlying child stream
+     * @param reusableToken the token to reuse
+     * @return a new token, or null to indicate that the given synonym should be
+     *         ignored
+     */
     protected Token createToken(String synonym, Token current, final Token reusableToken) {
         reusableToken.reinit(current, synonym);
         reusableToken.setTermBuffer(synonym);
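The stack/index/current fields and the createToken hook appear to be adapted from the one-to-many expansion pattern of Lucene's contrib SynonymTokenFilter: each call to next() first drains any buffered morphological variants of the previous word before pulling a fresh token from the input, and Token.reinit gives every emitted variant the offsets and type of the original token. Note that only tokens whose first character is Cyrillic are expanded; everything else passes through unchanged.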
@@ -0,0 +1,59 @@
+package org.apache.lucene.russian.morphology.informations;
+
+import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
+
+import java.io.IOException;
+import java.io.BufferedReader;
+import java.util.ArrayList;
+import java.util.List;
+
+
+public class LuceneMorph extends Morph {
+
+    public LuceneMorph(String fileName) throws IOException {
+        super(fileName);
+    }
+
+    @Override
+    public List<String> getMorhInfo(String s) {
+        ArrayList<String> result = new ArrayList<String>();
+        int[] ints = RussianSuffixDecoderEncoder.encodeToArray(revertWord(s));
+        int ruleId = findRuleId(ints);
+        for (Heuristic h : rules[rulesId[ruleId]]) {
+            result.add(h.transofrmWord(s));
+        }
+        return result;
+    }
+
+    protected void readRules(BufferedReader bufferedReader) throws IOException {
+        String s;
+        Integer amount;
+        s = bufferedReader.readLine();
+        amount = Integer.valueOf(s);
+        rules = new Heuristic[amount][];
+        for (int i = 0; i < amount; i++) {
+            String s1 = bufferedReader.readLine();
+            Integer ruleLenght = Integer.valueOf(s1);
+            Heuristic[] heuristics = new Heuristic[ruleLenght];
+            for (int j = 0; j < ruleLenght; j++) {
+                heuristics[j] = new Heuristic(bufferedReader.readLine());
+            }
+            rules[i] = modeifyHeuristic(heuristics);
+        }
+    }
+
+    private Heuristic[] modeifyHeuristic(Heuristic[] heuristics) {
+        ArrayList<Heuristic> result = new ArrayList<Heuristic>();
+        for (Heuristic heuristic : heuristics) {
+            boolean isAdded = true;
+            for (Heuristic ch : result) {
+                isAdded = isAdded && !(ch.getActualNormalSuffix().equals(heuristic.getActualNormalSuffix()) && (ch.getActualSuffixLengh() == heuristic.getActualSuffixLengh()));
+            }
+            if (isAdded) {
+                result.add(heuristic);
+            }
+        }
+        return result.toArray(new Heuristic[result.size()]);
+    }
+}
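A minimal usage sketch of the new class, assuming a "sep.txt" model file in the working directory (presumably the output of statiticsCollector.saveHeuristic() in the builder above); the wrapper class name and the input word are hypothetical, the word borrowed from the test fixture.

import org.apache.lucene.russian.morphology.informations.LuceneMorph;

import java.io.IOException;
import java.util.List;

public class LuceneMorphUsageSketch {
    public static void main(String[] args) throws IOException {
        LuceneMorph morph = new LuceneMorph("sep.txt");
        // Unlike Morph.getMorhInfo below, this override returns bare normal
        // forms, deduplicated by modeifyHeuristic on the pair
        // (actual normal suffix, actual suffix length).
        List<String> forms = morph.getMorhInfo("вина");
        for (String form : forms) {
            System.out.println(form);
        }
    }
}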
@@ -26,10 +26,10 @@ import java.util.List;
 
 
 public class Morph {
-    int[][] separators;
-    short[] rulesId;
-    Heuristic[][] rules;
-    String[] grammaInfo;
+    protected int[][] separators;
+    protected short[] rulesId;
+    protected Heuristic[][] rules;
+    protected String[] grammaInfo;
 
 
     public Morph(String fileName) throws IOException {
@@ -64,13 +64,12 @@ public class Morph {
         int[] ints = RussianSuffixDecoderEncoder.encodeToArray(revertWord(s));
         int ruleId = findRuleId(ints);
         for (Heuristic h : rules[rulesId[ruleId]]) {
-            System.out.println(h);
-            result.add(h.transofrmWord(s));
+            result.add(h.transofrmWord(s) + "|" + grammaInfo[h.getFormMorphInfo()]);
         }
         return result;
     }
 
-    private int findRuleId(int[] ints) {
+    protected int findRuleId(int[] ints) {
         int low = 0;
         int high = separators.length - 1;
         int mid = 0;
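With this change each element returned by Morph.getMorhInfo carries its grammatical annotation inline: the strings now have the shape "normalForm|grammaInfo", with the annotation looked up in the grammaInfo table via the heuristic's getFormMorphInfo() index. The leftover debug println is removed at the same time.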
@@ -133,20 +132,30 @@ public class Morph {
         BufferedReader bufferedReader = new BufferedReader(new FileReader(fileName));
         String s = bufferedReader.readLine();
         Integer amount = Integer.valueOf(s);
-        separators = new int[amount][];
+        readSeparators(bufferedReader, amount);
+
+        readRulesId(bufferedReader, amount);
+
+        readRules(bufferedReader);
+        readGrammaInfo(bufferedReader);
+        bufferedReader.close();
+    }
+
+    private void readGrammaInfo(BufferedReader bufferedReader) throws IOException {
+        String s;
+        Integer amount;
+        s = bufferedReader.readLine();
+        amount = Integer.valueOf(s);
+        grammaInfo = new String[amount];
         for (int i = 0; i < amount; i++) {
-            String s1 = bufferedReader.readLine();
-            Integer wordLenght = Integer.valueOf(s1);
-            separators[i] = new int[wordLenght];
-            for (int j = 0; j < wordLenght; j++) {
-                separators[i][j] = Integer.valueOf(bufferedReader.readLine());
-            }
+            grammaInfo[i] = bufferedReader.readLine();
         }
     }
-        rulesId = new short[amount];
-        for (int i = 0; i < amount; i++) {
-            String s1 = bufferedReader.readLine();
-            rulesId[i] = Short.valueOf(s1);
-        }
+
+    protected void readRules(BufferedReader bufferedReader) throws IOException {
+        String s;
+        Integer amount;
         s = bufferedReader.readLine();
         amount = Integer.valueOf(s);
         rules = new Heuristic[amount][];
@@ -158,16 +167,29 @@ public class Morph {
                 rules[i][j] = new Heuristic(bufferedReader.readLine());
             }
         }
-        s = bufferedReader.readLine();
-        amount = Integer.valueOf(s);
-        grammaInfo = new String[amount];
-        for (int i = 0; i < amount; i++) {
-            grammaInfo[i] = bufferedReader.readLine();
-        }
-        bufferedReader.close();
     }
 
-    private String revertWord(String s) {
+    private void readRulesId(BufferedReader bufferedReader, Integer amount) throws IOException {
+        rulesId = new short[amount];
+        for (int i = 0; i < amount; i++) {
+            String s1 = bufferedReader.readLine();
+            rulesId[i] = Short.valueOf(s1);
+        }
+    }
+
+    private void readSeparators(BufferedReader bufferedReader, Integer amount) throws IOException {
+        separators = new int[amount][];
+        for (int i = 0; i < amount; i++) {
+            String s1 = bufferedReader.readLine();
+            Integer wordLenght = Integer.valueOf(s1);
+            separators[i] = new int[wordLenght];
+            for (int j = 0; j < wordLenght; j++) {
+                separators[i][j] = Integer.valueOf(bufferedReader.readLine());
+            }
+        }
+    }
+
+    protected String revertWord(String s) {
         String result = "";
         for (int i = 1; i <= s.length(); i++) {
             result += s.charAt(s.length() - i);
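Taken together, readSeparators, readRulesId, readRules and readGrammaInfo imply a simple line-oriented layout for the model file. A minimal writer sketch with hypothetical values (the wrapper class name is invented, and the Heuristic line format is defined by the Heuristic(String) constructor elsewhere, so it is left elided):

import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;

public class ModelFileSketch {
    public static void main(String[] args) throws IOException {
        PrintWriter out = new PrintWriter(new FileWriter("sep.txt"));
        out.println(1);     // amount: number of separator arrays and rule ids
        out.println(2);     //   length of separators[0]
        out.println(100);   //   separators[0][0] (hypothetical encoded suffix)
        out.println(200);   //   separators[0][1]
        out.println(0);     // rulesId[0], one line per separator array
        out.println(1);     // number of rule sets
        out.println(1);     //   length of rules[0]
        out.println("..."); //   one Heuristic line (format elided)
        out.println(1);     // number of gramma-info strings
        out.println("..."); //   grammaInfo[0]
        out.close();
    }
}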
@@ -17,37 +17,45 @@
 package org.apache.lucene.russian.morphology.analayzer;
 
 import org.junit.Test;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Token;
 
 import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.BufferedReader;
+import java.io.InputStream;
 
 
 public class RussianMorphlogyAnalayzerTest {
 
     @Test
     public void shouldCorrectProccessText() throws IOException {
-//        RussianMorphlogyAnalayzer morphlogyAnalayzer = new RussianMorphlogyAnalayzer();
-//        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/russian-text.txt");
-//        BufferedReader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
-//
-//        InputStream tokeStream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/token-of-russian-text.txt");
-//        BufferedReader tokenReader = new BufferedReader(new InputStreamReader(tokeStream, "UTF-8"));
-//
-//        final Token reusableToken = new Token();
-//
-//        Token nextToken;
-//
-//
-//        TokenStream in = morphlogyAnalayzer.tokenStream(null, reader);
-//        for (; ;) {
-//            nextToken = in.next(reusableToken);
-//
-//            if (nextToken == null) {
-//                break;
-//            }
-//
+        RussianMorphlogyAnalayzer morphlogyAnalayzer = new RussianMorphlogyAnalayzer();
+        InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/russian-text.txt");
+        BufferedReader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
+
+        InputStream tokeStream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/token-of-russian-text.txt");
+        BufferedReader tokenReader = new BufferedReader(new InputStreamReader(tokeStream, "UTF-8"));
+
+        final Token reusableToken = new Token();
+
+        Token nextToken;
+
+
+        TokenStream in = morphlogyAnalayzer.tokenStream(null, reader);
+        for (; ;) {
+            nextToken = in.next(reusableToken);
+
+            if (nextToken == null) {
+                break;
+            }
+
+            System.out.println(nextToken.term());
 //            assertThat(nextToken.term(), equalTo(tokenReader.readLine().trim()));
-//
-//        }
+
+        }
 
     }
 }
@@ -5,4 +5,4 @@
 — Калушата подудонились! Калушата подудонились! Зюмо некузявые! Пуськи бятые!
 В условиях нарастающей пурги было сделано 4 успешных захода на посадку. "Все нормально, будем рекомендовать систему к внедрению".
 Рейсы из Кейптауна (ЮАР) на станцию "Новолазаревская" (Антарктида) совершаются
-примерно один раз в две недели.
+примерно один раз в две недели. вина твоя вина мне