From 786ce92ae0ecf90693ab7ffbe55bdb668b5e61c7 Mon Sep 17 00:00:00 2001 From: "alexander.a.kuznetsov" Date: Fri, 4 Sep 2009 16:43:17 +0000 Subject: [PATCH] Adding new lucene model of analayser git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@48 d817d54c-26ab-11de-abc9-2f7d1455ff7a --- .../russian/morphology/HeuristicBuilder.java | 54 ------------- .../analayzer/RussianMorphlogyAnalayzer.java | 7 +- .../analayzer/RussianMorphlogyFilter.java | 47 ++++++++++-- .../morphology/informations/LuceneMorph.java | 59 ++++++++++++++ .../morphology/informations/Morph.java | 76 ++++++++++++------- .../RussianMorphlogyAnalayzerTest.java | 54 +++++++------ .../morphology/analayzer/russian-text.txt | 2 +- 7 files changed, 183 insertions(+), 116 deletions(-) create mode 100644 src/main/java/org/apache/lucene/russian/morphology/informations/LuceneMorph.java diff --git a/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java b/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java index b56ecd8..cb32470 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java +++ b/src/main/java/org/apache/lucene/russian/morphology/HeuristicBuilder.java @@ -38,59 +38,5 @@ public class HeuristicBuilder { dictonaryReader.proccess(statiticsCollector); statiticsCollector.saveHeuristic(); - -// StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read()); -// dictonaryReader.proccess(statiticsCollectors); -// Collection counterCollection = statiticsCollectors.getStatititics().values(); -// Object[] objects = counterCollection.toArray(); -// Arrays.sort(objects); -// System.out.println("Length " + objects.length + " ingored words " + statiticsCollectors.getIgnoredCount()); -// for (int i = 0; i < 10; i++) { -// System.out.println(objects[i]); -// } -// -// final HeuristicBySuffixLegth heuristic = new HeuristicBySuffixLegth(); -// for (int i = 0; i < objects.length; i++) { -// heuristic.addHeuristic(((SuffixCounter) objects[i]).getSuffixHeuristic()); -// } -// -// System.out.println("Single suffix " + heuristic.getSingleSuffixes().size()); -// System.out.println("diffiren morgh " + heuristic.getWordWithMorphology().size()); -// System.out.println("Ononims " + heuristic.getOnonyms().size()); -// final Map> map = heuristic.getUnkowns(); -// System.out.println("Unknow suffix " + map.size()); -// int cont = 0; -// for (Set st : map.values()) { -// -// if (cont > 50) break; -// if (st.size() < 3) { -// System.out.println(st); -// cont++; -// } -// } -// //final RussianSuffixDecoderEncoder decoderEncoder = new RussianSuffixDecoderEncoder(6); -// final AtomicLong c = new AtomicLong(0L); -// final AtomicLong all = new AtomicLong(0L); -// dictonaryReader.proccess( -// new WordProccessor() { -// public void proccess(WordCard wordCard) throws IOException { -// for (FlexiaModel fm : wordCard.getWordsFroms()) { -// String form = fm.create(wordCard.getBase()); -// if(form.startsWith("ïðèê") && form.endsWith("üÿ")) System.out.println(form); -// -// -// int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0; -// String formSuffix = form.substring(startSymbol); -// Long aLong = RussianSuffixDecoderEncoder.encode(formSuffix); -// all.incrementAndGet(); -// if (map.containsKey(aLong)) c.incrementAndGet(); -// } -// } -// } -// ); -// -// -// System.out.println("Ankown words " + all.longValue()); -// System.out.println("Ankown words " + c.longValue()); } } diff --git a/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzer.java b/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzer.java index 9859fe1..18e56a2 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzer.java +++ b/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzer.java @@ -21,21 +21,22 @@ import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.russian.morphology.informations.LuceneMorph; import java.io.IOException; import java.io.Reader; public class RussianMorphlogyAnalayzer extends Analyzer { - private SuffixHeuristic suffixHeuristic; + private LuceneMorph luceneMorph; public RussianMorphlogyAnalayzer() throws IOException { - suffixHeuristic = new SuffixHeuristic(); + luceneMorph = new LuceneMorph("sep.txt"); } public TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new StandardTokenizer(reader); result = new StandardFilter(result); result = new LowerCaseFilter(result); - return new RussianMorphlogyFilter(result, suffixHeuristic); + return new RussianMorphlogyFilter(result, luceneMorph); } } diff --git a/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java b/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java index 7772757..b494792 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java +++ b/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java @@ -19,30 +19,61 @@ package org.apache.lucene.russian.morphology.analayzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.russian.morphology.informations.LuceneMorph; import java.io.IOException; +import java.util.List; +import java.util.ArrayList; public class RussianMorphlogyFilter extends TokenFilter { - private SuffixHeuristic suffixHeuristic; + private LuceneMorph luceneMorph; - public RussianMorphlogyFilter(TokenStream tokenStream, SuffixHeuristic suffixHeuristic) { + public RussianMorphlogyFilter(TokenStream tokenStream, LuceneMorph luceneMorph) { super(tokenStream); - this.suffixHeuristic = suffixHeuristic; + this.luceneMorph = luceneMorph; } + + private List stack = new ArrayList(); + private int index = 0; + private Token current = null; + + /** + * Returns the next token in the stream, or null at EOS. + */ public Token next(final Token reusableToken) throws IOException { + assert reusableToken != null; + while (index < stack.size()) { // pop from stack + Token nextToken = createToken(stack.get(index++), current, reusableToken); + if (nextToken != null) { + return nextToken; + } + } + Token nextToken = input.next(reusableToken); - if (nextToken == null || nextToken.term().length() == 0) return nextToken; - String word = nextToken.term(); - Character testC = word.charAt(0); + if (nextToken == null) return null; // EOS; iterator exhausted + Character testC = nextToken.term().charAt(0); if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC) { return nextToken; } - Token current = (Token) nextToken.clone(); - return createToken(suffixHeuristic.getCanonicalForm(word), current, reusableToken); + stack = luceneMorph.getMorhInfo(nextToken.term()); + index = 0; + current = (Token) nextToken.clone(); + nextToken = createToken(stack.get(index++), current, reusableToken); + return nextToken; } + /** + * Creates and returns a token for the given synonym of the current input + * token; Override for custom (stateless or stateful) behavior, if desired. + * + * @param synonym a synonym for the current token's term + * @param current the current token from the underlying child stream + * @param reusableToken the token to reuse + * @return a new token, or null to indicate that the given synonym should be + * ignored + */ protected Token createToken(String synonym, Token current, final Token reusableToken) { reusableToken.reinit(current, synonym); reusableToken.setTermBuffer(synonym); diff --git a/src/main/java/org/apache/lucene/russian/morphology/informations/LuceneMorph.java b/src/main/java/org/apache/lucene/russian/morphology/informations/LuceneMorph.java new file mode 100644 index 0000000..f2d0677 --- /dev/null +++ b/src/main/java/org/apache/lucene/russian/morphology/informations/LuceneMorph.java @@ -0,0 +1,59 @@ +package org.apache.lucene.russian.morphology.informations; + +import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder; + +import java.io.IOException; +import java.io.BufferedReader; +import java.util.ArrayList; +import java.util.List; + + +public class LuceneMorph extends Morph{ + + public LuceneMorph(String fileName) throws IOException { + super(fileName); + } + + @Override + public List getMorhInfo(String s) { + ArrayList result = new ArrayList(); + int[] ints = RussianSuffixDecoderEncoder.encodeToArray(revertWord(s)); + int ruleId = findRuleId(ints); + for (Heuristic h : rules[rulesId[ruleId]]) { + result.add(h.transofrmWord(s)); + } + return result; + } + + protected void readRules(BufferedReader bufferedReader) throws IOException { + String s; + Integer amount; + s = bufferedReader.readLine(); + amount = Integer.valueOf(s); + rules = new Heuristic[amount][]; + for (int i = 0; i < amount; i++) { + String s1 = bufferedReader.readLine(); + Integer ruleLenght = Integer.valueOf(s1); + Heuristic[] heuristics = new Heuristic[ruleLenght]; + for (int j = 0; j < ruleLenght; j++) { + heuristics[j] = new Heuristic(bufferedReader.readLine()); + } + rules[i] = modeifyHeuristic(heuristics); + } + } + + + private Heuristic[] modeifyHeuristic(Heuristic[] heuristics){ + ArrayList result = new ArrayList(); + for(Heuristic heuristic:heuristics){ + boolean isAdded = true; + for(Heuristic ch:result){ + isAdded = isAdded && !(ch.getActualNormalSuffix().equals(heuristic.getActualNormalSuffix()) && (ch.getActualSuffixLengh() == heuristic.getActualSuffixLengh())); + } + if(isAdded){ + result.add(heuristic); + } + } + return result.toArray(new Heuristic[result.size()]); + } +} diff --git a/src/main/java/org/apache/lucene/russian/morphology/informations/Morph.java b/src/main/java/org/apache/lucene/russian/morphology/informations/Morph.java index f4b7376..52b1a36 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/informations/Morph.java +++ b/src/main/java/org/apache/lucene/russian/morphology/informations/Morph.java @@ -26,10 +26,10 @@ import java.util.List; public class Morph { - int[][] separators; - short[] rulesId; - Heuristic[][] rules; - String[] grammaInfo; + protected int[][] separators; + protected short[] rulesId; + protected Heuristic[][] rules; + protected String[] grammaInfo; public Morph(String fileName) throws IOException { @@ -64,13 +64,12 @@ public class Morph { int[] ints = RussianSuffixDecoderEncoder.encodeToArray(revertWord(s)); int ruleId = findRuleId(ints); for (Heuristic h : rules[rulesId[ruleId]]) { - System.out.println(h); - result.add(h.transofrmWord(s)); + result.add(h.transofrmWord(s) + "|" + grammaInfo[h.getFormMorphInfo()]); } return result; } - private int findRuleId(int[] ints) { + protected int findRuleId(int[] ints) { int low = 0; int high = separators.length - 1; int mid = 0; @@ -133,20 +132,30 @@ public class Morph { BufferedReader bufferedReader = new BufferedReader(new FileReader(fileName)); String s = bufferedReader.readLine(); Integer amount = Integer.valueOf(s); - separators = new int[amount][]; + + readSeparators(bufferedReader, amount); + + readRulesId(bufferedReader, amount); + + readRules(bufferedReader); + readGrammaInfo(bufferedReader); + bufferedReader.close(); + } + + private void readGrammaInfo(BufferedReader bufferedReader) throws IOException { + String s; + Integer amount; + s = bufferedReader.readLine(); + amount = Integer.valueOf(s); + grammaInfo = new String[amount]; for (int i = 0; i < amount; i++) { - String s1 = bufferedReader.readLine(); - Integer wordLenght = Integer.valueOf(s1); - separators[i] = new int[wordLenght]; - for (int j = 0; j < wordLenght; j++) { - separators[i][j] = Integer.valueOf(bufferedReader.readLine()); - } - } - rulesId = new short[amount]; - for (int i = 0; i < amount; i++) { - String s1 = bufferedReader.readLine(); - rulesId[i] = Short.valueOf(s1); + grammaInfo[i] = bufferedReader.readLine(); } + } + + protected void readRules(BufferedReader bufferedReader) throws IOException { + String s; + Integer amount; s = bufferedReader.readLine(); amount = Integer.valueOf(s); rules = new Heuristic[amount][]; @@ -158,16 +167,29 @@ public class Morph { rules[i][j] = new Heuristic(bufferedReader.readLine()); } } - s = bufferedReader.readLine(); - amount = Integer.valueOf(s); - grammaInfo = new String[amount]; - for (int i = 0; i < amount; i++) { - grammaInfo[i] = bufferedReader.readLine(); - } - bufferedReader.close(); } - private String revertWord(String s) { + private void readRulesId(BufferedReader bufferedReader, Integer amount) throws IOException { + rulesId = new short[amount]; + for (int i = 0; i < amount; i++) { + String s1 = bufferedReader.readLine(); + rulesId[i] = Short.valueOf(s1); + } + } + + private void readSeparators(BufferedReader bufferedReader, Integer amount) throws IOException { + separators = new int[amount][]; + for (int i = 0; i < amount; i++) { + String s1 = bufferedReader.readLine(); + Integer wordLenght = Integer.valueOf(s1); + separators[i] = new int[wordLenght]; + for (int j = 0; j < wordLenght; j++) { + separators[i][j] = Integer.valueOf(bufferedReader.readLine()); + } + } + } + + protected String revertWord(String s) { String result = ""; for (int i = 1; i <= s.length(); i++) { result += s.charAt(s.length() - i); diff --git a/src/test/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzerTest.java b/src/test/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzerTest.java index 348a9b4..d897c2a 100644 --- a/src/test/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzerTest.java +++ b/src/test/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzerTest.java @@ -17,37 +17,45 @@ package org.apache.lucene.russian.morphology.analayzer; import org.junit.Test; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Token; import java.io.IOException; +import java.io.InputStreamReader; +import java.io.BufferedReader; +import java.io.InputStream; + + public class RussianMorphlogyAnalayzerTest { @Test public void shouldCorrectProccessText() throws IOException { -// RussianMorphlogyAnalayzer morphlogyAnalayzer = new RussianMorphlogyAnalayzer(); -// InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/russian-text.txt"); -// BufferedReader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); -// -// InputStream tokeStream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/token-of-russian-text.txt"); -// BufferedReader tokenReader = new BufferedReader(new InputStreamReader(tokeStream, "UTF-8")); -// -// final Token reusableToken = new Token(); -// -// Token nextToken; -// -// -// TokenStream in = morphlogyAnalayzer.tokenStream(null, reader); -// for (; ;) { -// nextToken = in.next(reusableToken); -// -// if (nextToken == null) { -// break; -// } -// -// assertThat(nextToken.term(), equalTo(tokenReader.readLine().trim())); -// -// } + RussianMorphlogyAnalayzer morphlogyAnalayzer = new RussianMorphlogyAnalayzer(); + InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/russian-text.txt"); + BufferedReader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); + + InputStream tokeStream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/token-of-russian-text.txt"); + BufferedReader tokenReader = new BufferedReader(new InputStreamReader(tokeStream, "UTF-8")); + + final Token reusableToken = new Token(); + + Token nextToken; + + + TokenStream in = morphlogyAnalayzer.tokenStream(null, reader); + for (; ;) { + nextToken = in.next(reusableToken); + + if (nextToken == null) { + break; + } + + System.out.println(nextToken.term()); + // assertThat(nextToken.term(), equalTo(tokenReader.readLine().trim())); + + } } } diff --git a/src/test/resources/org/apache/lucene/russian/morphology/analayzer/russian-text.txt b/src/test/resources/org/apache/lucene/russian/morphology/analayzer/russian-text.txt index 128d153..c7212eb 100644 --- a/src/test/resources/org/apache/lucene/russian/morphology/analayzer/russian-text.txt +++ b/src/test/resources/org/apache/lucene/russian/morphology/analayzer/russian-text.txt @@ -5,4 +5,4 @@ — Калушата подудонилиÑÑŒ! Калушата подудонилиÑÑŒ! Зюмо некузÑвые! ПуÑьки бÑтые! Ð’ уÑловиÑÑ… нараÑтающей пурги было Ñделано 4 уÑпешных захода на поÑадку. "Ð’Ñе нормально, будем рекомендовать ÑиÑтему к внедрению". РейÑÑ‹ из Кейптауна (ЮÐР) на Ñтанцию "ÐоволазаревÑкаÑ" (Ðнтарктида) ÑовершаютÑÑ -примерно один раз в две недели. \ No newline at end of file +примерно один раз в две недели. вина Ñ‚Ð²Ð¾Ñ Ð²Ð¸Ð½Ð° мне \ No newline at end of file