From 3b2e48821a5aaee562b867b204b1dcfa6b7a5450 Mon Sep 17 00:00:00 2001 From: Alexander Kuznetsov Date: Mon, 22 Jun 2015 22:41:21 +0300 Subject: [PATCH] Working on morph ambiguously resolver --- context/pom.xml | 23 ++++ .../context/CalculateContextItem.java | 7 ++ .../morphology/context/ContextItem.java | 29 +++++ .../morphology/context/ContextStats.java | 25 +++++ .../morphology/context/ProbClalucator.java | 9 ++ .../morphology/context/SimpleTokenizer.java | 104 ++++++++++++++++++ pom.xml | 7 +- 7 files changed, 201 insertions(+), 3 deletions(-) create mode 100644 context/pom.xml create mode 100644 context/src/main/java/org/apache/lucene/morphology/context/CalculateContextItem.java create mode 100644 context/src/main/java/org/apache/lucene/morphology/context/ContextItem.java create mode 100644 context/src/main/java/org/apache/lucene/morphology/context/ContextStats.java create mode 100644 context/src/main/java/org/apache/lucene/morphology/context/ProbClalucator.java create mode 100644 context/src/main/java/org/apache/lucene/morphology/context/SimpleTokenizer.java diff --git a/context/pom.xml b/context/pom.xml new file mode 100644 index 0000000..630d249 --- /dev/null +++ b/context/pom.xml @@ -0,0 +1,23 @@ + + + 4.0.0 + + org.apache.lucene.morphology + morphology + 1.2-SNAPSHOT + + org.apache.lucene.morphology + context + 1.0-SNAPSHOT + context + http://maven.apache.org + + + junit + junit + 3.8.1 + test + + + diff --git a/context/src/main/java/org/apache/lucene/morphology/context/CalculateContextItem.java b/context/src/main/java/org/apache/lucene/morphology/context/CalculateContextItem.java new file mode 100644 index 0000000..7d22d0a --- /dev/null +++ b/context/src/main/java/org/apache/lucene/morphology/context/CalculateContextItem.java @@ -0,0 +1,7 @@ +package org.apache.lucene.morphology.context; + +/** + * Created by alexander on 16.06.15. + */ +public class CalculateContextItem { +} diff --git a/context/src/main/java/org/apache/lucene/morphology/context/ContextItem.java b/context/src/main/java/org/apache/lucene/morphology/context/ContextItem.java new file mode 100644 index 0000000..f95fde9 --- /dev/null +++ b/context/src/main/java/org/apache/lucene/morphology/context/ContextItem.java @@ -0,0 +1,29 @@ +package org.apache.lucene.morphology.context; + +/** + * Created by alexander on 16.06.15. + */ +public class ContextItem { + String[][] morphInfo; + long count = 0; + + public ContextItem(String[][] morphInfo) { + this.morphInfo = morphInfo; + } + + public String[][] getMorphInfo() { + return morphInfo; + } + + public void setMorphInfo(String[][] morphInfo) { + this.morphInfo = morphInfo; + } + + public long getCount() { + return count; + } + + public void setCount(long count) { + this.count = count; + } +} diff --git a/context/src/main/java/org/apache/lucene/morphology/context/ContextStats.java b/context/src/main/java/org/apache/lucene/morphology/context/ContextStats.java new file mode 100644 index 0000000..f19d022 --- /dev/null +++ b/context/src/main/java/org/apache/lucene/morphology/context/ContextStats.java @@ -0,0 +1,25 @@ +package org.apache.lucene.morphology.context; + +/** + * Created by alexander on 16.06.15. + */ +public class ContextStats { + String[] morphInfo; + double prob; + + public String[] getMorphInfo() { + return morphInfo; + } + + public void setMorphInfo(String[] morphInfo) { + this.morphInfo = morphInfo; + } + + public double getProb() { + return prob; + } + + public void setProb(double prob) { + this.prob = prob; + } +} diff --git a/context/src/main/java/org/apache/lucene/morphology/context/ProbClalucator.java b/context/src/main/java/org/apache/lucene/morphology/context/ProbClalucator.java new file mode 100644 index 0000000..c3a0560 --- /dev/null +++ b/context/src/main/java/org/apache/lucene/morphology/context/ProbClalucator.java @@ -0,0 +1,9 @@ +package org.apache.lucene.morphology.context; + +/** + * Created by alexander on 16.06.15. + */ +public class ProbClalucator { + + +} diff --git a/context/src/main/java/org/apache/lucene/morphology/context/SimpleTokenizer.java b/context/src/main/java/org/apache/lucene/morphology/context/SimpleTokenizer.java new file mode 100644 index 0000000..385a361 --- /dev/null +++ b/context/src/main/java/org/apache/lucene/morphology/context/SimpleTokenizer.java @@ -0,0 +1,104 @@ +package org.apache.lucene.morphology.context; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.util.AttributeFactory; + +import java.io.BufferedReader; +import java.io.IOException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.Set; + +/** + * Created by alexander on 16.06.15. + */ +public class SimpleTokenizer extends Tokenizer { + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class); + LinkedList terms; + + public final static Set SEPARATION_LETTERS = new HashSet<>(Arrays.asList(' ', '(', ')', ',', '|', '\t', + '\n', '"', ':', '!', '?', ',', ';', '•')); + + public final static Set MEANING_CHARS = new HashSet<>(Arrays.asList('(', ')', ',', '|', + '"', ':', '!', '?', ',', ';', '•', '.')); + + public SimpleTokenizer() { + } + + public SimpleTokenizer(AttributeFactory factory) { + super(factory); + } + + @Override + final public boolean incrementToken() throws IOException { + if (terms == null) { + createTeams(); + } + if (terms.size() > 0) { + String str = terms.poll(); + termAtt.setEmpty(); + termAtt.append(str); + posAtt.setPositionIncrement(1); + return true; + } + return false; + } + + private void createTeams() throws IOException { + terms = new LinkedList<>(); + + BufferedReader br = new BufferedReader(input); + StringBuilder sb = new StringBuilder(); + String s = ""; + while ((s = br.readLine()) != null) { + sb.append(s).append(" "); + } + + s = sb.toString(); + CharTermAttributeImpl currentTerm = new CharTermAttributeImpl(); + for (int i = 0; i < s.length(); i++) { + if (checkIsCharSepartor(s, i)) { + if (checkIsCharHasMeaning(s, i)) { + terms.add(s.substring(i, i + 1)); + } + String term = currentTerm.toString(); + currentTerm.clear(); + if (term.length() > 0) { + terms.add(term); + } + } else { + currentTerm.append(s.charAt(i)); + } + } + } + + private boolean checkIsCharHasMeaning(String s, int i) { + return MEANING_CHARS.contains(s.charAt(i)); + } + + private boolean checkIsCharSepartor(String s, int i) { + char c = s.charAt(i); + if (SEPARATION_LETTERS.contains(c)) { + return true; + } + if ('.' == c + && s.length() > i + 1 + && SEPARATION_LETTERS.contains(s.charAt(i + 1))) { + return true; + } + return false; + } + + @Override + public void reset() throws IOException { + this.terms = null; + super.reset(); + } + +} \ No newline at end of file diff --git a/pom.xml b/pom.xml index e6f2f0f..8709386 100644 --- a/pom.xml +++ b/pom.xml @@ -94,8 +94,8 @@ org.apache.maven.plugins maven-compiler-plugin - 1.5 - 1.5 + 1.7 + 1.7 @@ -160,5 +160,6 @@ dictionary-reader russian english - + context + \ No newline at end of file