Adding context stats

Working on morph ambiguously resolver
2015-07-14 16:26:22 +04:00 · 2015-06-22 22:42:15 +03:00
11 changed files with 447 additions and 61 deletions
@@ -0,0 +1,36 @@
 <?xml version="1.0"?>
 <project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
         xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <!-- Build descriptor for the "context" module, a child of the morphology parent project. -->
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.apache.lucene.morphology</groupId>
        <artifactId>morphology</artifactId>
        <version>1.2-SNAPSHOT</version>
    </parent>
    <groupId>org.apache.lucene.morphology</groupId>
    <artifactId>context</artifactId>
    <!-- NOTE(review): this module declares its own version, which differs from the parent's 1.2-SNAPSHOT; confirm this is intended. -->
    <version>1.0-SNAPSHOT</version>
    <name>context</name>
    <url>http://maven.apache.org</url>
    <!-- Every dependency declared here is test-scoped; anything needed at compile time is presumably inherited from the parent (verify). -->
    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.8.2</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene.morphology</groupId>
            <artifactId>russian</artifactId>
            <version>1.2-SNAPSHOT</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene.morphology</groupId>
            <artifactId>english</artifactId>
            <version>1.2-SNAPSHOT</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
 </project>
@@ -0,0 +1,52 @@
 /**
 * Copyright 2015 Alexander Kuznetsov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.morphology.context;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.util.ArrayList;
 import java.util.LinkedList;
 import java.util.List;
 /**
  * Runs text through {@link StatAnalyzer} as the first step of building {@link ContextItem}s.
  *
  * <p>Work in progress: the token stream is fully consumed, but no context items are
  * derived from the tokens yet.
  */
 public class CalculateContextItem {
    /**
     * Tokenizes {@code text} with a {@link StatAnalyzer}.
     *
     * @param text the text to analyze; must not be {@code null}
     * @return the context items derived from {@code text}; currently always an empty list,
     *         because building items from the tokens is not implemented yet
     * @throws IOException if reading the text or advancing the token stream fails
     * @throws IllegalArgumentException if {@code text} is {@code null}
     */
    public List<ContextItem> createContextItems(String text) throws IOException {
        if (text == null) {
            throw new IllegalArgumentException("text must not be null");
        }
        List<ContextItem> contextItems = new ArrayList<>();
        // Encode and decode with the same explicit charset; the platform default used by
        // the no-argument getBytes() is not guaranteed to be UTF-8.
        InputStreamReader reader =
                new InputStreamReader(new ByteArrayInputStream(text.getBytes("UTF-8")), "UTF-8");
        // Resources are closed in reverse order: the token stream first, then the analyzer.
        try (Analyzer statAnalyzer = new StatAnalyzer();
             TokenStream tokenStream = statAnalyzer.tokenStream(null, reader)) {
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                // TODO: build ContextItems from the stream's CharTermAttribute and
                // PositionIncrementAttribute values.
            }
            tokenStream.end();
        }
        return contextItems;
    }
 }
@@ -0,0 +1,80 @@
 /**
 * Copyright 2015 Alexander Kuznetsov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.morphology.context;
 import java.util.Arrays;
 public class ContextItem implements Comparable<ContextItem> {
    String[][] morphInfo;
    public ContextItem(String[][] morphInfo) {
        this.morphInfo = morphInfo;
    }
    public String[][] getMorphInfo() {
        return morphInfo;
    }
    public void setMorphInfo(String[][] morphInfo) {
        this.morphInfo = morphInfo;
    }
    public int hashCode() {
        int h = 0;
        for (String[] m : morphInfo) {
            for (String s : m) {
                h = 31 * h + s.hashCode();
            }
        }
        return h;
    }
    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        ContextItem that = (ContextItem) o;
        if (that.morphInfo.length != this.morphInfo.length) {
            return false;
        }
        for (int i = 0; i < morphInfo.length; i++) {
            if (!Arrays.equals(morphInfo[i], that.morphInfo[i])) {
                return false;
            }
        }
        return true;
    }
    @Override
    public int compareTo(ContextItem o) {
        int i = o.morphInfo.length - morphInfo.length;
        if (i != 0) return i;
        for (int j = 0; j < morphInfo.length; j++) {
            i = o.morphInfo[j].length - morphInfo[j].length;
            if (i != 0) return i;
            for (int k = 0; k < morphInfo[j].length; k++) {
                i = morphInfo[j][k].compareTo(o.morphInfo[j][k]);
                if (i != 0) return i;
            }
        }
        return 0;
    }
 }
@@ -0,0 +1,37 @@
 /**
 * Copyright 2015 Alexander Kuznetsov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.morphology.context;
 /**
  * Mutable holder that pairs an array of morphological information strings with a
  * probability value.
  */
 public class ContextStats {
    /** Probability value; 0.0 until set. */
    double prob;
    /** Morphological information; {@code null} until set. Stored by reference. */
    String[] morphInfo;

    /**
     * @return the probability value
     */
    public double getProb() {
        return this.prob;
    }

    /**
     * @param prob the new probability value
     */
    public void setProb(double prob) {
        this.prob = prob;
    }

    /**
     * @return the morphological information array (the stored reference, not a copy)
     */
    public String[] getMorphInfo() {
        return this.morphInfo;
    }

    /**
     * @param morphInfo the new morphological information array; stored by reference
     */
    public void setMorphInfo(String[] morphInfo) {
        this.morphInfo = morphInfo;
    }
 }
@@ -0,0 +1,21 @@
 /**
 * Copyright 2015 Alexander Kuznetsov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.morphology.context;
 /**
  * Placeholder class: it declares no fields or methods yet.
  *
  * <p>NOTE(review): the name looks like a misspelling of "ProbCalculator"; renaming it
  * would change the public class name, so confirm before doing so.
  */
 public class ProbClalucator {
 }
@@ -0,0 +1,116 @@
 /**
 * Copyright 2015 Alexander Kuznetsov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.morphology.context;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.util.AttributeFactory;
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.Set;
 /**
  * Tokenizer that splits text on the characters in {@link #SEPARATION_LETTERS} and also
  * emits the separators listed in {@link #MEANING_CHARS} as single-character tokens.
  *
  * <p>A '.' counts as a separator only when it is directly followed by a separator
  * character, so a dot inside a token such as "3.14" stays part of that token.
  *
  * <p>The whole input is read and split on the first call to {@link #incrementToken()}
  * after a {@link #reset()}; line breaks are treated as spaces. Every token is emitted
  * with a position increment of 1, in the order it appears in the text.
  */
 public class SimpleTokenizer extends Tokenizer {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
    /** Tokens of the current input that are still to be emitted; {@code null} until the input has been split. */
    LinkedList<String> terms;
    /** Characters that end the current token. */
    public final static Set<Character> SEPARATION_LETTERS = new HashSet<>(Arrays.asList(' ', '(', ')', ',', '|', '\t',
            '\n', '"', ':', '!', '?', ';', '•'));
    /** Separator characters that are emitted as tokens of their own. */
    public final static Set<Character> MEANING_CHARS = new HashSet<>(Arrays.asList('(', ')', ',', '|',
            '"', ':', '!', '?', ';', '•', '.'));

    public SimpleTokenizer() {
    }

    public SimpleTokenizer(AttributeFactory factory) {
        super(factory);
    }

    /**
     * Emits the next token of the input.
     *
     * @return {@code true} if a token was produced, {@code false} once the input is exhausted
     * @throws IOException if reading the input fails
     */
    @Override
    final public boolean incrementToken() throws IOException {
        // Tokenizers must reset all attributes before populating them for the next token.
        clearAttributes();
        if (terms == null) {
            createTerms();
        }
        String term = terms.poll();
        if (term == null) {
            return false;
        }
        termAtt.setEmpty().append(term);
        posAtt.setPositionIncrement(1);
        return true;
    }

    /**
     * Reads the whole input and fills {@link #terms} with its tokens in text order.
     */
    private void createTerms() throws IOException {
        terms = new LinkedList<>();
        // Deliberately not closed here: the input reader is managed by the Tokenizer base class.
        BufferedReader br = new BufferedReader(input);
        StringBuilder text = new StringBuilder();
        String line;
        while ((line = br.readLine()) != null) {
            // The appended space replaces the line break and guarantees that the last
            // token of every line is followed by a separator.
            text.append(line).append(' ');
        }
        StringBuilder currentTerm = new StringBuilder();
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (isSeparator(text, i)) {
                // Flush the pending word first so that it precedes the punctuation that ends it.
                flushTerm(currentTerm);
                if (MEANING_CHARS.contains(c)) {
                    terms.add(String.valueOf(c));
                }
            } else {
                currentTerm.append(c);
            }
        }
        flushTerm(currentTerm);
    }

    /** Queues the pending term, if any, and empties the buffer. */
    private void flushTerm(StringBuilder currentTerm) {
        if (currentTerm.length() > 0) {
            terms.add(currentTerm.toString());
            currentTerm.setLength(0);
        }
    }

    /**
     * Tells whether the character at index {@code i} ends the current token: either it is
     * one of {@link #SEPARATION_LETTERS}, or it is a '.' directly followed by one of them.
     */
    private boolean isSeparator(CharSequence s, int i) {
        char c = s.charAt(i);
        if (SEPARATION_LETTERS.contains(c)) {
            return true;
        }
        return '.' == c
                && s.length() > i + 1
                && SEPARATION_LETTERS.contains(s.charAt(i + 1));
    }

    /**
     * Discards tokens left over from the previous input so that the next call to
     * {@link #incrementToken()} splits the new input.
     */
    @Override
    public void reset() throws IOException {
        this.terms = null;
        super.reset();
    }
 }
@@ -0,0 +1,34 @@
 package org.apache.lucene.morphology.context;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import java.io.IOException;
 import java.io.Reader;
 /**
  * Analyzer that tokenizes with {@link SimpleTokenizer} and then passes the tokens through
  * {@link StandardFilter} and {@link LowerCaseFilter}, in that order.
  *
  * Created by akuznetsov on 6/24/15.
  */
 public class StatAnalyzer extends Analyzer {
    /**
     * Builds the tokenizer and filter chain for a field.
     *
     * @param s the field name; not used by this analyzer
     * @return components whose source is a {@link SimpleTokenizer} and whose sink is the
     *         lower-casing end of the filter chain
     */
    @Override
    protected TokenStreamComponents createComponents(String s) {
        SimpleTokenizer tokenizer = new SimpleTokenizer();
        TokenFilter chain = new LowerCaseFilter(new StandardFilter(tokenizer));
        return new TokenStreamComponents(tokenizer, chain);
    }
 }
@@ -0,0 +1,32 @@
 package org.apache.lucene.morphology.context;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.junit.Test;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
 public class SimpleTokenizerTest {
    @Test
    public void testSimpleTokenizer() throws IOException {
        Analyzer statAnalyzer = new StatAnalyzer();
        InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год? - и что я жду тебя, где вино".getBytes()), "UTF-8");
        TokenStream tokenStream = statAnalyzer.tokenStream(null, reader);
        tokenStream.reset();
        boolean wordSeen = false;
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
            PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class);
            System.out.println(charTerm.toString());
        }
    }
 }
@@ -1,4 +1,4 @@
-Copyright 2009 Alexander Kuznetsov 
+Copyright ${project.inceptionYear} ${owner}
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -51,25 +51,6 @@ public class MorphologyAnalyzer extends Analyzer {
    protected TokenStreamComponents createComponents(String s) {
        StandardTokenizer src = new StandardTokenizer();
        final PayloadEncoder encoder = new PayloadEncoder() {
            @Override
            public BytesRef encode(char[] buffer) {
                final Float payload = Float.valueOf(new String(buffer));
                System.out.println(payload);
                final byte[] bytes = PayloadHelper.encodeFloat(payload);
                return new BytesRef(bytes, 0, bytes.length);
            }
            @Override
            public BytesRef encode(char[] buffer, int offset, int length) {
                final Float payload = Float.valueOf(new String(buffer, offset, length));
                System.out.println(payload);
                final byte[] bytes = PayloadHelper.encodeFloat(payload);
                return new BytesRef(bytes, 0, bytes.length);
            }
        };
        TokenFilter filter = new StandardFilter(src);
        filter = new LowerCaseFilter(filter);
        filter = new MorphologyFilter(filter, luceneMorph);
@@ -1,5 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.apache.lucene.morphology</groupId>
    <artifactId>morphology</artifactId>
@@ -64,20 +65,8 @@
            </snapshots>
        </repository>
    </repositories>
-    
+
-    <pluginRepositories>
+
        <pluginRepository>
            <id>mc-release</id>
            <name>maven-license-plugin repository of releases</name>
            <url>http://mc-repo.googlecode.com/svn/maven2/releases</url>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
            <releases>
                <enabled>true</enabled>
            </releases>
        </pluginRepository>
    </pluginRepositories>
    <build>
        <plugins>
            <plugin>
@@ -94,34 +83,41 @@
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
-                    <source>1.5</source>
+                    <source>1.7</source>
-                    <target>1.5</target>
+                    <target>1.7</target>
                </configuration>
            </plugin>
-            <plugin>                <!--                 usage: http://code.google.com/p/maven-license-plugin/wiki/HowTo                -->
+            <!--<plugin>                &lt;!&ndash;                 usage: http://code.google.com/p/maven-license-plugin/wiki/HowTo                &ndash;&gt;-->
-                <artifactId>maven-license-plugin</artifactId>
+
-                <groupId>com.mathieucarbou.mojo</groupId>
+                <!--<groupId>com.mycila</groupId>-->
-                <configuration>
+                <!--<artifactId>license-maven-plugin</artifactId>-->
-                    <basedir>${project.parent.basedir}</basedir>
+                <!--<version>2.11</version>-->
-                    <header>etc/header.txt</header>
+
-                    <excludes>
+                <!--<configuration>-->
-                        <exclude>**/*.txt</exclude>
+                    <!--<properties>-->
-                        <exclude>**/*.info</exclude>
+                        <!--<owner>Alexander Kuznetsov</owner>-->
-                        <exclude>**/pom.xml</exclude>
+                        <!--&lt;!&ndash;<email>mathieu.carbou@gmail.com</email>&ndash;&gt;-->
-                    </excludes>
+                    <!--</properties>-->
-                    <includes>
+                    <!--<basedir>${project.parent.basedir}</basedir>-->
-                        <include>**/src/**</include>
+                    <!--<header>etc/header.txt</header>-->
-                    </includes>
+                    <!--<excludes>-->
-                </configuration>
+                        <!--<exclude>**/*.txt</exclude>-->
-                <executions>
+                        <!--<exclude>**/*.info</exclude>-->
-                    <execution>
+                        <!--<exclude>**/pom.xml</exclude>-->
-                        <phase>test</phase>
+                    <!--</excludes>-->
-                        <goals>
+                    <!--<includes>-->
-                            <goal>check</goal>
+                        <!--<include>**/src/**</include>-->
-                        </goals>
+                    <!--</includes>-->
-                    </execution>
+                <!--</configuration>-->
-                </executions>
+                <!--<executions>-->
-            </plugin>
+                    <!--<execution>-->
                        <!--<phase>test</phase>-->
                        <!--<goals>-->
                            <!--<goal>check</goal>-->
                        <!--</goals>-->
                    <!--</execution>-->
                <!--</executions>-->
            <!--</plugin>-->
        </plugins>
    </build>
    <profiles>
@@ -160,5 +156,6 @@
        <module>dictionary-reader</module>
        <module>russian</module>
        <module>english</module>
        <module>context</module>
    </modules>
 </project>
Author	SHA1	Message	Date
Alexander Kuznetsov	f095cbe7c0	Adding context stats	2015-07-14 16:26:22 +04:00
Alexander Kuznetsov	3b2e48821a	Working on morph ambiguously resolver	2015-06-22 22:42:15 +03:00