Compare commits

...

2 Commits

Author SHA1 Message Date
Alexander Kuznetsov
f095cbe7c0 Adding context stats 2015-07-14 16:26:22 +04:00
Alexander Kuznetsov
3b2e48821a Working on morph ambiguously resolver 2015-06-22 22:42:15 +03:00
11 changed files with 447 additions and 61 deletions

36
context/pom.xml Normal file
View File

@ -0,0 +1,36 @@
<?xml version="1.0"?>
<!-- Maven descriptor of the "context" module; it inherits from the parent "morphology" project. -->
<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.lucene.morphology</groupId>
<artifactId>morphology</artifactId>
<version>1.2-SNAPSHOT</version>
</parent>
<groupId>org.apache.lucene.morphology</groupId>
<artifactId>context</artifactId>
<!-- NOTE(review): this module is versioned 1.0-SNAPSHOT while its parent is 1.2-SNAPSHOT; confirm that the difference is intended. -->
<version>1.0-SNAPSHOT</version>
<name>context</name>
<url>http://maven.apache.org</url>
<!-- Every dependency declared here uses the "test" scope, so none of them is on the compile classpath of the main sources. -->
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.8.2</version>
<scope>test</scope>
</dependency>
<!-- Sibling "russian" module of the same group, test scope only. -->
<dependency>
<groupId>org.apache.lucene.morphology</groupId>
<artifactId>russian</artifactId>
<version>1.2-SNAPSHOT</version>
<scope>test</scope>
</dependency>
<!-- Sibling "english" module of the same group, test scope only. -->
<dependency>
<groupId>org.apache.lucene.morphology</groupId>
<artifactId>english</artifactId>
<version>1.2-SNAPSHOT</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,52 @@
/**
* Copyright 2015 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.context;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
/**
 * Runs a text through {@link StatAnalyzer} as the first step towards building
 * {@link ContextItem}s.
 *
 * <p>Work in progress: the token stream is consumed, but no items are derived from it yet.
 */
public class CalculateContextItem {

    /**
     * Tokenizes {@code text} with a {@link StatAnalyzer} and consumes the whole token stream.
     *
     * @param text the text to analyze; must not be {@code null}
     * @return the context items built from {@code text}; currently always an empty list because
     *         turning tokens into items is not implemented yet
     * @throws IOException if the analyzer fails while reading or tokenizing the text
     */
    public List<ContextItem> createContextItems(String text) throws IOException {
        // Encode and decode with the same explicit charset so the result does not depend on the
        // platform default encoding.
        InputStreamReader reader =
                new InputStreamReader(new ByteArrayInputStream(text.getBytes("UTF-8")), "UTF-8");
        List<ContextItem> items = new ArrayList<>();

        // Both the analyzer and the stream are closeable; the stream is closed first.
        try (Analyzer statAnalyzer = new StatAnalyzer();
             TokenStream tokenStream = statAnalyzer.tokenStream(null, reader)) {
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                // TODO: derive ContextItem instances from the current token's attributes.
            }
            // Consumer workflow of TokenStream: reset, incrementToken until false, end, close.
            tokenStream.end();
        }
        return items;
    }
}

View File

@ -0,0 +1,80 @@
/**
* Copyright 2015 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.context;
import java.util.Arrays;
public class ContextItem implements Comparable<ContextItem> {
String[][] morphInfo;
public ContextItem(String[][] morphInfo) {
this.morphInfo = morphInfo;
}
public String[][] getMorphInfo() {
return morphInfo;
}
public void setMorphInfo(String[][] morphInfo) {
this.morphInfo = morphInfo;
}
public int hashCode() {
int h = 0;
for (String[] m : morphInfo) {
for (String s : m) {
h = 31 * h + s.hashCode();
}
}
return h;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
ContextItem that = (ContextItem) o;
if (that.morphInfo.length != this.morphInfo.length) {
return false;
}
for (int i = 0; i < morphInfo.length; i++) {
if (!Arrays.equals(morphInfo[i], that.morphInfo[i])) {
return false;
}
}
return true;
}
@Override
public int compareTo(ContextItem o) {
int i = o.morphInfo.length - morphInfo.length;
if (i != 0) return i;
for (int j = 0; j < morphInfo.length; j++) {
i = o.morphInfo[j].length - morphInfo[j].length;
if (i != 0) return i;
for (int k = 0; k < morphInfo[j].length; k++) {
i = morphInfo[j][k].compareTo(o.morphInfo[j][k]);
if (i != 0) return i;
}
}
return 0;
}
}

View File

@ -0,0 +1,37 @@
/**
* Copyright 2015 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.context;
/**
 * Mutable holder that pairs an array of strings with a numeric value.
 *
 * <p>NOTE(review): judging by the field names this is presumably a morphological description
 * together with its probability; confirm with the code that fills it.
 */
public class ContextStats {
    double prob;
    String[] morphInfo;

    /**
     * @param prob the new numeric value
     */
    public void setProb(double prob) {
        this.prob = prob;
    }

    /**
     * @return the stored numeric value; {@code 0.0} until a value is set
     */
    public double getProb() {
        return this.prob;
    }

    /**
     * @param morphInfo the array to store; it is kept as is, not copied
     */
    public void setMorphInfo(String[] morphInfo) {
        this.morphInfo = morphInfo;
    }

    /**
     * @return the stored array itself (not a copy); {@code null} until a value is set
     */
    public String[] getMorphInfo() {
        return this.morphInfo;
    }
}

View File

@ -0,0 +1,21 @@
/**
* Copyright 2015 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.context;
/**
 * Empty placeholder: the class declares no fields or methods yet.
 *
 * NOTE(review): the name looks like a misspelling of "ProbCalculator"; renaming it would also
 * require renaming the source file and updating any references, so it is left unchanged here.
 */
public class ProbClalucator {
}

View File

@ -0,0 +1,116 @@
/**
* Copyright 2015 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.context;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeFactory;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Set;
/**
 * A {@link Tokenizer} that splits text on a fixed set of separator characters and additionally
 * emits selected punctuation marks as one-character tokens.
 *
 * <p>The whole input is read and split on the first call to {@link #incrementToken()} after a
 * {@link #reset()}; the resulting terms are then handed out one per call, each with a position
 * increment of 1.
 */
public class SimpleTokenizer extends Tokenizer {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);

    /** Terms still to be emitted; {@code null} until the input has been read. */
    LinkedList<String> terms;

    /** Characters that terminate the term being collected. */
    public static final Set<Character> SEPARATION_LETTERS = new HashSet<>(Arrays.asList(' ', '(', ')', ',', '|', '\t',
            '\n', '"', ':', '!', '?', ';', '•'));

    /** Separator characters that are themselves emitted as a one-character token. */
    public static final Set<Character> MEANING_CHARS = new HashSet<>(Arrays.asList('(', ')', ',', '|',
            '"', ':', '!', '?', ';', '•', '.'));

    public SimpleTokenizer() {
    }

    public SimpleTokenizer(AttributeFactory factory) {
        super(factory);
    }

    /**
     * Emits the next queued term, reading and splitting the input first if that has not
     * happened since the last {@link #reset()}.
     *
     * @return {@code true} if a term was written to the attributes, {@code false} when all
     *         terms have been emitted
     * @throws IOException if reading the input fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (terms == null) {
            terms = readTerms();
        }
        String term = terms.poll();
        if (term == null) {
            return false;
        }
        // A tokenizer must start every token from a clean attribute state.
        clearAttributes();
        termAtt.setEmpty();
        termAtt.append(term);
        posAtt.setPositionIncrement(1);
        return true;
    }

    /**
     * Reads the complete input and splits it into terms.
     *
     * <p>Lines are joined with a single space, so the text always ends with a separator and the
     * last word is always flushed. When a separator is reached the pending word is queued
     * first and the punctuation token (if the separator is one) after it, which keeps the
     * tokens in text order.
     */
    private LinkedList<String> readTerms() throws IOException {
        // The reader is deliberately not closed here: it wraps the tokenizer's own input.
        BufferedReader lines = new BufferedReader(input);
        StringBuilder text = new StringBuilder();
        String line;
        while ((line = lines.readLine()) != null) {
            text.append(line).append(' ');
        }

        LinkedList<String> result = new LinkedList<>();
        StringBuilder word = new StringBuilder();
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (!isSeparator(text, i)) {
                word.append(c);
                continue;
            }
            if (word.length() > 0) {
                result.add(word.toString());
                word.setLength(0);
            }
            if (MEANING_CHARS.contains(c)) {
                result.add(String.valueOf(c));
            }
        }
        return result;
    }

    /**
     * Tells whether the character at {@code i} ends the current term: either it is one of
     * {@link #SEPARATION_LETTERS}, or it is a dot directly followed by one of them (a dot
     * inside a word, as in "3.14", stays part of the term).
     */
    private static boolean isSeparator(CharSequence s, int i) {
        char c = s.charAt(i);
        if (SEPARATION_LETTERS.contains(c)) {
            return true;
        }
        return c == '.'
                && i + 1 < s.length()
                && SEPARATION_LETTERS.contains(s.charAt(i + 1));
    }

    /**
     * Drops any queued terms so that the next {@link #incrementToken()} reads the input again.
     */
    @Override
    public void reset() throws IOException {
        this.terms = null;
        super.reset();
    }
}

View File

@ -0,0 +1,34 @@
package org.apache.lucene.morphology.context;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import java.io.IOException;
import java.io.Reader;
/**
 * Analyzer whose chain is a {@link SimpleTokenizer} followed by a {@link StandardFilter} and a
 * {@link LowerCaseFilter}.
 *
 * Created by akuznetsov on 6/24/15.
 */
public class StatAnalyzer extends Analyzer {

    /**
     * Builds the tokenizer and filter chain; the field name is not used.
     */
    @Override
    protected TokenStreamComponents createComponents(String s) {
        SimpleTokenizer tokenizer = new SimpleTokenizer();
        TokenFilter chain = new LowerCaseFilter(new StandardFilter(tokenizer));
        // The stock components already hand the reader to the tokenizer, so no subclass is needed.
        return new TokenStreamComponents(tokenizer, chain);
    }
}

View File

@ -0,0 +1,32 @@
package org.apache.lucene.morphology.context;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.junit.Test;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
public class SimpleTokenizerTest {
@Test
public void testSimpleTokenizer() throws IOException {
Analyzer statAnalyzer = new StatAnalyzer();
InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год? - и что я жду тебя, где вино".getBytes()), "UTF-8");
TokenStream tokenStream = statAnalyzer.tokenStream(null, reader);
tokenStream.reset();
boolean wordSeen = false;
while (tokenStream.incrementToken()) {
CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class);
System.out.println(charTerm.toString());
}
}
}

View File

@ -1,4 +1,4 @@
Copyright 2009 Alexander Kuznetsov Copyright ${project.inceptionYear} ${owner}
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.

View File

@ -51,25 +51,6 @@ public class MorphologyAnalyzer extends Analyzer {
protected TokenStreamComponents createComponents(String s) { protected TokenStreamComponents createComponents(String s) {
StandardTokenizer src = new StandardTokenizer(); StandardTokenizer src = new StandardTokenizer();
final PayloadEncoder encoder = new PayloadEncoder() {
@Override
public BytesRef encode(char[] buffer) {
final Float payload = Float.valueOf(new String(buffer));
System.out.println(payload);
final byte[] bytes = PayloadHelper.encodeFloat(payload);
return new BytesRef(bytes, 0, bytes.length);
}
@Override
public BytesRef encode(char[] buffer, int offset, int length) {
final Float payload = Float.valueOf(new String(buffer, offset, length));
System.out.println(payload);
final byte[] bytes = PayloadHelper.encodeFloat(payload);
return new BytesRef(bytes, 0, bytes.length);
}
};
TokenFilter filter = new StandardFilter(src); TokenFilter filter = new StandardFilter(src);
filter = new LowerCaseFilter(filter); filter = new LowerCaseFilter(filter);
filter = new MorphologyFilter(filter, luceneMorph); filter = new MorphologyFilter(filter, luceneMorph);

79
pom.xml
View File

@ -1,5 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>org.apache.lucene.morphology</groupId> <groupId>org.apache.lucene.morphology</groupId>
<artifactId>morphology</artifactId> <artifactId>morphology</artifactId>
@ -64,20 +65,8 @@
</snapshots> </snapshots>
</repository> </repository>
</repositories> </repositories>
<pluginRepositories>
<pluginRepository>
<id>mc-release</id>
<name>maven-license-plugin repository of releases</name>
<url>http://mc-repo.googlecode.com/svn/maven2/releases</url>
<snapshots>
<enabled>false</enabled>
</snapshots>
<releases>
<enabled>true</enabled>
</releases>
</pluginRepository>
</pluginRepositories>
<build> <build>
<plugins> <plugins>
<plugin> <plugin>
@ -94,34 +83,41 @@
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId> <artifactId>maven-compiler-plugin</artifactId>
<configuration> <configuration>
<source>1.5</source> <source>1.7</source>
<target>1.5</target> <target>1.7</target>
</configuration> </configuration>
</plugin> </plugin>
<plugin> <!-- usage: http://code.google.com/p/maven-license-plugin/wiki/HowTo --> <!--<plugin> &lt;!&ndash; usage: http://code.google.com/p/maven-license-plugin/wiki/HowTo &ndash;&gt;-->
<artifactId>maven-license-plugin</artifactId>
<groupId>com.mathieucarbou.mojo</groupId> <!--<groupId>com.mycila</groupId>-->
<configuration> <!--<artifactId>license-maven-plugin</artifactId>-->
<basedir>${project.parent.basedir}</basedir> <!--<version>2.11</version>-->
<header>etc/header.txt</header>
<excludes> <!--<configuration>-->
<exclude>**/*.txt</exclude> <!--<properties>-->
<exclude>**/*.info</exclude> <!--<owner>Alexander Kuznetsov</owner>-->
<exclude>**/pom.xml</exclude> <!--&lt;!&ndash;<email>mathieu.carbou@gmail.com</email>&ndash;&gt;-->
</excludes> <!--</properties>-->
<includes> <!--<basedir>${project.parent.basedir}</basedir>-->
<include>**/src/**</include> <!--<header>etc/header.txt</header>-->
</includes> <!--<excludes>-->
</configuration> <!--<exclude>**/*.txt</exclude>-->
<executions> <!--<exclude>**/*.info</exclude>-->
<execution> <!--<exclude>**/pom.xml</exclude>-->
<phase>test</phase> <!--</excludes>-->
<goals> <!--<includes>-->
<goal>check</goal> <!--<include>**/src/**</include>-->
</goals> <!--</includes>-->
</execution> <!--</configuration>-->
</executions> <!--<executions>-->
</plugin> <!--<execution>-->
<!--<phase>test</phase>-->
<!--<goals>-->
<!--<goal>check</goal>-->
<!--</goals>-->
<!--</execution>-->
<!--</executions>-->
<!--</plugin>-->
</plugins> </plugins>
</build> </build>
<profiles> <profiles>
@ -160,5 +156,6 @@
<module>dictionary-reader</module> <module>dictionary-reader</module>
<module>russian</module> <module>russian</module>
<module>english</module> <module>english</module>
<module>context</module>
</modules> </modules>
</project> </project>