diff --git a/context/pom.xml b/context/pom.xml index 630d249..4702f88 100644 --- a/context/pom.xml +++ b/context/pom.xml @@ -1,23 +1,36 @@ - - 4.0.0 - + + 4.0.0 + + org.apache.lucene.morphology + morphology + 1.2-SNAPSHOT + org.apache.lucene.morphology - morphology - 1.2-SNAPSHOT - - org.apache.lucene.morphology - context - 1.0-SNAPSHOT - context - http://maven.apache.org - - - junit - junit - 3.8.1 - test - - + context + 1.0-SNAPSHOT + context + http://maven.apache.org + + + junit + junit + 4.8.2 + test + + + org.apache.lucene.morphology + russian + 1.2-SNAPSHOT + test + + + org.apache.lucene.morphology + english + 1.2-SNAPSHOT + test + + diff --git a/context/src/main/java/org/apache/lucene/morphology/context/CalculateContextItem.java b/context/src/main/java/org/apache/lucene/morphology/context/CalculateContextItem.java index 7d22d0a..ddf2ec8 100644 --- a/context/src/main/java/org/apache/lucene/morphology/context/CalculateContextItem.java +++ b/context/src/main/java/org/apache/lucene/morphology/context/CalculateContextItem.java @@ -1,7 +1,52 @@ +/** + * Copyright 2015 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.lucene.morphology.context; -/** - * Created by alexander on 16.06.15. - */ +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; + public class CalculateContextItem { + + public List createContextItems(String text) throws IOException { + Analyzer statAnalyzer = new StatAnalyzer(); + InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год? - и что я жду тебя, где вино".getBytes()), "UTF-8"); + + +// new RussianMorphology(); + + TokenStream tokenStream = statAnalyzer.tokenStream(null, reader); + tokenStream.reset(); + + List> listedLink = new LinkedList<>(); + while (tokenStream.incrementToken()) { + CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class); + PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class); + + + } + + return null; + } } diff --git a/context/src/main/java/org/apache/lucene/morphology/context/ContextItem.java b/context/src/main/java/org/apache/lucene/morphology/context/ContextItem.java index f95fde9..fa93c01 100644 --- a/context/src/main/java/org/apache/lucene/morphology/context/ContextItem.java +++ b/context/src/main/java/org/apache/lucene/morphology/context/ContextItem.java @@ -1,11 +1,24 @@ +/** + * Copyright 2015 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.lucene.morphology.context; -/** - * Created by alexander on 16.06.15. - */ -public class ContextItem { +import java.util.Arrays; + +public class ContextItem implements Comparable { String[][] morphInfo; - long count = 0; public ContextItem(String[][] morphInfo) { this.morphInfo = morphInfo; @@ -19,11 +32,49 @@ public class ContextItem { this.morphInfo = morphInfo; } - public long getCount() { - return count; + public int hashCode() { + int h = 0; + for (String[] m : morphInfo) { + for (String s : m) { + h = 31 * h + s.hashCode(); + } + } + return h; } - public void setCount(long count) { - this.count = count; + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + ContextItem that = (ContextItem) o; + + if (that.morphInfo.length != this.morphInfo.length) { + return false; + } + for (int i = 0; i < morphInfo.length; i++) { + if (!Arrays.equals(morphInfo[i], that.morphInfo[i])) { + return false; + } + } + + return true; + } + + + @Override + public int compareTo(ContextItem o) { + int i = o.morphInfo.length - morphInfo.length; + if (i != 0) return i; + for (int j = 0; j < morphInfo.length; j++) { + i = o.morphInfo[j].length - morphInfo[j].length; + if (i != 0) return i; + for (int k = 0; k < morphInfo[j].length; k++) { + i = morphInfo[j][k].compareTo(o.morphInfo[j][k]); + if (i != 0) return i; + } + } + return 0; } } diff --git a/context/src/main/java/org/apache/lucene/morphology/context/ContextStats.java b/context/src/main/java/org/apache/lucene/morphology/context/ContextStats.java index f19d022..0662dc8 100644 --- a/context/src/main/java/org/apache/lucene/morphology/context/ContextStats.java +++ b/context/src/main/java/org/apache/lucene/morphology/context/ContextStats.java @@ -1,8 +1,20 @@ +/** + * Copyright 2015 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.lucene.morphology.context; -/** - * Created by alexander on 16.06.15. - */ public class ContextStats { String[] morphInfo; double prob; diff --git a/context/src/main/java/org/apache/lucene/morphology/context/ProbClalucator.java b/context/src/main/java/org/apache/lucene/morphology/context/ProbClalucator.java index c3a0560..5014bd5 100644 --- a/context/src/main/java/org/apache/lucene/morphology/context/ProbClalucator.java +++ b/context/src/main/java/org/apache/lucene/morphology/context/ProbClalucator.java @@ -1,8 +1,20 @@ +/** + * Copyright 2015 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.lucene.morphology.context; -/** - * Created by alexander on 16.06.15. - */ public class ProbClalucator { diff --git a/context/src/main/java/org/apache/lucene/morphology/context/SimpleTokenizer.java b/context/src/main/java/org/apache/lucene/morphology/context/SimpleTokenizer.java index 385a361..7a6ab5f 100644 --- a/context/src/main/java/org/apache/lucene/morphology/context/SimpleTokenizer.java +++ b/context/src/main/java/org/apache/lucene/morphology/context/SimpleTokenizer.java @@ -1,3 +1,18 @@ +/** + * Copyright 2015 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.lucene.morphology.context; import org.apache.lucene.analysis.Tokenizer; @@ -13,9 +28,6 @@ import java.util.HashSet; import java.util.LinkedList; import java.util.Set; -/** - * Created by alexander on 16.06.15. - */ public class SimpleTokenizer extends Tokenizer { private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); diff --git a/context/src/main/java/org/apache/lucene/morphology/context/StatAnalyzer.java b/context/src/main/java/org/apache/lucene/morphology/context/StatAnalyzer.java new file mode 100644 index 0000000..23a3020 --- /dev/null +++ b/context/src/main/java/org/apache/lucene/morphology/context/StatAnalyzer.java @@ -0,0 +1,34 @@ +package org.apache.lucene.morphology.context; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.core.LowerCaseFilter; +import org.apache.lucene.analysis.standard.StandardFilter; + +import java.io.IOException; +import java.io.Reader; + +/** + * Created by akuznetsov on 6/24/15. + */ +public class StatAnalyzer extends Analyzer { + + + + @Override + protected TokenStreamComponents createComponents(String s) { + + SimpleTokenizer src = new SimpleTokenizer(); + TokenFilter filter = new StandardFilter(src); + filter = new LowerCaseFilter(filter); + + return new TokenStreamComponents(src, filter) { + @Override + protected void setReader(final Reader reader) throws IOException { + super.setReader(reader); + } + }; + } + + +} \ No newline at end of file diff --git a/context/src/test/java/org/apache/lucene/morphology/context/SimpleTokenizerTest.java b/context/src/test/java/org/apache/lucene/morphology/context/SimpleTokenizerTest.java new file mode 100644 index 0000000..9aaf06d --- /dev/null +++ b/context/src/test/java/org/apache/lucene/morphology/context/SimpleTokenizerTest.java @@ -0,0 +1,32 @@ +package org.apache.lucene.morphology.context; + + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.junit.Test; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStreamReader; + +public class SimpleTokenizerTest { + + @Test + public void testSimpleTokenizer() throws IOException { + Analyzer statAnalyzer = new StatAnalyzer(); + InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год? - и что я жду тебя, где вино".getBytes()), "UTF-8"); + + TokenStream tokenStream = statAnalyzer.tokenStream(null, reader); + tokenStream.reset(); + + boolean wordSeen = false; + while (tokenStream.incrementToken()) { + CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class); + PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class); + System.out.println(charTerm.toString()); + } + } + +} \ No newline at end of file diff --git a/etc/header.txt b/etc/header.txt index 76f2dc1..eea86d3 100644 --- a/etc/header.txt +++ b/etc/header.txt @@ -1,4 +1,4 @@ -Copyright 2009 Alexander Kuznetsov +Copyright ${project.inceptionYear} ${owner} Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyAnalyzer.java b/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyAnalyzer.java index 0e9bec1..0554577 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyAnalyzer.java +++ b/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyAnalyzer.java @@ -51,25 +51,6 @@ public class MorphologyAnalyzer extends Analyzer { protected TokenStreamComponents createComponents(String s) { StandardTokenizer src = new StandardTokenizer(); - final PayloadEncoder encoder = new PayloadEncoder() { - @Override - public BytesRef encode(char[] buffer) { - final Float payload = Float.valueOf(new String(buffer)); - System.out.println(payload); - final byte[] bytes = PayloadHelper.encodeFloat(payload); - return new BytesRef(bytes, 0, bytes.length); - } - - @Override - public BytesRef encode(char[] buffer, int offset, int length) { - - final Float payload = Float.valueOf(new String(buffer, offset, length)); - System.out.println(payload); - final byte[] bytes = PayloadHelper.encodeFloat(payload); - - return new BytesRef(bytes, 0, bytes.length); - } - }; TokenFilter filter = new StandardFilter(src); filter = new LowerCaseFilter(filter); filter = new MorphologyFilter(filter, luceneMorph); diff --git a/pom.xml b/pom.xml index 8709386..3bda31c 100644 --- a/pom.xml +++ b/pom.xml @@ -1,5 +1,6 @@ - + 4.0.0 org.apache.lucene.morphology morphology @@ -64,20 +65,8 @@ - - - - mc-release - maven-license-plugin repository of releases - http://mc-repo.googlecode.com/svn/maven2/releases - - false - - - true - - - + + @@ -98,30 +87,37 @@ 1.7 - - maven-license-plugin - com.mathieucarbou.mojo - - ${project.parent.basedir} -
etc/header.txt
- - **/*.txt - **/*.info - **/pom.xml - - - **/src/** - -
- - - test - - check - - - -
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
@@ -160,6 +156,6 @@ dictionary-reader russian english - context - + context +
\ No newline at end of file