diff --git a/context/pom.xml b/context/pom.xml
index 630d249..4702f88 100644
--- a/context/pom.xml
+++ b/context/pom.xml
@@ -1,23 +1,36 @@
-
- 4.0.0
-
+
+ 4.0.0
+
+ org.apache.lucene.morphology
+ morphology
+ 1.2-SNAPSHOT
+
org.apache.lucene.morphology
- morphology
- 1.2-SNAPSHOT
-
- org.apache.lucene.morphology
- context
- 1.0-SNAPSHOT
- context
- http://maven.apache.org
-
-
- junit
- junit
- 3.8.1
- test
-
-
+ context
+ 1.0-SNAPSHOT
+ context
+ http://maven.apache.org
+
+
+ junit
+ junit
+ 4.8.2
+ test
+
+
+ org.apache.lucene.morphology
+ russian
+ 1.2-SNAPSHOT
+ test
+
+
+ org.apache.lucene.morphology
+ english
+ 1.2-SNAPSHOT
+ test
+
+
diff --git a/context/src/main/java/org/apache/lucene/morphology/context/CalculateContextItem.java b/context/src/main/java/org/apache/lucene/morphology/context/CalculateContextItem.java
index 7d22d0a..ddf2ec8 100644
--- a/context/src/main/java/org/apache/lucene/morphology/context/CalculateContextItem.java
+++ b/context/src/main/java/org/apache/lucene/morphology/context/CalculateContextItem.java
@@ -1,7 +1,52 @@
+/**
+ * Copyright 2015 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.lucene.morphology.context;
-/**
- * Created by alexander on 16.06.15.
- */
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+
public class CalculateContextItem {
+
+    /**
+     * Tokenizes {@code text} with {@link StatAnalyzer}; context items are not built yet.
+     * @param text text to analyze, must not be {@code null}
+     * @return currently always {@code null}
+     * @throws IOException if the token stream cannot be read
+     */
+    public List createContextItems(String text) throws IOException {
+        // Encode and decode with the same explicit charset; the platform default may not be UTF-8.
+        InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(text.getBytes("UTF-8")), "UTF-8");
+        // Analyzer and TokenStream are both Closeable; release them even if tokenization fails.
+        try (Analyzer statAnalyzer = new StatAnalyzer();
+             TokenStream tokenStream = statAnalyzer.tokenStream(null, reader)) {
+            tokenStream.reset();
+            while (tokenStream.incrementToken()) {
+                // TODO: build context items from the term and position-increment attributes.
+            }
+            tokenStream.end();
+        }
+        return null;
+    }
}
diff --git a/context/src/main/java/org/apache/lucene/morphology/context/ContextItem.java b/context/src/main/java/org/apache/lucene/morphology/context/ContextItem.java
index f95fde9..fa93c01 100644
--- a/context/src/main/java/org/apache/lucene/morphology/context/ContextItem.java
+++ b/context/src/main/java/org/apache/lucene/morphology/context/ContextItem.java
@@ -1,11 +1,24 @@
+/**
+ * Copyright 2015 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.lucene.morphology.context;
-/**
- * Created by alexander on 16.06.15.
- */
-public class ContextItem {
+import java.util.Arrays;
+
+public class ContextItem implements Comparable<ContextItem> {
String[][] morphInfo;
- long count = 0;
public ContextItem(String[][] morphInfo) {
this.morphInfo = morphInfo;
@@ -19,11 +32,49 @@ public class ContextItem {
this.morphInfo = morphInfo;
}
- public long getCount() {
- return count;
+    /**
+     * Hashes the nested arrays by content so the result agrees with {@link #equals(Object)}.
+     * Null rows or elements are tolerated, as they are by {@code Arrays.equals} in equals.
+     */
+    @Override
+    public int hashCode() {
+        // deepHashCode recurses into each String[] row instead of using array identity hashes.
+        return Arrays.deepHashCode(morphInfo);
}
- public void setCount(long count) {
- this.count = count;
+
+    /** Two items are equal when their morphInfo rows match element by element. */
+    @Override
+    public boolean equals(Object o) {
+        if (o == this) {
+            return true;
+        }
+        if (o == null || o.getClass() != getClass()) {
+            return false;
+        }
+        // Same runtime class is guaranteed by the check above, so the cast cannot fail.
+        String[][] other = ((ContextItem) o).morphInfo;
+        boolean same = other.length == morphInfo.length;
+        // Stop at the first differing row; rows are compared by content, not identity.
+        for (int row = 0; same && row < morphInfo.length; row++) {
+            same = Arrays.equals(morphInfo[row], other[row]);
+        }
+        return same;
+    }
+
+
+    /** Orders by row count (more rows first), then row length (longer first), then element text. */
+    @Override
+    public int compareTo(ContextItem o) {
+        if (o.morphInfo.length != morphInfo.length) return o.morphInfo.length - morphInfo.length;
+        for (int row = 0; row < morphInfo.length; row++) {
+            final String[] mine = morphInfo[row], theirs = o.morphInfo[row];
+            if (theirs.length != mine.length) return theirs.length - mine.length;
+            for (int col = 0; col < mine.length; col++) {
+                final int cmp = mine[col].compareTo(theirs[col]);
+                if (cmp != 0) return cmp;
+            }
+        }
+        return 0;
}
}
diff --git a/context/src/main/java/org/apache/lucene/morphology/context/ContextStats.java b/context/src/main/java/org/apache/lucene/morphology/context/ContextStats.java
index f19d022..0662dc8 100644
--- a/context/src/main/java/org/apache/lucene/morphology/context/ContextStats.java
+++ b/context/src/main/java/org/apache/lucene/morphology/context/ContextStats.java
@@ -1,8 +1,20 @@
+/**
+ * Copyright 2015 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.lucene.morphology.context;
-/**
- * Created by alexander on 16.06.15.
- */
public class ContextStats {
String[] morphInfo;
double prob;
diff --git a/context/src/main/java/org/apache/lucene/morphology/context/ProbClalucator.java b/context/src/main/java/org/apache/lucene/morphology/context/ProbClalucator.java
index c3a0560..5014bd5 100644
--- a/context/src/main/java/org/apache/lucene/morphology/context/ProbClalucator.java
+++ b/context/src/main/java/org/apache/lucene/morphology/context/ProbClalucator.java
@@ -1,8 +1,20 @@
+/**
+ * Copyright 2015 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.lucene.morphology.context;
-/**
- * Created by alexander on 16.06.15.
- */
public class ProbClalucator {
diff --git a/context/src/main/java/org/apache/lucene/morphology/context/SimpleTokenizer.java b/context/src/main/java/org/apache/lucene/morphology/context/SimpleTokenizer.java
index 385a361..7a6ab5f 100644
--- a/context/src/main/java/org/apache/lucene/morphology/context/SimpleTokenizer.java
+++ b/context/src/main/java/org/apache/lucene/morphology/context/SimpleTokenizer.java
@@ -1,3 +1,18 @@
+/**
+ * Copyright 2015 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.lucene.morphology.context;
import org.apache.lucene.analysis.Tokenizer;
@@ -13,9 +28,6 @@ import java.util.HashSet;
import java.util.LinkedList;
import java.util.Set;
-/**
- * Created by alexander on 16.06.15.
- */
public class SimpleTokenizer extends Tokenizer {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
diff --git a/context/src/main/java/org/apache/lucene/morphology/context/StatAnalyzer.java b/context/src/main/java/org/apache/lucene/morphology/context/StatAnalyzer.java
new file mode 100644
index 0000000..23a3020
--- /dev/null
+++ b/context/src/main/java/org/apache/lucene/morphology/context/StatAnalyzer.java
@@ -0,0 +1,34 @@
+package org.apache.lucene.morphology.context;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * Created by akuznetsov on 6/24/15.
+ */
+public class StatAnalyzer extends Analyzer {
+
+
+
+    /**
+     * Builds the analysis chain: {@link SimpleTokenizer}, then {@link StandardFilter}, then
+     * {@link LowerCaseFilter}.
+     *
+     * @param s field name; not used when assembling the chain
+     * @return the tokenizer together with the last filter of the chain
+     */
+    @Override
+    protected TokenStreamComponents createComponents(String s) {
+        SimpleTokenizer src = new SimpleTokenizer();
+        TokenFilter filter = new LowerCaseFilter(new StandardFilter(src));
+        // A plain TokenStreamComponents suffices: nothing in this chain needs custom reader handling.
+        return new TokenStreamComponents(src, filter);
+    }
+
+
+}
\ No newline at end of file
diff --git a/context/src/test/java/org/apache/lucene/morphology/context/SimpleTokenizerTest.java b/context/src/test/java/org/apache/lucene/morphology/context/SimpleTokenizerTest.java
new file mode 100644
index 0000000..9aaf06d
--- /dev/null
+++ b/context/src/test/java/org/apache/lucene/morphology/context/SimpleTokenizerTest.java
@@ -0,0 +1,32 @@
+package org.apache.lucene.morphology.context;
+
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+
+public class SimpleTokenizerTest {
+
+    /** Smoke test: prints each token of a UTF-8 encoded sample sentence run through StatAnalyzer. */
+    @Test
+    public void testSimpleTokenizer() throws IOException {
+        // Encode with the same charset the reader decodes with; the platform default may differ.
+        InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год? - и что я жду тебя, где вино".getBytes("UTF-8")), "UTF-8");
+        try (Analyzer statAnalyzer = new StatAnalyzer();
+             TokenStream tokenStream = statAnalyzer.tokenStream(null, reader)) {
+            CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
+            tokenStream.reset();
+            while (tokenStream.incrementToken()) {
+                System.out.println(charTerm.toString());
+            }
+            tokenStream.end();
+        }
+    }
+
+}
\ No newline at end of file
diff --git a/etc/header.txt b/etc/header.txt
index 76f2dc1..eea86d3 100644
--- a/etc/header.txt
+++ b/etc/header.txt
@@ -1,4 +1,4 @@
-Copyright 2009 Alexander Kuznetsov
+Copyright ${project.inceptionYear} ${owner}
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
diff --git a/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyAnalyzer.java b/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyAnalyzer.java
index 0e9bec1..0554577 100644
--- a/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyAnalyzer.java
+++ b/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyAnalyzer.java
@@ -51,25 +51,6 @@ public class MorphologyAnalyzer extends Analyzer {
protected TokenStreamComponents createComponents(String s) {
StandardTokenizer src = new StandardTokenizer();
- final PayloadEncoder encoder = new PayloadEncoder() {
- @Override
- public BytesRef encode(char[] buffer) {
- final Float payload = Float.valueOf(new String(buffer));
- System.out.println(payload);
- final byte[] bytes = PayloadHelper.encodeFloat(payload);
- return new BytesRef(bytes, 0, bytes.length);
- }
-
- @Override
- public BytesRef encode(char[] buffer, int offset, int length) {
-
- final Float payload = Float.valueOf(new String(buffer, offset, length));
- System.out.println(payload);
- final byte[] bytes = PayloadHelper.encodeFloat(payload);
-
- return new BytesRef(bytes, 0, bytes.length);
- }
- };
TokenFilter filter = new StandardFilter(src);
filter = new LowerCaseFilter(filter);
filter = new MorphologyFilter(filter, luceneMorph);
diff --git a/pom.xml b/pom.xml
index 8709386..3bda31c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,5 +1,6 @@
-
+
4.0.0
org.apache.lucene.morphology
morphology
@@ -64,20 +65,8 @@
-
-
-
- mc-release
- maven-license-plugin repository of releases
- http://mc-repo.googlecode.com/svn/maven2/releases
-
- false
-
-
- true
-
-
-
+
+
@@ -98,30 +87,37 @@
1.7
-
- maven-license-plugin
- com.mathieucarbou.mojo
-
- ${project.parent.basedir}
-
-
- **/*.txt
- **/*.info
- **/pom.xml
-
-
- **/src/**
-
-
-
-
- test
-
- check
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -160,6 +156,6 @@
dictionary-reader
russian
english
- context
-
+ context
+
\ No newline at end of file