Working on prefix hypotheses

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@86 d817d54c-26ab-11de-abc9-2f7d1455ff7a
2009-11-11 22:21:14 +00:00
parent 97fa8fa868
commit 6246f020fd
10 changed files with 577 additions and 15 deletions
@@ -0,0 +1,60 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.morphology.russian;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Token;
+
+import java.io.IOException;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+import java.util.HashSet;
+
+/**
+ * Manual speed check: feeds a book through {@link RussianAnalayzer} twice and prints
+ * the elapsed milliseconds of the second pass. The book path is the first command-line
+ * argument, or {@link #DEFAULT_BOOK} when none is given.
+ */
+public class TestSpeed {
+    /** Book analysed when no path is passed on the command line. */
+    private static final String DEFAULT_BOOK = "C:/tmp/_Aleksandr_Suhov_Tanets_na_raskalennyih_uglyah1.fb2";
+
+    public static void main(String[] args) throws IOException {
+        String bookName = args.length > 0 ? args[0] : DEFAULT_BOOK;
+        RussianAnalayzer russianAnalayzer = new RussianAnalayzer();
+        bookProccess(russianAnalayzer, bookName); // untimed first pass, presumably a warm-up
+        long start = System.currentTimeMillis(); // primitive long: no boxing around the measurement
+        bookProccess(russianAnalayzer, bookName);
+        System.out.println("Done in " + (System.currentTimeMillis() - start));
+    }
+
+    /**
+     * Opens {@code bookName} as UTF-8 and pulls tokens from the analyzer until {@code next} returns null.
+     * @throws IOException if the file cannot be opened or read
+     */
+    private static void bookProccess(RussianAnalayzer russianAnalayzer, String bookName) throws IOException {
+        FileInputStream inputStream = new FileInputStream(bookName);
+        try {
+            TokenStream tokenStream = russianAnalayzer.tokenStream(null, new InputStreamReader(inputStream, "UTF-8"));
+            final Token reusableToken = new Token();
+            while (tokenStream.next(reusableToken) != null) {
+                // Tokens are discarded; only the time spent producing them matters.
+            }
+        } finally {
+            inputStream.close(); // release the file handle even if analysis throws
+        }
+    }
+}
@@ -18,4 +18,6 @@
 на на
 тест тест тесто
 спам спам
-спама спам
+спама спам
+наигранный  наигранный  
+наивный наивный