rollback on wrong version of morphology, adding interface for morphology

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@88 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
alexander.a.kuznetsov
2009-11-17 14:03:59 +00:00
parent 16613c543b
commit 1273cf96ed
19 changed files with 263 additions and 1145 deletions

View File

@ -22,6 +22,6 @@ import java.io.IOException;
public class RussianLuceneMorphology extends LuceneMorphology {
public RussianLuceneMorphology() throws IOException {
super(RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"),RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/prefixes.info"), new RussianLetterDecoderEncoder());
super(RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder());
}
}

View File

@ -15,11 +15,11 @@
*/
package org.apache.lucene.morphology.russian;
import org.apache.lucene.morphology.Morphology;
import org.apache.lucene.morphology.MorphologyImpl;
import java.io.IOException;
public class RussianMorphology extends Morphology {
public class RussianMorphology extends MorphologyImpl {
public RussianMorphology() throws IOException {
super(RussianMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder());

View File

@ -1,60 +0,0 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.russian;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import java.io.IOException;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.HashSet;
/**
 * Crude speed check for {@link RussianAnalayzer}.
 *
 * <p>Runs the analyzer over the same book twice and prints the wall-clock time,
 * in milliseconds, of the second run only. The first run is untimed (presumably
 * a warm-up pass so that class loading and dictionary loading do not distort
 * the measurement).
 */
public class TestSpeed {
    /** Book that is processed when no path is given on the command line. */
    private static final String DEFAULT_BOOK = "C:/tmp/_Aleksandr_Suhov_Tanets_na_raskalennyih_uglyah1.fb2";

    /**
     * Entry point.
     *
     * @param args optional; {@code args[0]} overrides the default book path
     * @throws IOException if the book cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        String bookName = (args != null && args.length > 0) ? args[0] : DEFAULT_BOOK;
        RussianAnalayzer russianAnalayzer = new RussianAnalayzer();

        // Untimed first pass.
        bookProccess(russianAnalayzer, bookName);

        // Primitive long: no need to box the timestamp.
        long start = System.currentTimeMillis();
        bookProccess(russianAnalayzer, bookName);
        System.out.println("Done in " + (System.currentTimeMillis() - start));
    }

    /**
     * Pulls every token of the given book through the analyzer and discards it.
     *
     * @param russianAnalayzer analyzer under test
     * @param bookName         path of a UTF-8 encoded text file
     * @return number of tokens the analyzer produced
     * @throws IOException if the file cannot be opened or read
     */
    private static long bookProccess(RussianAnalayzer russianAnalayzer, String bookName) throws IOException {
        FileInputStream inputStream = new FileInputStream(bookName);
        try {
            TokenStream tokenStream = russianAnalayzer.tokenStream(null, new InputStreamReader(inputStream, "UTF-8"));
            final Token reusableToken = new Token();
            long tokenCount = 0;
            // Count only real tokens; the terminating null is not a token.
            while (tokenStream.next(reusableToken) != null) {
                tokenCount++;
            }
            return tokenCount;
        } finally {
            // Always release the file handle, even if tokenizing fails midway.
            inputStream.close();
        }
    }
}

View File

@ -1,96 +0,0 @@
11
наи
е
8
258
255
289
252
292
262
296
286
наи
и
2
263
297
наи
ю
4
250
249
283
284
по
й
5
250
251
248
247
269
по
е
3
255
252
269
наи
й
12
239
273
250
251
248
277
247
282
281
243
285
284
наи
о
6
274
253
276
287
242
240
наи
м
10
256
290
257
291
279
278
294
260
244
245
наи
х
6
259
293
261
295
264
298
наи
я
2
246
280
наи
у
4
275
254
288
241

View File

@ -33,7 +33,7 @@ public class RussianLuceneMorphTest {
@Before
public void setUp() throws IOException {
luceneMorph = new RussianLuceneMorphology();
luceneMorph = new LuceneMorphology(this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder());
}
@Test

View File

@ -1,4 +1,3 @@
наилучший хороший
еду еда ехать
тестов тест
вина вино вина
@ -17,8 +16,4 @@
тосклив тоскливый
лучший хороший
на на
тест тест тесто
спам спам
спама спам
наигранный наигранный
наивный наивный
тест тест тесто