adding licence
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@14 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
parent
fe855dfa51
commit
3f26888bde
13
etc/header.txt
Normal file
13
etc/header.txt
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
Copyright 2009 Alexander Kuznetsov
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
110
pom.xml
110
pom.xml
@ -4,9 +4,27 @@
|
|||||||
<groupId>org.apache.lucene</groupId>
|
<groupId>org.apache.lucene</groupId>
|
||||||
<artifactId>russian-morpholgy</artifactId>
|
<artifactId>russian-morpholgy</artifactId>
|
||||||
<packaging>jar</packaging>
|
<packaging>jar</packaging>
|
||||||
<version>1.0-SNAPSHOT</version>
|
<version>0.5-SNAPSHOT</version>
|
||||||
<name>russian-morpholgy</name>
|
<name>russian-morpholgy</name>
|
||||||
<url>http://maven.apache.org</url>
|
<url>http://maven.apache.org</url>
|
||||||
|
|
||||||
|
<distributionManagement>
|
||||||
|
<repository>
|
||||||
|
<id>russian-morpholgy</id>
|
||||||
|
<url>
|
||||||
|
../repo/releases
|
||||||
|
</url>
|
||||||
|
</repository>
|
||||||
|
<snapshotRepository>
|
||||||
|
<id>russian-morpholgy-snapshots</id>
|
||||||
|
<url>
|
||||||
|
../repo/snapshots
|
||||||
|
</url>
|
||||||
|
<uniqueVersion>true</uniqueVersion>
|
||||||
|
</snapshotRepository>
|
||||||
|
</distributionManagement>
|
||||||
|
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>junit</groupId>
|
<groupId>junit</groupId>
|
||||||
@ -29,8 +47,68 @@
|
|||||||
</dependency>
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
<build>
|
<repositories>
|
||||||
|
<repository>
|
||||||
|
<id>maven2-repository.dev.java.net</id>
|
||||||
|
<name>Java.net Repository for Maven</name>
|
||||||
|
<url>http://download.java.net/maven/2/</url>
|
||||||
|
</repository>
|
||||||
|
</repositories>
|
||||||
|
|
||||||
|
<pluginRepositories>
|
||||||
|
<pluginRepository>
|
||||||
|
<id>mc-release</id>
|
||||||
|
<name>maven-license-plugin repository of releases</name>
|
||||||
|
<url>http://mc-repo.googlecode.com/svn/maven2/releases</url>
|
||||||
|
|
||||||
|
<snapshots>
|
||||||
|
<enabled>false</enabled>
|
||||||
|
</snapshots>
|
||||||
|
|
||||||
|
<releases>
|
||||||
|
<enabled>true</enabled>
|
||||||
|
</releases>
|
||||||
|
</pluginRepository>
|
||||||
|
</pluginRepositories>
|
||||||
|
|
||||||
|
<reporting>
|
||||||
<plugins>
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-javadoc-plugin</artifactId>
|
||||||
|
</plugin>
|
||||||
|
|
||||||
|
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.codehaus.mojo</groupId>
|
||||||
|
<artifactId>cobertura-maven-plugin</artifactId>
|
||||||
|
</plugin>
|
||||||
|
|
||||||
|
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-pmd-plugin</artifactId>
|
||||||
|
<configuration>
|
||||||
|
<linkXref>true</linkXref>
|
||||||
|
<sourceEncoding>utf-8</sourceEncoding>
|
||||||
|
<minimumTokens>100</minimumTokens>
|
||||||
|
<targetJdk>1.5</targetJdk>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</reporting>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<extensions>
|
||||||
|
<extension>
|
||||||
|
<groupId>org.jvnet.wagon-svn</groupId>
|
||||||
|
<artifactId>wagon-svn</artifactId>
|
||||||
|
<version>1.8</version>
|
||||||
|
</extension>
|
||||||
|
</extensions>
|
||||||
|
|
||||||
|
<plugins>
|
||||||
|
|
||||||
<plugin>
|
<plugin>
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
<artifactId>maven-compiler-plugin</artifactId>
|
<artifactId>maven-compiler-plugin</artifactId>
|
||||||
@ -39,6 +117,34 @@
|
|||||||
<target>1.5</target>
|
<target>1.5</target>
|
||||||
</configuration>
|
</configuration>
|
||||||
</plugin>
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<!--
|
||||||
|
usage: http://code.google.com/p/maven-license-plugin/wiki/HowTo
|
||||||
|
-->
|
||||||
|
<artifactId>maven-license-plugin</artifactId>
|
||||||
|
<groupId>com.mathieucarbou.mojo</groupId>
|
||||||
|
|
||||||
|
<configuration>
|
||||||
|
<basedir>${project.parent.basedir}</basedir>
|
||||||
|
<header>etc/header.txt</header>
|
||||||
|
<excludes>
|
||||||
|
<exclude>**/*.txt</exclude>
|
||||||
|
</excludes>
|
||||||
|
<includes>
|
||||||
|
<include>**/src/**</include>
|
||||||
|
<include>**/pom.xml</include>
|
||||||
|
</includes>
|
||||||
|
</configuration>
|
||||||
|
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<phase>test</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>check</goal>
|
||||||
|
</goals>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
</plugins>
|
</plugins>
|
||||||
</build>
|
</build>
|
||||||
|
|
||||||
|
@ -1,13 +0,0 @@
|
|||||||
package org.apache.lucene;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Hello world!
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
public class App
|
|
||||||
{
|
|
||||||
public static void main( String[] args )
|
|
||||||
{
|
|
||||||
System.out.println( "Hello World!" );
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,14 +1,31 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.russian.morphology;
|
package org.apache.lucene.russian.morphology;
|
||||||
|
|
||||||
import org.apache.lucene.russian.morphology.dictonary.DictonaryReader;
|
import org.apache.lucene.russian.morphology.dictonary.DictonaryReader;
|
||||||
import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
|
import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
|
||||||
|
import org.apache.lucene.russian.morphology.evristics.Evristic;
|
||||||
import org.apache.lucene.russian.morphology.evristics.StatiticsCollectors;
|
import org.apache.lucene.russian.morphology.evristics.StatiticsCollectors;
|
||||||
import org.apache.lucene.russian.morphology.evristics.SuffixCounter;
|
import org.apache.lucene.russian.morphology.evristics.SuffixCounter;
|
||||||
import org.apache.lucene.russian.morphology.evristics.Evristic;
|
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.IOException;
|
||||||
import java.util.*;
|
import java.util.Arrays;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.Collection;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
|
||||||
public class EvristicBuilder {
|
public class EvristicBuilder {
|
||||||
@ -23,12 +40,12 @@ public class EvristicBuilder {
|
|||||||
Object[] objects = counterCollection.toArray();
|
Object[] objects = counterCollection.toArray();
|
||||||
Arrays.sort(objects);
|
Arrays.sort(objects);
|
||||||
System.out.println("Length " + objects.length + " ingored words " + statiticsCollectors.getIgnoredCount());
|
System.out.println("Length " + objects.length + " ingored words " + statiticsCollectors.getIgnoredCount());
|
||||||
for(int i = 0; i < 10; i++){
|
for (int i = 0; i < 10; i++) {
|
||||||
System.out.println(objects[i]);
|
System.out.println(objects[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
final Evristic evristic = new Evristic();
|
final Evristic evristic = new Evristic();
|
||||||
for(int i = 0; i < objects.length; i++){
|
for (int i = 0; i < objects.length; i++) {
|
||||||
evristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic());
|
evristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,3 +1,19 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.russian.morphology;
|
package org.apache.lucene.russian.morphology;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -44,12 +60,12 @@ public class RussianSuffixDecoderEncoder {
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static public boolean checkCharacter(char c){
|
static public boolean checkCharacter(char c) {
|
||||||
int code = 0 + c;
|
int code = 0 + c;
|
||||||
if(code == 45) return true;
|
if (code == 45) return true;
|
||||||
code -= RUSSIAN_SMALL_LETTER_OFFSET;
|
code -= RUSSIAN_SMALL_LETTER_OFFSET;
|
||||||
if(code == 34) return true;
|
if (code == 34) return true;
|
||||||
if(code > 0 && code < 33) return true;
|
if (code > 0 && code < 33) return true;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,3 +1,19 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.russian.morphology;
|
package org.apache.lucene.russian.morphology;
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,7 +1,23 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.russian.morphology;
|
package org.apache.lucene.russian.morphology;
|
||||||
|
|
||||||
|
|
||||||
public class WrongCharaterException extends RuntimeException{
|
public class WrongCharaterException extends RuntimeException {
|
||||||
public WrongCharaterException() {
|
public WrongCharaterException() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,13 +1,29 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.russian.morphology.analayzer;
|
package org.apache.lucene.russian.morphology.analayzer;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
|
||||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
|
||||||
import java.io.Reader;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
public class RussianMorphlogyAnalayzer extends Analyzer {
|
public class RussianMorphlogyAnalayzer extends Analyzer {
|
||||||
private SuffixEvristics suffixEvristics;
|
private SuffixEvristics suffixEvristics;
|
||||||
@ -20,6 +36,6 @@ public class RussianMorphlogyAnalayzer extends Analyzer {
|
|||||||
TokenStream result = new StandardTokenizer(reader);
|
TokenStream result = new StandardTokenizer(reader);
|
||||||
result = new StandardFilter(result);
|
result = new StandardFilter(result);
|
||||||
result = new LowerCaseFilter(result);
|
result = new LowerCaseFilter(result);
|
||||||
return new RussianMorphlogyFilter(result,suffixEvristics);
|
return new RussianMorphlogyFilter(result, suffixEvristics);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,3 +1,19 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.russian.morphology.analayzer;
|
package org.apache.lucene.russian.morphology.analayzer;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Token;
|
import org.apache.lucene.analysis.Token;
|
||||||
@ -17,10 +33,10 @@ public class RussianMorphlogyFilter extends TokenFilter {
|
|||||||
|
|
||||||
public Token next(final Token reusableToken) throws IOException {
|
public Token next(final Token reusableToken) throws IOException {
|
||||||
Token nextToken = input.next(reusableToken);
|
Token nextToken = input.next(reusableToken);
|
||||||
if(nextToken == null || nextToken.term().length() == 0) return nextToken;
|
if (nextToken == null || nextToken.term().length() == 0) return nextToken;
|
||||||
String word = nextToken.term();
|
String word = nextToken.term();
|
||||||
Character testC = word.charAt(0);
|
Character testC = word.charAt(0);
|
||||||
if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC){
|
if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC) {
|
||||||
return nextToken;
|
return nextToken;
|
||||||
}
|
}
|
||||||
Token current = (Token) nextToken.clone();
|
Token current = (Token) nextToken.clone();
|
||||||
|
@ -1,10 +1,25 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.russian.morphology.analayzer;
|
package org.apache.lucene.russian.morphology.analayzer;
|
||||||
|
|
||||||
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.HashSet;
|
|
||||||
|
|
||||||
|
|
||||||
public class SuffixEvristics {
|
public class SuffixEvristics {
|
||||||
@ -46,23 +61,23 @@ public class SuffixEvristics {
|
|||||||
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
|
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
|
||||||
String suffixS = form.substring(startSymbol);
|
String suffixS = form.substring(startSymbol);
|
||||||
|
|
||||||
if(!chechSuffix(suffixS)) return form;
|
if (!chechSuffix(suffixS)) return form;
|
||||||
|
|
||||||
Long suffix = RussianSuffixDecoderEncoder.encode(suffixS);
|
Long suffix = RussianSuffixDecoderEncoder.encode(suffixS);
|
||||||
|
|
||||||
int index = Arrays.binarySearch(keys,suffix);
|
int index = Arrays.binarySearch(keys, suffix);
|
||||||
if(index < -1){
|
if (index < -1) {
|
||||||
System.out.println(" " + form);
|
System.out.println(" " + form);
|
||||||
return form;
|
return form;
|
||||||
}else{
|
} else {
|
||||||
String nSuffix = RussianSuffixDecoderEncoder.decode(values[index]);
|
String nSuffix = RussianSuffixDecoderEncoder.decode(values[index]);
|
||||||
return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
|
return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private boolean chechSuffix(String suffix){
|
private boolean chechSuffix(String suffix) {
|
||||||
for(int i = 0; i < suffix.length(); i++){
|
for (int i = 0; i < suffix.length(); i++) {
|
||||||
if (!RussianSuffixDecoderEncoder.checkCharacter(suffix.charAt(i))) return false;
|
if (!RussianSuffixDecoderEncoder.checkCharacter(suffix.charAt(i))) return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
|
@ -1,9 +1,28 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.russian.morphology.dictonary;
|
package org.apache.lucene.russian.morphology.dictonary;
|
||||||
|
|
||||||
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.io.*;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -35,11 +54,11 @@ public class DictonaryReader {
|
|||||||
sckipBlock(bufferedReader);
|
sckipBlock(bufferedReader);
|
||||||
sckipBlock(bufferedReader);
|
sckipBlock(bufferedReader);
|
||||||
readPrefix(bufferedReader);
|
readPrefix(bufferedReader);
|
||||||
readWords(bufferedReader,wordProccessor);
|
readWords(bufferedReader, wordProccessor);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void readWords(BufferedReader reader,WordProccessor wordProccessor) throws IOException {
|
private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
|
||||||
String s = reader.readLine();
|
String s = reader.readLine();
|
||||||
int count = Integer.valueOf(s);
|
int count = Integer.valueOf(s);
|
||||||
for (int i = 0; i < count; i++) {
|
for (int i = 0; i < count; i++) {
|
||||||
@ -61,8 +80,8 @@ public class DictonaryReader {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private String cleanString(String s){
|
private String cleanString(String s) {
|
||||||
return s.replace((char)(34 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET),(char)(6 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET));
|
return s.replace((char) (34 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (6 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET));
|
||||||
}
|
}
|
||||||
|
|
||||||
private void sckipBlock(BufferedReader reader) throws IOException {
|
private void sckipBlock(BufferedReader reader) throws IOException {
|
||||||
|
@ -1,3 +1,19 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.russian.morphology.dictonary;
|
package org.apache.lucene.russian.morphology.dictonary;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -1,11 +1,27 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.russian.morphology.dictonary;
|
package org.apache.lucene.russian.morphology.dictonary;
|
||||||
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
|
||||||
public class IgnoredFormReader {
|
public class IgnoredFormReader {
|
||||||
|
@ -1,7 +1,23 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.russian.morphology.dictonary;
|
package org.apache.lucene.russian.morphology.dictonary;
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Represents a word and all its forms.
|
* Represents a word and all its forms.
|
||||||
@ -14,7 +30,7 @@ public class WordCard {
|
|||||||
this.canonicalFrom = canonicalFrom;
|
this.canonicalFrom = canonicalFrom;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void addFrom(String word){
|
protected void addFrom(String word) {
|
||||||
wordsFroms.add(word);
|
wordsFroms.add(word);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,3 +1,19 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.russian.morphology.dictonary;
|
package org.apache.lucene.russian.morphology.dictonary;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
@ -1,9 +1,28 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.russian.morphology.evristics;
|
package org.apache.lucene.russian.morphology.evristics;
|
||||||
|
|
||||||
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
||||||
|
|
||||||
import java.util.*;
|
import java.io.BufferedReader;
|
||||||
import java.io.*;
|
import java.io.FileReader;
|
||||||
|
import java.io.FileWriter;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.TreeMap;
|
||||||
|
|
||||||
|
|
||||||
public class Evristic {
|
public class Evristic {
|
||||||
@ -35,7 +54,7 @@ public class Evristic {
|
|||||||
String s = reader.readLine();
|
String s = reader.readLine();
|
||||||
while (s != null) {
|
while (s != null) {
|
||||||
String[] sfns = s.split(" ");
|
String[] sfns = s.split(" ");
|
||||||
if(sfns.length == 2){
|
if (sfns.length == 2) {
|
||||||
encodedSuffixesPairs.put(Long.valueOf(sfns[0]), Long.valueOf(sfns[0]));
|
encodedSuffixesPairs.put(Long.valueOf(sfns[0]), Long.valueOf(sfns[0]));
|
||||||
}
|
}
|
||||||
s = reader.readLine();
|
s = reader.readLine();
|
||||||
@ -45,8 +64,8 @@ public class Evristic {
|
|||||||
|
|
||||||
public void writeToFile(String file) throws IOException {
|
public void writeToFile(String file) throws IOException {
|
||||||
FileWriter writer = new FileWriter(file);
|
FileWriter writer = new FileWriter(file);
|
||||||
writer.write(encodedSuffixesPairs.size()+"\n");
|
writer.write(encodedSuffixesPairs.size() + "\n");
|
||||||
for(Long k:encodedSuffixesPairs.keySet()){
|
for (Long k : encodedSuffixesPairs.keySet()) {
|
||||||
writer.write("" + k + " " + encodedSuffixesPairs.get(k) + "\n");
|
writer.write("" + k + " " + encodedSuffixesPairs.get(k) + "\n");
|
||||||
}
|
}
|
||||||
writer.close();
|
writer.close();
|
||||||
|
@ -1,26 +1,42 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.russian.morphology.evristics;
|
package org.apache.lucene.russian.morphology.evristics;
|
||||||
|
|
||||||
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
|
|
||||||
import org.apache.lucene.russian.morphology.dictonary.WordCard;
|
|
||||||
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
||||||
|
import org.apache.lucene.russian.morphology.dictonary.WordCard;
|
||||||
|
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
|
||||||
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
|
||||||
public class StatiticsCollectors implements WordProccessor{
|
public class StatiticsCollectors implements WordProccessor {
|
||||||
Map<SuffixEvristic,SuffixCounter> statititics = new HashMap<SuffixEvristic,SuffixCounter>();
|
Map<SuffixEvristic, SuffixCounter> statititics = new HashMap<SuffixEvristic, SuffixCounter>();
|
||||||
|
|
||||||
private Integer ignoredCount = 0;
|
private Integer ignoredCount = 0;
|
||||||
|
|
||||||
public void proccess(WordCard wordCard) {
|
public void proccess(WordCard wordCard) {
|
||||||
for(String form:wordCard.getWordsFroms()){
|
for (String form : wordCard.getWordsFroms()) {
|
||||||
SuffixEvristic suffixEvristic = createEvristic(wordCard.getCanonicalFrom(), form);
|
SuffixEvristic suffixEvristic = createEvristic(wordCard.getCanonicalFrom(), form);
|
||||||
if (suffixEvristic == null) continue;
|
if (suffixEvristic == null) continue;
|
||||||
SuffixCounter suffixCounter = statititics.get(suffixEvristic);
|
SuffixCounter suffixCounter = statititics.get(suffixEvristic);
|
||||||
if(suffixCounter == null){
|
if (suffixCounter == null) {
|
||||||
suffixCounter = new SuffixCounter(suffixEvristic);
|
suffixCounter = new SuffixCounter(suffixEvristic);
|
||||||
statititics.put(suffixEvristic,suffixCounter);
|
statititics.put(suffixEvristic, suffixCounter);
|
||||||
}
|
}
|
||||||
suffixCounter.incrementAmount();
|
suffixCounter.incrementAmount();
|
||||||
}
|
}
|
||||||
@ -30,19 +46,19 @@ public class StatiticsCollectors implements WordProccessor{
|
|||||||
return statititics;
|
return statititics;
|
||||||
}
|
}
|
||||||
|
|
||||||
private SuffixEvristic createEvristic(String word,String form){
|
private SuffixEvristic createEvristic(String word, String form) {
|
||||||
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
|
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
|
||||||
String formSuffix = form.substring(startSymbol);
|
String formSuffix = form.substring(startSymbol);
|
||||||
if(word.length() < startSymbol){
|
if (word.length() < startSymbol) {
|
||||||
ignoredCount++;
|
ignoredCount++;
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
String wordSuffix = word.length() > startSymbol ? word.substring(startSymbol) : "";
|
String wordSuffix = word.length() > startSymbol ? word.substring(startSymbol) : "";
|
||||||
if (wordSuffix.length() > 12){
|
if (wordSuffix.length() > 12) {
|
||||||
System.out.println(word + " " + form);
|
System.out.println(word + " " + form);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
return new SuffixEvristic(formSuffix,wordSuffix);
|
return new SuffixEvristic(formSuffix, wordSuffix);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,10 +1,26 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.russian.morphology.evristics;
|
package org.apache.lucene.russian.morphology.evristics;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Conains information of freqency of suffix evristic
|
* Conains information of freqency of suffix evristic
|
||||||
* in dictionary.
|
* in dictionary.
|
||||||
*/
|
*/
|
||||||
public class SuffixCounter implements Comparable{
|
public class SuffixCounter implements Comparable {
|
||||||
private SuffixEvristic suffixEvristic;
|
private SuffixEvristic suffixEvristic;
|
||||||
private Double amnout = 0.0;
|
private Double amnout = 0.0;
|
||||||
|
|
||||||
@ -12,7 +28,7 @@ public class SuffixCounter implements Comparable{
|
|||||||
this.suffixEvristic = suffixEvristic;
|
this.suffixEvristic = suffixEvristic;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void incrementAmount(){
|
public void incrementAmount() {
|
||||||
amnout++;
|
amnout++;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -33,12 +49,12 @@ public class SuffixCounter implements Comparable{
|
|||||||
}
|
}
|
||||||
|
|
||||||
public int compareTo(Object o) {
|
public int compareTo(Object o) {
|
||||||
if(o instanceof SuffixCounter) return (int) Math.round(Math.signum(((SuffixCounter)o).amnout - amnout));
|
if (o instanceof SuffixCounter) return (int) Math.round(Math.signum(((SuffixCounter) o).amnout - amnout));
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return ""+amnout + " " + suffixEvristic.toString();
|
return "" + amnout + " " + suffixEvristic.toString();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,3 +1,19 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.russian.morphology.evristics;
|
package org.apache.lucene.russian.morphology.evristics;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -1,14 +1,29 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.russian.morphology;
|
package org.apache.lucene.russian.morphology;
|
||||||
|
|
||||||
import org.junit.Test;
|
|
||||||
import static org.junit.Assert.assertThat;
|
|
||||||
import static org.hamcrest.core.IsEqual.equalTo;
|
import static org.hamcrest.core.IsEqual.equalTo;
|
||||||
import org.apache.lucene.russian.morphology.SuffixToLongException;
|
import static org.junit.Assert.assertThat;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
import java.io.InputStream;
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
|
||||||
|
|
||||||
public class RussianSuffixDecoderEncoderTest {
|
public class RussianSuffixDecoderEncoderTest {
|
||||||
@ -16,23 +31,23 @@ public class RussianSuffixDecoderEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
public void testShouldCorretDecodeEncode() throws IOException {
|
public void testShouldCorretDecodeEncode() throws IOException {
|
||||||
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/decoder-test-data.txt");
|
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/decoder-test-data.txt");
|
||||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream,"UTF-8"));
|
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||||
String s = bufferedReader.readLine();
|
String s = bufferedReader.readLine();
|
||||||
while(s != null){
|
while (s != null) {
|
||||||
String[] qa = s.trim().split(" ");
|
String[] qa = s.trim().split(" ");
|
||||||
Long ecodedSuffix = RussianSuffixDecoderEncoder.encode(qa[0]);
|
Long ecodedSuffix = RussianSuffixDecoderEncoder.encode(qa[0]);
|
||||||
assertThat(RussianSuffixDecoderEncoder.decode(ecodedSuffix),equalTo(qa[1]));
|
assertThat(RussianSuffixDecoderEncoder.decode(ecodedSuffix), equalTo(qa[1]));
|
||||||
s = bufferedReader.readLine();
|
s = bufferedReader.readLine();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(expected = SuffixToLongException.class)
|
@Test(expected = SuffixToLongException.class)
|
||||||
public void shouldThrownExeptionIfSuffixToLong(){
|
public void shouldThrownExeptionIfSuffixToLong() {
|
||||||
RussianSuffixDecoderEncoder.encode("1234567890123");
|
RussianSuffixDecoderEncoder.encode("1234567890123");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(expected = WrongCharaterException.class)
|
@Test(expected = WrongCharaterException.class)
|
||||||
public void shouldThrownExeptionIfSuffixContainWrongCharater(){
|
public void shouldThrownExeptionIfSuffixContainWrongCharater() {
|
||||||
RussianSuffixDecoderEncoder.encode("1");
|
RussianSuffixDecoderEncoder.encode("1");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,13 +1,28 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.russian.morphology.analayzer;
|
package org.apache.lucene.russian.morphology.analayzer;
|
||||||
|
|
||||||
import junit.framework.TestCase;
|
|
||||||
import org.junit.Test;
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
|
||||||
import org.apache.lucene.analysis.Token;
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.BufferedReader;
|
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
|
|
||||||
|
|
||||||
@ -17,7 +32,7 @@ public class RussianMorphlogyAnalayzerTest {
|
|||||||
public void shouldCorrectProccessText() throws IOException {
|
public void shouldCorrectProccessText() throws IOException {
|
||||||
RussianMorphlogyAnalayzer morphlogyAnalayzer = new RussianMorphlogyAnalayzer();
|
RussianMorphlogyAnalayzer morphlogyAnalayzer = new RussianMorphlogyAnalayzer();
|
||||||
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/russian-text.txt");
|
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/russian-text.txt");
|
||||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream,"UTF-8"));
|
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||||
|
|
||||||
final Token reusableToken = new Token();
|
final Token reusableToken = new Token();
|
||||||
|
|
||||||
@ -25,12 +40,10 @@ public class RussianMorphlogyAnalayzerTest {
|
|||||||
|
|
||||||
|
|
||||||
TokenStream in = morphlogyAnalayzer.tokenStream(null, bufferedReader);
|
TokenStream in = morphlogyAnalayzer.tokenStream(null, bufferedReader);
|
||||||
for (;;)
|
for (; ;) {
|
||||||
{
|
|
||||||
nextToken = in.next(reusableToken);
|
nextToken = in.next(reusableToken);
|
||||||
|
|
||||||
if (nextToken == null)
|
if (nextToken == null) {
|
||||||
{
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,11 +1,29 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.russian.morphology.analayzer;
|
package org.apache.lucene.russian.morphology.analayzer;
|
||||||
|
|
||||||
import org.junit.Test;
|
|
||||||
import static org.junit.Assert.assertThat;
|
|
||||||
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
|
||||||
import static org.hamcrest.core.IsEqual.equalTo;
|
import static org.hamcrest.core.IsEqual.equalTo;
|
||||||
|
import static org.junit.Assert.assertThat;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.BufferedReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
|
||||||
|
|
||||||
public class SuffixEvristicsTest {
|
public class SuffixEvristicsTest {
|
||||||
@ -14,11 +32,11 @@ public class SuffixEvristicsTest {
|
|||||||
public void testShouldDefineCorretCononicalWordForm() throws IOException {
|
public void testShouldDefineCorretCononicalWordForm() throws IOException {
|
||||||
SuffixEvristics suffixEvristics = new SuffixEvristics();
|
SuffixEvristics suffixEvristics = new SuffixEvristics();
|
||||||
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt");
|
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt");
|
||||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream,"UTF-8"));
|
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||||
String s = bufferedReader.readLine();
|
String s = bufferedReader.readLine();
|
||||||
while(s != null){
|
while (s != null) {
|
||||||
String[] qa = s.trim().split(" ");
|
String[] qa = s.trim().split(" ");
|
||||||
assertThat(suffixEvristics.getCanonicalForm(qa[0]),equalTo(qa[1]));
|
assertThat(suffixEvristics.getCanonicalForm(qa[0]), equalTo(qa[1]));
|
||||||
s = bufferedReader.readLine();
|
s = bufferedReader.readLine();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,3 @@
|
|||||||
В условиях нарастающей пурги было сделано 4 успешных захода на посадку. После завершения облета и демонтажа оборудования
|
В условиях нарастающей пурги было сделано 4 успешных захода на посадку. "Все нормально, будем рекомендовать систему к внедрению".
|
||||||
Рубен Есаян дал устную оценку эксперимента:"Все нормально, будем рекомендовать систему к внедрению".
|
Рейсы из Кейптауна (ЮАР) на станцию "Новолазаревская" (Антарктида) совершаются
|
||||||
Летом - с ноября по март - рейсы из Кейптауна (ЮАР) на станцию "Новолазаревская" (Антарктида) совершаются
|
примерно один раз в две недели.
|
||||||
примерно один раз в две недели. Туда привозят людей, питание, оборудование, ГСМ и т.д.
|
|
||||||
что-то
|
|
Loading…
x
Reference in New Issue
Block a user