adding licence

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@14 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
alexander.a.kuznetsov 2009-04-14 07:47:27 +00:00
parent fe855dfa51
commit 3f26888bde
23 changed files with 541 additions and 129 deletions

13
etc/header.txt Normal file
View File

@ -0,0 +1,13 @@
Copyright 2009 Alexander Kuznetsov
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

110
pom.xml
View File

@ -4,9 +4,27 @@
<groupId>org.apache.lucene</groupId>
<artifactId>russian-morpholgy</artifactId>
<packaging>jar</packaging>
<version>1.0-SNAPSHOT</version>
<version>0.5-SNAPSHOT</version>
<name>russian-morpholgy</name>
<url>http://maven.apache.org</url>
<distributionManagement>
<repository>
<id>russian-morpholgy</id>
<url>
../repo/releases
</url>
</repository>
<snapshotRepository>
<id>russian-morpholgy-snapshots</id>
<url>
../repo/snapshots
</url>
<uniqueVersion>true</uniqueVersion>
</snapshotRepository>
</distributionManagement>
<dependencies>
<dependency>
<groupId>junit</groupId>
@ -29,8 +47,68 @@
</dependency>
</dependencies>
<build>
<repositories>
<repository>
<id>maven2-repository.dev.java.net</id>
<name>Java.net Repository for Maven</name>
<url>http://download.java.net/maven/2/</url>
</repository>
</repositories>
<pluginRepositories>
<pluginRepository>
<id>mc-release</id>
<name>maven-license-plugin repository of releases</name>
<url>http://mc-repo.googlecode.com/svn/maven2/releases</url>
<snapshots>
<enabled>false</enabled>
</snapshots>
<releases>
<enabled>true</enabled>
</releases>
</pluginRepository>
</pluginRepositories>
<reporting>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>cobertura-maven-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-pmd-plugin</artifactId>
<configuration>
<linkXref>true</linkXref>
<sourceEncoding>utf-8</sourceEncoding>
<minimumTokens>100</minimumTokens>
<targetJdk>1.5</targetJdk>
</configuration>
</plugin>
</plugins>
</reporting>
<build>
<extensions>
<extension>
<groupId>org.jvnet.wagon-svn</groupId>
<artifactId>wagon-svn</artifactId>
<version>1.8</version>
</extension>
</extensions>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
@ -39,6 +117,34 @@
<target>1.5</target>
</configuration>
</plugin>
<plugin>
<!--
usage: http://code.google.com/p/maven-license-plugin/wiki/HowTo
-->
<artifactId>maven-license-plugin</artifactId>
<groupId>com.mathieucarbou.mojo</groupId>
<configuration>
<basedir>${project.parent.basedir}</basedir>
<header>etc/header.txt</header>
<excludes>
<exclude>**/*.txt</exclude>
</excludes>
<includes>
<include>**/src/**</include>
<include>**/pom.xml</include>
</includes>
</configuration>
<executions>
<execution>
<phase>test</phase>
<goals>
<goal>check</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>

View File

@ -1,13 +0,0 @@
package org.apache.lucene;
/**
 * Minimal command-line entry point that writes a fixed greeting
 * to standard output.
 */
public class App {
    /**
     * Prints the greeting followed by a line terminator.
     *
     * @param args command-line arguments; not read
     */
    public static void main(String[] args) {
        final String greeting = "Hello World!";
        System.out.println(greeting);
    }
}

View File

@ -1,14 +1,31 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology;
import org.apache.lucene.russian.morphology.dictonary.DictonaryReader;
import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
import org.apache.lucene.russian.morphology.evristics.Evristic;
import org.apache.lucene.russian.morphology.evristics.StatiticsCollectors;
import org.apache.lucene.russian.morphology.evristics.SuffixCounter;
import org.apache.lucene.russian.morphology.evristics.Evristic;
import java.io.*;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Set;
public class EvristicBuilder {
@ -23,12 +40,12 @@ public class EvristicBuilder {
Object[] objects = counterCollection.toArray();
Arrays.sort(objects);
System.out.println("Length " + objects.length + " ingored words " + statiticsCollectors.getIgnoredCount());
for(int i = 0; i < 10; i++){
for (int i = 0; i < 10; i++) {
System.out.println(objects[i]);
}
final Evristic evristic = new Evristic();
for(int i = 0; i < objects.length; i++){
for (int i = 0; i < objects.length; i++) {
evristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic());
}

View File

@ -1,3 +1,19 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology;
/**
@ -24,7 +40,7 @@ public class RussianSuffixDecoderEncoder {
c = DASH_CODE;
}
if (c == EE_CHAR) c = E_CHAR;
if (c < 0 || c > 33) throw new WrongCharaterException();
if (c < 0 || c > 33) throw new WrongCharaterException();
result = result * 35L + c;
}
return result;
@ -44,12 +60,12 @@ public class RussianSuffixDecoderEncoder {
return result;
}
static public boolean checkCharacter(char c){
int code = 0 + c;
if(code == 45) return true;
code -= RUSSIAN_SMALL_LETTER_OFFSET;
if(code == 34) return true;
if(code > 0 && code < 33) return true;
return false;
static public boolean checkCharacter(char c) {
int code = 0 + c;
if (code == 45) return true;
code -= RUSSIAN_SMALL_LETTER_OFFSET;
if (code == 34) return true;
if (code > 0 && code < 33) return true;
return false;
}
}

View File

@ -1,3 +1,19 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology;

View File

@ -1,7 +1,23 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology;
public class WrongCharaterException extends RuntimeException{
public class WrongCharaterException extends RuntimeException {
public WrongCharaterException() {
}

View File

@ -1,15 +1,31 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.analayzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.Reader;
import java.io.IOException;
import java.io.Reader;
public class RussianMorphlogyAnalayzer extends Analyzer {
public class RussianMorphlogyAnalayzer extends Analyzer {
private SuffixEvristics suffixEvristics;
public RussianMorphlogyAnalayzer() throws IOException {
@ -20,6 +36,6 @@ public class RussianMorphlogyAnalayzer extends Analyzer {
TokenStream result = new StandardTokenizer(reader);
result = new StandardFilter(result);
result = new LowerCaseFilter(result);
return new RussianMorphlogyFilter(result,suffixEvristics);
return new RussianMorphlogyFilter(result, suffixEvristics);
}
}

View File

@ -1,3 +1,19 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.analayzer;
import org.apache.lucene.analysis.Token;
@ -17,11 +33,11 @@ public class RussianMorphlogyFilter extends TokenFilter {
public Token next(final Token reusableToken) throws IOException {
Token nextToken = input.next(reusableToken);
if(nextToken == null || nextToken.term().length() == 0) return nextToken;
if (nextToken == null || nextToken.term().length() == 0) return nextToken;
String word = nextToken.term();
Character testC = word.charAt(0);
if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC){
return nextToken;
if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC) {
return nextToken;
}
Token current = (Token) nextToken.clone();
return createToken(suffixEvristics.getCanonicalForm(word), current, reusableToken);

View File

@ -1,10 +1,25 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.analayzer;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import java.io.*;
import java.util.Arrays;
import java.util.HashSet;
public class SuffixEvristics {
@ -46,23 +61,23 @@ public class SuffixEvristics {
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
String suffixS = form.substring(startSymbol);
if(!chechSuffix(suffixS)) return form;
if (!chechSuffix(suffixS)) return form;
Long suffix = RussianSuffixDecoderEncoder.encode(suffixS);
int index = Arrays.binarySearch(keys,suffix);
if(index < -1){
int index = Arrays.binarySearch(keys, suffix);
if (index < -1) {
System.out.println(" " + form);
return form;
}else{
} else {
String nSuffix = RussianSuffixDecoderEncoder.decode(values[index]);
return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
}
}
private boolean chechSuffix(String suffix){
for(int i = 0; i < suffix.length(); i++){
private boolean chechSuffix(String suffix) {
for (int i = 0; i < suffix.length(); i++) {
if (!RussianSuffixDecoderEncoder.checkCharacter(suffix.charAt(i))) return false;
}
return true;

View File

@ -1,9 +1,28 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.dictonary;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.*;
import java.io.*;
/**
@ -15,7 +34,7 @@ public class DictonaryReader {
private String fileEncoding = "windows-1251";
private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
private Set<String> ingnoredForm = new HashSet<String>();
private Set<String> ingnoredForm = new HashSet<String>();
public DictonaryReader(String fileName, Set<String> ingnoredForm) {
this.fileName = fileName;
@ -35,11 +54,11 @@ public class DictonaryReader {
sckipBlock(bufferedReader);
sckipBlock(bufferedReader);
readPrefix(bufferedReader);
readWords(bufferedReader,wordProccessor);
readWords(bufferedReader, wordProccessor);
}
private void readWords(BufferedReader reader,WordProccessor wordProccessor) throws IOException {
private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
String s = reader.readLine();
int count = Integer.valueOf(s);
for (int i = 0; i < count; i++) {
@ -54,15 +73,15 @@ public class DictonaryReader {
if (models.size() > 0 && !ingnoredForm.contains(models.get(0).getCode())) {
WordCard card = new WordCard(cleanString(models.get(0).create(word)));
for (FlexiaModel fm : models) {
card.addFrom(cleanString(fm.create(word)));
card.addFrom(cleanString(fm.create(word)));
}
wordProccessor.proccess(card);
}
}
}
private String cleanString(String s){
return s.replace((char)(34 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET),(char)(6 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET));
private String cleanString(String s) {
return s.replace((char) (34 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (6 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET));
}
private void sckipBlock(BufferedReader reader) throws IOException {
@ -99,8 +118,8 @@ public class DictonaryReader {
private void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
String[] fl = line.split("\\*");
// forms that split into three parts are ignored (their handling is commented out below)
// if (fl.length == 3)
// flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
// if (fl.length == 3)
// flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
}

View File

@ -1,3 +1,19 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.dictonary;
/**

View File

@ -1,11 +1,27 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.dictonary;
import java.util.Set;
import java.util.HashSet;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Set;
public class IgnoredFormReader {

View File

@ -1,7 +1,23 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.dictonary;
import java.util.List;
import java.util.ArrayList;
import java.util.List;
/**
* Represents a word and all its forms.
@ -14,7 +30,7 @@ public class WordCard {
this.canonicalFrom = canonicalFrom;
}
protected void addFrom(String word){
protected void addFrom(String word) {
wordsFroms.add(word);
}

View File

@ -1,9 +1,25 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.dictonary;
import java.io.IOException;
/**
* Interface allows get information from
* Interface that allows getting information from
* {@link org.apache.lucene.russian.morphology.dictonary.DictonaryReader}.
*/
public interface WordProccessor {

View File

@ -1,9 +1,28 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.evristics;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import java.util.*;
import java.io.*;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.TreeMap;
public class Evristic {
@ -35,7 +54,7 @@ public class Evristic {
String s = reader.readLine();
while (s != null) {
String[] sfns = s.split(" ");
if(sfns.length == 2){
if (sfns.length == 2) {
encodedSuffixesPairs.put(Long.valueOf(sfns[0]), Long.valueOf(sfns[0]));
}
s = reader.readLine();
@ -45,8 +64,8 @@ public class Evristic {
public void writeToFile(String file) throws IOException {
FileWriter writer = new FileWriter(file);
writer.write(encodedSuffixesPairs.size()+"\n");
for(Long k:encodedSuffixesPairs.keySet()){
writer.write(encodedSuffixesPairs.size() + "\n");
for (Long k : encodedSuffixesPairs.keySet()) {
writer.write("" + k + " " + encodedSuffixesPairs.get(k) + "\n");
}
writer.close();

View File

@ -1,26 +1,42 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.evristics;
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
import org.apache.lucene.russian.morphology.dictonary.WordCard;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import org.apache.lucene.russian.morphology.dictonary.WordCard;
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
import java.util.Map;
import java.util.HashMap;
import java.util.Map;
public class StatiticsCollectors implements WordProccessor{
Map<SuffixEvristic,SuffixCounter> statititics = new HashMap<SuffixEvristic,SuffixCounter>();
public class StatiticsCollectors implements WordProccessor {
Map<SuffixEvristic, SuffixCounter> statititics = new HashMap<SuffixEvristic, SuffixCounter>();
private Integer ignoredCount = 0;
public void proccess(WordCard wordCard) {
for(String form:wordCard.getWordsFroms()){
for (String form : wordCard.getWordsFroms()) {
SuffixEvristic suffixEvristic = createEvristic(wordCard.getCanonicalFrom(), form);
if (suffixEvristic == null) continue;
SuffixCounter suffixCounter = statititics.get(suffixEvristic);
if(suffixCounter == null){
if (suffixCounter == null) {
suffixCounter = new SuffixCounter(suffixEvristic);
statititics.put(suffixEvristic,suffixCounter);
statititics.put(suffixEvristic, suffixCounter);
}
suffixCounter.incrementAmount();
}
@ -30,19 +46,19 @@ public class StatiticsCollectors implements WordProccessor{
return statititics;
}
private SuffixEvristic createEvristic(String word,String form){
private SuffixEvristic createEvristic(String word, String form) {
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
String formSuffix = form.substring(startSymbol);
if(word.length() < startSymbol){
if (word.length() < startSymbol) {
ignoredCount++;
return null;
return null;
}
String wordSuffix = word.length() > startSymbol ? word.substring(startSymbol) : "";
if (wordSuffix.length() > 12){
if (wordSuffix.length() > 12) {
System.out.println(word + " " + form);
return null;
}
return new SuffixEvristic(formSuffix,wordSuffix);
return new SuffixEvristic(formSuffix, wordSuffix);
}

View File

@ -1,10 +1,26 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.evristics;
/**
* Contains information about the frequency of a suffix heuristic
* in dictionary.
* in dictionary.
*/
public class SuffixCounter implements Comparable{
public class SuffixCounter implements Comparable {
private SuffixEvristic suffixEvristic;
private Double amnout = 0.0;
@ -12,7 +28,7 @@ public class SuffixCounter implements Comparable{
this.suffixEvristic = suffixEvristic;
}
public void incrementAmount(){
public void incrementAmount() {
amnout++;
}
@ -33,12 +49,12 @@ public class SuffixCounter implements Comparable{
}
public int compareTo(Object o) {
if(o instanceof SuffixCounter) return (int) Math.round(Math.signum(((SuffixCounter)o).amnout - amnout));
if (o instanceof SuffixCounter) return (int) Math.round(Math.signum(((SuffixCounter) o).amnout - amnout));
return -1;
}
@Override
public String toString() {
return ""+amnout + " " + suffixEvristic.toString();
return "" + amnout + " " + suffixEvristic.toString();
}
}

View File

@ -1,3 +1,19 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.evristics;
/**

View File

@ -1,39 +1,54 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology;
import org.junit.Test;
import static org.junit.Assert.assertThat;
import static org.hamcrest.core.IsEqual.equalTo;
import org.apache.lucene.russian.morphology.SuffixToLongException;
import static org.junit.Assert.assertThat;
import org.junit.Test;
import java.io.InputStream;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
public class RussianSuffixDecoderEncoderTest {
@Test
public void testShouldCorretDecodeEncode() throws IOException {
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/decoder-test-data.txt");
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream,"UTF-8"));
String s = bufferedReader.readLine();
while(s != null){
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/decoder-test-data.txt");
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
String s = bufferedReader.readLine();
while (s != null) {
String[] qa = s.trim().split(" ");
Long ecodedSuffix = RussianSuffixDecoderEncoder.encode(qa[0]);
assertThat(RussianSuffixDecoderEncoder.decode(ecodedSuffix),equalTo(qa[1]));
assertThat(RussianSuffixDecoderEncoder.decode(ecodedSuffix), equalTo(qa[1]));
s = bufferedReader.readLine();
}
}
@Test(expected = SuffixToLongException.class)
public void shouldThrownExeptionIfSuffixToLong(){
RussianSuffixDecoderEncoder.encode("1234567890123");
public void shouldThrownExeptionIfSuffixToLong() {
RussianSuffixDecoderEncoder.encode("1234567890123");
}
@Test(expected = WrongCharaterException.class)
public void shouldThrownExeptionIfSuffixContainWrongCharater(){
RussianSuffixDecoderEncoder.encode("1");
}
public void shouldThrownExeptionIfSuffixContainWrongCharater() {
RussianSuffixDecoderEncoder.encode("1");
}
}

View File

@ -1,13 +1,28 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.analayzer;
import junit.framework.TestCase;
import org.junit.Test;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.junit.Test;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.BufferedReader;
import java.io.InputStreamReader;
@ -17,24 +32,22 @@ public class RussianMorphlogyAnalayzerTest {
public void shouldCorrectProccessText() throws IOException {
RussianMorphlogyAnalayzer morphlogyAnalayzer = new RussianMorphlogyAnalayzer();
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/russian-text.txt");
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream,"UTF-8"));
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
final Token reusableToken = new Token();
Token nextToken;
Token nextToken;
TokenStream in = morphlogyAnalayzer.tokenStream(null, bufferedReader);
for (;;)
{
nextToken = in.next(reusableToken);
for (; ;) {
nextToken = in.next(reusableToken);
if (nextToken == null)
{
break;
}
if (nextToken == null) {
break;
}
System.out.println(nextToken.term());
System.out.println(nextToken.term());
// nextSampleToken = sample.next(reusableSampleToken);
// assertEquals(
// "Unicode",
@ -42,7 +55,7 @@ public class RussianMorphlogyAnalayzerTest {
// nextSampleToken == null
// ? null
// : nextSampleToken.term());
}
}
}
}

View File

@ -1,24 +1,42 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.analayzer;
import org.junit.Test;
import static org.junit.Assert.assertThat;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import static org.hamcrest.core.IsEqual.equalTo;
import static org.junit.Assert.assertThat;
import org.junit.Test;
import java.io.*;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
public class SuffixEvristicsTest {
@Test
public void testShouldDefineCorretCononicalWordForm() throws IOException {
SuffixEvristics suffixEvristics = new SuffixEvristics();
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt");
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream,"UTF-8"));
String s = bufferedReader.readLine();
while(s != null){
SuffixEvristics suffixEvristics = new SuffixEvristics();
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt");
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
String s = bufferedReader.readLine();
while (s != null) {
String[] qa = s.trim().split(" ");
assertThat(suffixEvristics.getCanonicalForm(qa[0]),equalTo(qa[1]));
assertThat(suffixEvristics.getCanonicalForm(qa[0]), equalTo(qa[1]));
s = bufferedReader.readLine();
}
}

View File

@ -1,5 +1,3 @@
В условиях нарастающей пурги было сделано 4 успешных захода на посадку. После завершения облета и демонтажа оборудования
Рубен Есаян дал устную оценку эксперимента:"Все нормально, будем рекомендовать систему к внедрению".
Летом - с ноября по март - рейсы из Кейптауна (ЮАР) на станцию "Новолазаревская" (Антарктида) совершаются
примерно один раз в две недели. Туда привозят людей, питание, оборудование, ГСМ и т.д.
что-то
В условиях нарастающей пурги было сделано 4 успешных захода на посадку. "Все нормально, будем рекомендовать систему к внедрению".
Рейсы из Кейптауна (ЮАР) на станцию "Новолазаревская" (Антарктида) совершаются
примерно один раз в две недели.