adding licence

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@14 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
alexander.a.kuznetsov 2009-04-14 07:47:27 +00:00
parent fe855dfa51
commit 3f26888bde
23 changed files with 541 additions and 129 deletions

13
etc/header.txt Normal file
View File

@ -0,0 +1,13 @@
Copyright 2009 Alexander Kuznetsov
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

110
pom.xml
View File

@ -4,9 +4,27 @@
<groupId>org.apache.lucene</groupId> <groupId>org.apache.lucene</groupId>
<artifactId>russian-morpholgy</artifactId> <artifactId>russian-morpholgy</artifactId>
<packaging>jar</packaging> <packaging>jar</packaging>
<version>1.0-SNAPSHOT</version> <version>0.5-SNAPSHOT</version>
<name>russian-morpholgy</name> <name>russian-morpholgy</name>
<url>http://maven.apache.org</url> <url>http://maven.apache.org</url>
<distributionManagement>
<repository>
<id>russian-morpholgy</id>
<url>
../repo/releases
</url>
</repository>
<snapshotRepository>
<id>russian-morpholgy-snapshots</id>
<url>
../repo/snapshots
</url>
<uniqueVersion>true</uniqueVersion>
</snapshotRepository>
</distributionManagement>
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>junit</groupId> <groupId>junit</groupId>
@ -29,8 +47,68 @@
</dependency> </dependency>
</dependencies> </dependencies>
<build> <repositories>
<repository>
<id>maven2-repository.dev.java.net</id>
<name>Java.net Repository for Maven</name>
<url>http://download.java.net/maven/2/</url>
</repository>
</repositories>
<pluginRepositories>
<pluginRepository>
<id>mc-release</id>
<name>maven-license-plugin repository of releases</name>
<url>http://mc-repo.googlecode.com/svn/maven2/releases</url>
<snapshots>
<enabled>false</enabled>
</snapshots>
<releases>
<enabled>true</enabled>
</releases>
</pluginRepository>
</pluginRepositories>
<reporting>
<plugins> <plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>cobertura-maven-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-pmd-plugin</artifactId>
<configuration>
<linkXref>true</linkXref>
<sourceEncoding>utf-8</sourceEncoding>
<minimumTokens>100</minimumTokens>
<targetJdk>1.5</targetJdk>
</configuration>
</plugin>
</plugins>
</reporting>
<build>
<extensions>
<extension>
<groupId>org.jvnet.wagon-svn</groupId>
<artifactId>wagon-svn</artifactId>
<version>1.8</version>
</extension>
</extensions>
<plugins>
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId> <artifactId>maven-compiler-plugin</artifactId>
@ -39,6 +117,34 @@
<target>1.5</target> <target>1.5</target>
</configuration> </configuration>
</plugin> </plugin>
<plugin>
<!--
usage: http://code.google.com/p/maven-license-plugin/wiki/HowTo
-->
<artifactId>maven-license-plugin</artifactId>
<groupId>com.mathieucarbou.mojo</groupId>
<configuration>
<basedir>${project.parent.basedir}</basedir>
<header>etc/header.txt</header>
<excludes>
<exclude>**/*.txt</exclude>
</excludes>
<includes>
<include>**/src/**</include>
<include>**/pom.xml</include>
</includes>
</configuration>
<executions>
<execution>
<phase>test</phase>
<goals>
<goal>check</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins> </plugins>
</build> </build>

View File

@ -1,13 +0,0 @@
package org.apache.lucene;
/**
* Hello world!
*
*/
public class App
{
public static void main( String[] args )
{
System.out.println( "Hello World!" );
}
}

View File

@ -1,14 +1,31 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology; package org.apache.lucene.russian.morphology;
import org.apache.lucene.russian.morphology.dictonary.DictonaryReader; import org.apache.lucene.russian.morphology.dictonary.DictonaryReader;
import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader; import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
import org.apache.lucene.russian.morphology.evristics.Evristic;
import org.apache.lucene.russian.morphology.evristics.StatiticsCollectors; import org.apache.lucene.russian.morphology.evristics.StatiticsCollectors;
import org.apache.lucene.russian.morphology.evristics.SuffixCounter; import org.apache.lucene.russian.morphology.evristics.SuffixCounter;
import org.apache.lucene.russian.morphology.evristics.Evristic;
import java.io.*; import java.io.IOException;
import java.util.*; import java.util.Arrays;
import java.util.concurrent.atomic.AtomicInteger; import java.util.Collection;
import java.util.Set;
public class EvristicBuilder { public class EvristicBuilder {
@ -23,12 +40,12 @@ public class EvristicBuilder {
Object[] objects = counterCollection.toArray(); Object[] objects = counterCollection.toArray();
Arrays.sort(objects); Arrays.sort(objects);
System.out.println("Length " + objects.length + " ingored words " + statiticsCollectors.getIgnoredCount()); System.out.println("Length " + objects.length + " ingored words " + statiticsCollectors.getIgnoredCount());
for(int i = 0; i < 10; i++){ for (int i = 0; i < 10; i++) {
System.out.println(objects[i]); System.out.println(objects[i]);
} }
final Evristic evristic = new Evristic(); final Evristic evristic = new Evristic();
for(int i = 0; i < objects.length; i++){ for (int i = 0; i < objects.length; i++) {
evristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic()); evristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic());
} }

View File

@ -1,3 +1,19 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology; package org.apache.lucene.russian.morphology;
/** /**
@ -44,12 +60,12 @@ public class RussianSuffixDecoderEncoder {
return result; return result;
} }
static public boolean checkCharacter(char c){ static public boolean checkCharacter(char c) {
int code = 0 + c; int code = 0 + c;
if(code == 45) return true; if (code == 45) return true;
code -= RUSSIAN_SMALL_LETTER_OFFSET; code -= RUSSIAN_SMALL_LETTER_OFFSET;
if(code == 34) return true; if (code == 34) return true;
if(code > 0 && code < 33) return true; if (code > 0 && code < 33) return true;
return false; return false;
} }
} }

View File

@ -1,3 +1,19 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology; package org.apache.lucene.russian.morphology;

View File

@ -1,7 +1,23 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology; package org.apache.lucene.russian.morphology;
public class WrongCharaterException extends RuntimeException{ public class WrongCharaterException extends RuntimeException {
public WrongCharaterException() { public WrongCharaterException() {
} }

View File

@ -1,15 +1,31 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.analayzer; package org.apache.lucene.russian.morphology.analayzer;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.Reader;
import java.io.IOException; import java.io.IOException;
import java.io.Reader;
public class RussianMorphlogyAnalayzer extends Analyzer { public class RussianMorphlogyAnalayzer extends Analyzer {
private SuffixEvristics suffixEvristics; private SuffixEvristics suffixEvristics;
public RussianMorphlogyAnalayzer() throws IOException { public RussianMorphlogyAnalayzer() throws IOException {
@ -20,6 +36,6 @@ public class RussianMorphlogyAnalayzer extends Analyzer {
TokenStream result = new StandardTokenizer(reader); TokenStream result = new StandardTokenizer(reader);
result = new StandardFilter(result); result = new StandardFilter(result);
result = new LowerCaseFilter(result); result = new LowerCaseFilter(result);
return new RussianMorphlogyFilter(result,suffixEvristics); return new RussianMorphlogyFilter(result, suffixEvristics);
} }
} }

View File

@ -1,3 +1,19 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.analayzer; package org.apache.lucene.russian.morphology.analayzer;
import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Token;
@ -17,11 +33,11 @@ public class RussianMorphlogyFilter extends TokenFilter {
public Token next(final Token reusableToken) throws IOException { public Token next(final Token reusableToken) throws IOException {
Token nextToken = input.next(reusableToken); Token nextToken = input.next(reusableToken);
if(nextToken == null || nextToken.term().length() == 0) return nextToken; if (nextToken == null || nextToken.term().length() == 0) return nextToken;
String word = nextToken.term(); String word = nextToken.term();
Character testC = word.charAt(0); Character testC = word.charAt(0);
if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC){ if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC) {
return nextToken; return nextToken;
} }
Token current = (Token) nextToken.clone(); Token current = (Token) nextToken.clone();
return createToken(suffixEvristics.getCanonicalForm(word), current, reusableToken); return createToken(suffixEvristics.getCanonicalForm(word), current, reusableToken);

View File

@ -1,10 +1,25 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.analayzer; package org.apache.lucene.russian.morphology.analayzer;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder; import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import java.io.*; import java.io.*;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashSet;
public class SuffixEvristics { public class SuffixEvristics {
@ -46,23 +61,23 @@ public class SuffixEvristics {
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0; int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
String suffixS = form.substring(startSymbol); String suffixS = form.substring(startSymbol);
if(!chechSuffix(suffixS)) return form; if (!chechSuffix(suffixS)) return form;
Long suffix = RussianSuffixDecoderEncoder.encode(suffixS); Long suffix = RussianSuffixDecoderEncoder.encode(suffixS);
int index = Arrays.binarySearch(keys,suffix); int index = Arrays.binarySearch(keys, suffix);
if(index < -1){ if (index < -1) {
System.out.println(" " + form); System.out.println(" " + form);
return form; return form;
}else{ } else {
String nSuffix = RussianSuffixDecoderEncoder.decode(values[index]); String nSuffix = RussianSuffixDecoderEncoder.decode(values[index]);
return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix; return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
} }
} }
private boolean chechSuffix(String suffix){ private boolean chechSuffix(String suffix) {
for(int i = 0; i < suffix.length(); i++){ for (int i = 0; i < suffix.length(); i++) {
if (!RussianSuffixDecoderEncoder.checkCharacter(suffix.charAt(i))) return false; if (!RussianSuffixDecoderEncoder.checkCharacter(suffix.charAt(i))) return false;
} }
return true; return true;

View File

@ -1,9 +1,28 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.dictonary; package org.apache.lucene.russian.morphology.dictonary;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder; import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.*; import java.util.*;
import java.io.*;
/** /**
@ -15,7 +34,7 @@ public class DictonaryReader {
private String fileEncoding = "windows-1251"; private String fileEncoding = "windows-1251";
private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>(); private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
private List<List<String>> wordPrefixes = new ArrayList<List<String>>(); private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
private Set<String> ingnoredForm = new HashSet<String>(); private Set<String> ingnoredForm = new HashSet<String>();
public DictonaryReader(String fileName, Set<String> ingnoredForm) { public DictonaryReader(String fileName, Set<String> ingnoredForm) {
this.fileName = fileName; this.fileName = fileName;
@ -35,11 +54,11 @@ public class DictonaryReader {
sckipBlock(bufferedReader); sckipBlock(bufferedReader);
sckipBlock(bufferedReader); sckipBlock(bufferedReader);
readPrefix(bufferedReader); readPrefix(bufferedReader);
readWords(bufferedReader,wordProccessor); readWords(bufferedReader, wordProccessor);
} }
private void readWords(BufferedReader reader,WordProccessor wordProccessor) throws IOException { private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
String s = reader.readLine(); String s = reader.readLine();
int count = Integer.valueOf(s); int count = Integer.valueOf(s);
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
@ -54,15 +73,15 @@ public class DictonaryReader {
if (models.size() > 0 && !ingnoredForm.contains(models.get(0).getCode())) { if (models.size() > 0 && !ingnoredForm.contains(models.get(0).getCode())) {
WordCard card = new WordCard(cleanString(models.get(0).create(word))); WordCard card = new WordCard(cleanString(models.get(0).create(word)));
for (FlexiaModel fm : models) { for (FlexiaModel fm : models) {
card.addFrom(cleanString(fm.create(word))); card.addFrom(cleanString(fm.create(word)));
} }
wordProccessor.proccess(card); wordProccessor.proccess(card);
} }
} }
} }
private String cleanString(String s){ private String cleanString(String s) {
return s.replace((char)(34 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET),(char)(6 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET)); return s.replace((char) (34 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (6 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET));
} }
private void sckipBlock(BufferedReader reader) throws IOException { private void sckipBlock(BufferedReader reader) throws IOException {
@ -99,8 +118,8 @@ public class DictonaryReader {
private void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) { private void addFlexia(ArrayList<FlexiaModel> flexiaModelArrayList, String line) {
String[] fl = line.split("\\*"); String[] fl = line.split("\\*");
// we ignore all forms that have three fields // we ignore all forms that have three fields
// if (fl.length == 3) // if (fl.length == 3)
// flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase())); // flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), "")); if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
} }

View File

@ -1,3 +1,19 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.dictonary; package org.apache.lucene.russian.morphology.dictonary;
/** /**

View File

@ -1,11 +1,27 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.dictonary; package org.apache.lucene.russian.morphology.dictonary;
import java.util.Set;
import java.util.HashSet;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Set;
public class IgnoredFormReader { public class IgnoredFormReader {

View File

@ -1,7 +1,23 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.dictonary; package org.apache.lucene.russian.morphology.dictonary;
import java.util.List;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List;
/** /**
* Represents a word and all its forms. * Represents a word and all its forms.
@ -14,7 +30,7 @@ public class WordCard {
this.canonicalFrom = canonicalFrom; this.canonicalFrom = canonicalFrom;
} }
protected void addFrom(String word){ protected void addFrom(String word) {
wordsFroms.add(word); wordsFroms.add(word);
} }

View File

@ -1,3 +1,19 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.dictonary; package org.apache.lucene.russian.morphology.dictonary;
import java.io.IOException; import java.io.IOException;

View File

@ -1,9 +1,28 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.evristics; package org.apache.lucene.russian.morphology.evristics;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder; import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import java.util.*; import java.io.BufferedReader;
import java.io.*; import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.TreeMap;
public class Evristic { public class Evristic {
@ -35,7 +54,7 @@ public class Evristic {
String s = reader.readLine(); String s = reader.readLine();
while (s != null) { while (s != null) {
String[] sfns = s.split(" "); String[] sfns = s.split(" ");
if(sfns.length == 2){ if (sfns.length == 2) {
encodedSuffixesPairs.put(Long.valueOf(sfns[0]), Long.valueOf(sfns[0])); encodedSuffixesPairs.put(Long.valueOf(sfns[0]), Long.valueOf(sfns[0]));
} }
s = reader.readLine(); s = reader.readLine();
@ -45,8 +64,8 @@ public class Evristic {
public void writeToFile(String file) throws IOException { public void writeToFile(String file) throws IOException {
FileWriter writer = new FileWriter(file); FileWriter writer = new FileWriter(file);
writer.write(encodedSuffixesPairs.size()+"\n"); writer.write(encodedSuffixesPairs.size() + "\n");
for(Long k:encodedSuffixesPairs.keySet()){ for (Long k : encodedSuffixesPairs.keySet()) {
writer.write("" + k + " " + encodedSuffixesPairs.get(k) + "\n"); writer.write("" + k + " " + encodedSuffixesPairs.get(k) + "\n");
} }
writer.close(); writer.close();

View File

@ -1,26 +1,42 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.evristics; package org.apache.lucene.russian.morphology.evristics;
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
import org.apache.lucene.russian.morphology.dictonary.WordCard;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder; import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import org.apache.lucene.russian.morphology.dictonary.WordCard;
import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
import java.util.Map;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map;
public class StatiticsCollectors implements WordProccessor{ public class StatiticsCollectors implements WordProccessor {
Map<SuffixEvristic,SuffixCounter> statititics = new HashMap<SuffixEvristic,SuffixCounter>(); Map<SuffixEvristic, SuffixCounter> statititics = new HashMap<SuffixEvristic, SuffixCounter>();
private Integer ignoredCount = 0; private Integer ignoredCount = 0;
public void proccess(WordCard wordCard) { public void proccess(WordCard wordCard) {
for(String form:wordCard.getWordsFroms()){ for (String form : wordCard.getWordsFroms()) {
SuffixEvristic suffixEvristic = createEvristic(wordCard.getCanonicalFrom(), form); SuffixEvristic suffixEvristic = createEvristic(wordCard.getCanonicalFrom(), form);
if (suffixEvristic == null) continue; if (suffixEvristic == null) continue;
SuffixCounter suffixCounter = statititics.get(suffixEvristic); SuffixCounter suffixCounter = statititics.get(suffixEvristic);
if(suffixCounter == null){ if (suffixCounter == null) {
suffixCounter = new SuffixCounter(suffixEvristic); suffixCounter = new SuffixCounter(suffixEvristic);
statititics.put(suffixEvristic,suffixCounter); statititics.put(suffixEvristic, suffixCounter);
} }
suffixCounter.incrementAmount(); suffixCounter.incrementAmount();
} }
@ -30,19 +46,19 @@ public class StatiticsCollectors implements WordProccessor{
return statititics; return statititics;
} }
private SuffixEvristic createEvristic(String word,String form){ private SuffixEvristic createEvristic(String word, String form) {
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0; int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
String formSuffix = form.substring(startSymbol); String formSuffix = form.substring(startSymbol);
if(word.length() < startSymbol){ if (word.length() < startSymbol) {
ignoredCount++; ignoredCount++;
return null; return null;
} }
String wordSuffix = word.length() > startSymbol ? word.substring(startSymbol) : ""; String wordSuffix = word.length() > startSymbol ? word.substring(startSymbol) : "";
if (wordSuffix.length() > 12){ if (wordSuffix.length() > 12) {
System.out.println(word + " " + form); System.out.println(word + " " + form);
return null; return null;
} }
return new SuffixEvristic(formSuffix,wordSuffix); return new SuffixEvristic(formSuffix, wordSuffix);
} }

View File

@ -1,10 +1,26 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.evristics; package org.apache.lucene.russian.morphology.evristics;
/** /**
* Contains information about the frequency of a suffix evristic (heuristic) * Contains information about the frequency of a suffix evristic (heuristic)
* in the dictionary. * in the dictionary.
*/ */
public class SuffixCounter implements Comparable{ public class SuffixCounter implements Comparable {
private SuffixEvristic suffixEvristic; private SuffixEvristic suffixEvristic;
private Double amnout = 0.0; private Double amnout = 0.0;
@ -12,7 +28,7 @@ public class SuffixCounter implements Comparable{
this.suffixEvristic = suffixEvristic; this.suffixEvristic = suffixEvristic;
} }
public void incrementAmount(){ public void incrementAmount() {
amnout++; amnout++;
} }
@ -33,12 +49,12 @@ public class SuffixCounter implements Comparable{
} }
public int compareTo(Object o) { public int compareTo(Object o) {
if(o instanceof SuffixCounter) return (int) Math.round(Math.signum(((SuffixCounter)o).amnout - amnout)); if (o instanceof SuffixCounter) return (int) Math.round(Math.signum(((SuffixCounter) o).amnout - amnout));
return -1; return -1;
} }
@Override @Override
public String toString() { public String toString() {
return ""+amnout + " " + suffixEvristic.toString(); return "" + amnout + " " + suffixEvristic.toString();
} }
} }

View File

@ -1,3 +1,19 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.evristics; package org.apache.lucene.russian.morphology.evristics;
/** /**

View File

@ -1,39 +1,54 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology; package org.apache.lucene.russian.morphology;
import org.junit.Test;
import static org.junit.Assert.assertThat;
import static org.hamcrest.core.IsEqual.equalTo; import static org.hamcrest.core.IsEqual.equalTo;
import org.apache.lucene.russian.morphology.SuffixToLongException; import static org.junit.Assert.assertThat;
import org.junit.Test;
import java.io.InputStream;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
public class RussianSuffixDecoderEncoderTest { public class RussianSuffixDecoderEncoderTest {
@Test @Test
public void testShouldCorretDecodeEncode() throws IOException { public void testShouldCorretDecodeEncode() throws IOException {
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/decoder-test-data.txt"); InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/decoder-test-data.txt");
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream,"UTF-8")); BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
String s = bufferedReader.readLine(); String s = bufferedReader.readLine();
while(s != null){ while (s != null) {
String[] qa = s.trim().split(" "); String[] qa = s.trim().split(" ");
Long ecodedSuffix = RussianSuffixDecoderEncoder.encode(qa[0]); Long ecodedSuffix = RussianSuffixDecoderEncoder.encode(qa[0]);
assertThat(RussianSuffixDecoderEncoder.decode(ecodedSuffix),equalTo(qa[1])); assertThat(RussianSuffixDecoderEncoder.decode(ecodedSuffix), equalTo(qa[1]));
s = bufferedReader.readLine(); s = bufferedReader.readLine();
} }
} }
@Test(expected = SuffixToLongException.class) @Test(expected = SuffixToLongException.class)
public void shouldThrownExeptionIfSuffixToLong(){ public void shouldThrownExeptionIfSuffixToLong() {
RussianSuffixDecoderEncoder.encode("1234567890123"); RussianSuffixDecoderEncoder.encode("1234567890123");
} }
@Test(expected = WrongCharaterException.class) @Test(expected = WrongCharaterException.class)
public void shouldThrownExeptionIfSuffixContainWrongCharater(){ public void shouldThrownExeptionIfSuffixContainWrongCharater() {
RussianSuffixDecoderEncoder.encode("1"); RussianSuffixDecoderEncoder.encode("1");
} }
} }

View File

@ -1,13 +1,28 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.analayzer; package org.apache.lucene.russian.morphology.analayzer;
import junit.framework.TestCase;
import org.junit.Test;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.junit.Test;
import java.io.BufferedReader;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.BufferedReader;
import java.io.InputStreamReader; import java.io.InputStreamReader;
@ -17,24 +32,22 @@ public class RussianMorphlogyAnalayzerTest {
public void shouldCorrectProccessText() throws IOException { public void shouldCorrectProccessText() throws IOException {
RussianMorphlogyAnalayzer morphlogyAnalayzer = new RussianMorphlogyAnalayzer(); RussianMorphlogyAnalayzer morphlogyAnalayzer = new RussianMorphlogyAnalayzer();
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/russian-text.txt"); InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/russian-text.txt");
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream,"UTF-8")); BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
final Token reusableToken = new Token(); final Token reusableToken = new Token();
Token nextToken; Token nextToken;
TokenStream in = morphlogyAnalayzer.tokenStream(null, bufferedReader); TokenStream in = morphlogyAnalayzer.tokenStream(null, bufferedReader);
for (;;) for (; ;) {
{ nextToken = in.next(reusableToken);
nextToken = in.next(reusableToken);
if (nextToken == null) if (nextToken == null) {
{ break;
break; }
}
System.out.println(nextToken.term()); System.out.println(nextToken.term());
// nextSampleToken = sample.next(reusableSampleToken); // nextSampleToken = sample.next(reusableSampleToken);
// assertEquals( // assertEquals(
// "Unicode", // "Unicode",
@ -42,7 +55,7 @@ public class RussianMorphlogyAnalayzerTest {
// nextSampleToken == null // nextSampleToken == null
// ? null // ? null
// : nextSampleToken.term()); // : nextSampleToken.term());
} }
} }
} }

View File

@ -1,24 +1,42 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.russian.morphology.analayzer; package org.apache.lucene.russian.morphology.analayzer;
import org.junit.Test;
import static org.junit.Assert.assertThat;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import static org.hamcrest.core.IsEqual.equalTo; import static org.hamcrest.core.IsEqual.equalTo;
import static org.junit.Assert.assertThat;
import org.junit.Test;
import java.io.*; import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
public class SuffixEvristicsTest { public class SuffixEvristicsTest {
@Test @Test
public void testShouldDefineCorretCononicalWordForm() throws IOException { public void testShouldDefineCorretCononicalWordForm() throws IOException {
SuffixEvristics suffixEvristics = new SuffixEvristics(); SuffixEvristics suffixEvristics = new SuffixEvristics();
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt"); InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt");
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream,"UTF-8")); BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
String s = bufferedReader.readLine(); String s = bufferedReader.readLine();
while(s != null){ while (s != null) {
String[] qa = s.trim().split(" "); String[] qa = s.trim().split(" ");
assertThat(suffixEvristics.getCanonicalForm(qa[0]),equalTo(qa[1])); assertThat(suffixEvristics.getCanonicalForm(qa[0]), equalTo(qa[1]));
s = bufferedReader.readLine(); s = bufferedReader.readLine();
} }
} }

View File

@ -1,5 +1,3 @@
В условиях нарастающей пурги было сделано 4 успешных захода на посадку. После завершения облета и демонтажа оборудования В условиях нарастающей пурги было сделано 4 успешных захода на посадку. "Все нормально, будем рекомендовать систему к внедрению".
Рубен Есаян дал устную оценку эксперимента:"Все нормально, будем рекомендовать систему к внедрению". Рейсы из Кейптауна (ЮАР) на станцию "Новолазаревская" (Антарктида) совершаются
Летом - с ноября по март - рейсы из Кейптауна (ЮАР) на станцию "Новолазаревская" (Антарктида) совершаются примерно один раз в две недели.
примерно один раз в две недели. Туда привозят людей, питание, оборудование, ГСМ и т.д.
что-то