From 3f26888bdeab38005d2f99bcd7009554000e0a61 Mon Sep 17 00:00:00 2001 From: "alexander.a.kuznetsov" Date: Tue, 14 Apr 2009 07:47:27 +0000 Subject: [PATCH] adding licence git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@14 d817d54c-26ab-11de-abc9-2f7d1455ff7a --- etc/header.txt | 13 +++ pom.xml | 110 +++++++++++++++++- src/main/java/org/apache/lucene/App.java | 13 --- .../russian/morphology/EvristicBuilder.java | 29 ++++- .../RussianSuffixDecoderEncoder.java | 32 +++-- .../morphology/SuffixToLongException.java | 16 +++ .../morphology/WrongCharaterException.java | 18 ++- .../analayzer/RussianMorphlogyAnalayzer.java | 26 ++++- .../analayzer/RussianMorphlogyFilter.java | 22 +++- .../morphology/analayzer/SuffixEvristics.java | 29 +++-- .../morphology/dictonary/DictonaryReader.java | 37 ++++-- .../morphology/dictonary/FlexiaModel.java | 16 +++ .../dictonary/IgnoredFormReader.java | 22 +++- .../morphology/dictonary/WordCard.java | 20 +++- .../morphology/dictonary/WordProccessor.java | 18 ++- .../morphology/evristics/Evristic.java | 29 ++++- .../evristics/StatiticsCollectors.java | 42 ++++--- .../morphology/evristics/SuffixCounter.java | 26 ++++- .../morphology/evristics/SuffixEvristic.java | 16 +++ .../RussianSuffixDecoderEncoderTest.java | 47 +++++--- .../RussianMorphlogyAnalayzerTest.java | 43 ++++--- .../analayzer/SuffixEvristicsTest.java | 38 ++++-- .../morphology/analayzer/russian-text.txt | 8 +- 23 files changed, 541 insertions(+), 129 deletions(-) create mode 100644 etc/header.txt delete mode 100644 src/main/java/org/apache/lucene/App.java diff --git a/etc/header.txt b/etc/header.txt new file mode 100644 index 0000000..76f2dc1 --- /dev/null +++ b/etc/header.txt @@ -0,0 +1,13 @@ +Copyright 2009 Alexander Kuznetsov + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/pom.xml b/pom.xml index 4a32899..2118b34 100644 --- a/pom.xml +++ b/pom.xml @@ -4,9 +4,27 @@ org.apache.lucene russian-morpholgy jar - 1.0-SNAPSHOT + 0.5-SNAPSHOT russian-morpholgy http://maven.apache.org + + + + russian-morpholgy + + ../repo/releases + + + + russian-morpholgy-snapshots + + ../repo/snapshots + + true + + + + junit @@ -29,8 +47,68 @@ - + + + maven2-repository.dev.java.net + Java.net Repository for Maven + http://download.java.net/maven/2/ + + + + + + mc-release + maven-license-plugin repository of releases + http://mc-repo.googlecode.com/svn/maven2/releases + + + false + + + + true + + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + + + + org.codehaus.mojo + cobertura-maven-plugin + + + + + org.apache.maven.plugins + maven-pmd-plugin + + true + utf-8 + 100 + 1.5 + + + + + + + + + org.jvnet.wagon-svn + wagon-svn + 1.8 + + + + + org.apache.maven.plugins maven-compiler-plugin @@ -39,6 +117,34 @@ 1.5 + + + maven-license-plugin + com.mathieucarbou.mojo + + + ${project.parent.basedir} +
etc/header.txt
+ + **/*.txt + + + **/src/** + **/pom.xml + +
+ + + + test + + check + + + +
diff --git a/src/main/java/org/apache/lucene/App.java b/src/main/java/org/apache/lucene/App.java deleted file mode 100644 index a566430..0000000 --- a/src/main/java/org/apache/lucene/App.java +++ /dev/null @@ -1,13 +0,0 @@ -package org.apache.lucene; - -/** - * Hello world! - * - */ -public class App -{ - public static void main( String[] args ) - { - System.out.println( "Hello World!" ); - } -} diff --git a/src/main/java/org/apache/lucene/russian/morphology/EvristicBuilder.java b/src/main/java/org/apache/lucene/russian/morphology/EvristicBuilder.java index 3633708..0f08828 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/EvristicBuilder.java +++ b/src/main/java/org/apache/lucene/russian/morphology/EvristicBuilder.java @@ -1,14 +1,31 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.lucene.russian.morphology; import org.apache.lucene.russian.morphology.dictonary.DictonaryReader; import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader; +import org.apache.lucene.russian.morphology.evristics.Evristic; import org.apache.lucene.russian.morphology.evristics.StatiticsCollectors; import org.apache.lucene.russian.morphology.evristics.SuffixCounter; -import org.apache.lucene.russian.morphology.evristics.Evristic; -import java.io.*; -import java.util.*; -import java.util.concurrent.atomic.AtomicInteger; +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; +import java.util.Set; public class EvristicBuilder { @@ -23,12 +40,12 @@ public class EvristicBuilder { Object[] objects = counterCollection.toArray(); Arrays.sort(objects); System.out.println("Length " + objects.length + " ingored words " + statiticsCollectors.getIgnoredCount()); - for(int i = 0; i < 10; i++){ + for (int i = 0; i < 10; i++) { System.out.println(objects[i]); } final Evristic evristic = new Evristic(); - for(int i = 0; i < objects.length; i++){ + for (int i = 0; i < objects.length; i++) { evristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic()); } diff --git a/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java b/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java index 0ff57af..985ce5e 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java +++ b/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java @@ -1,3 +1,19 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.lucene.russian.morphology; /** @@ -24,7 +40,7 @@ public class RussianSuffixDecoderEncoder { c = DASH_CODE; } if (c == EE_CHAR) c = E_CHAR; - if (c < 0 || c > 33) throw new WrongCharaterException(); + if (c < 0 || c > 33) throw new WrongCharaterException(); result = result * 35L + c; } return result; @@ -44,12 +60,12 @@ public class RussianSuffixDecoderEncoder { return result; } - static public boolean checkCharacter(char c){ - int code = 0 + c; - if(code == 45) return true; - code -= RUSSIAN_SMALL_LETTER_OFFSET; - if(code == 34) return true; - if(code > 0 && code < 33) return true; - return false; + static public boolean checkCharacter(char c) { + int code = 0 + c; + if (code == 45) return true; + code -= RUSSIAN_SMALL_LETTER_OFFSET; + if (code == 34) return true; + if (code > 0 && code < 33) return true; + return false; } } diff --git a/src/main/java/org/apache/lucene/russian/morphology/SuffixToLongException.java b/src/main/java/org/apache/lucene/russian/morphology/SuffixToLongException.java index eaaebfe..568ba05 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/SuffixToLongException.java +++ b/src/main/java/org/apache/lucene/russian/morphology/SuffixToLongException.java @@ -1,3 +1,19 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.lucene.russian.morphology; diff --git a/src/main/java/org/apache/lucene/russian/morphology/WrongCharaterException.java b/src/main/java/org/apache/lucene/russian/morphology/WrongCharaterException.java index 830fb4b..e37c690 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/WrongCharaterException.java +++ b/src/main/java/org/apache/lucene/russian/morphology/WrongCharaterException.java @@ -1,7 +1,23 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.lucene.russian.morphology; -public class WrongCharaterException extends RuntimeException{ +public class WrongCharaterException extends RuntimeException { public WrongCharaterException() { } diff --git a/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzer.java b/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzer.java index 3337239..d073ca5 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzer.java +++ b/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzer.java @@ -1,15 +1,31 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.lucene.russian.morphology.analayzer; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.LowerCaseFilter; -import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; -import java.io.Reader; import java.io.IOException; +import java.io.Reader; -public class RussianMorphlogyAnalayzer extends Analyzer { +public class RussianMorphlogyAnalayzer extends Analyzer { private SuffixEvristics suffixEvristics; public RussianMorphlogyAnalayzer() throws IOException { @@ -20,6 +36,6 @@ public class RussianMorphlogyAnalayzer extends Analyzer { TokenStream result = new StandardTokenizer(reader); result = new StandardFilter(result); result = new LowerCaseFilter(result); - return new RussianMorphlogyFilter(result,suffixEvristics); + return new RussianMorphlogyFilter(result, suffixEvristics); } } diff --git a/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java b/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java index b324064..28172c6 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java +++ b/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java @@ -1,3 +1,19 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.lucene.russian.morphology.analayzer; import org.apache.lucene.analysis.Token; @@ -17,11 +33,11 @@ public class RussianMorphlogyFilter extends TokenFilter { public Token next(final Token reusableToken) throws IOException { Token nextToken = input.next(reusableToken); - if(nextToken == null || nextToken.term().length() == 0) return nextToken; + if (nextToken == null || nextToken.term().length() == 0) return nextToken; String word = nextToken.term(); Character testC = word.charAt(0); - if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC){ - return nextToken; + if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC) { + return nextToken; } Token current = (Token) nextToken.clone(); return createToken(suffixEvristics.getCanonicalForm(word), current, reusableToken); diff --git a/src/main/java/org/apache/lucene/russian/morphology/analayzer/SuffixEvristics.java b/src/main/java/org/apache/lucene/russian/morphology/analayzer/SuffixEvristics.java index e593511..90c8c46 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/analayzer/SuffixEvristics.java +++ b/src/main/java/org/apache/lucene/russian/morphology/analayzer/SuffixEvristics.java @@ -1,10 +1,25 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.lucene.russian.morphology.analayzer; import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder; import java.io.*; import java.util.Arrays; -import java.util.HashSet; public class SuffixEvristics { @@ -46,23 +61,23 @@ public class SuffixEvristics { int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0; String suffixS = form.substring(startSymbol); - if(!chechSuffix(suffixS)) return form; + if (!chechSuffix(suffixS)) return form; Long suffix = RussianSuffixDecoderEncoder.encode(suffixS); - int index = Arrays.binarySearch(keys,suffix); - if(index < -1){ + int index = Arrays.binarySearch(keys, suffix); + if (index < -1) { System.out.println(" " + form); return form; - }else{ + } else { String nSuffix = RussianSuffixDecoderEncoder.decode(values[index]); return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix; } } - private boolean chechSuffix(String suffix){ - for(int i = 0; i < suffix.length(); i++){ + private boolean chechSuffix(String suffix) { + for (int i = 0; i < suffix.length(); i++) { if (!RussianSuffixDecoderEncoder.checkCharacter(suffix.charAt(i))) return false; } return true; diff --git a/src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java b/src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java index bbf5679..42a2b11 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java +++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java @@ -1,9 +1,28 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.lucene.russian.morphology.dictonary; import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder; +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; import java.util.*; -import java.io.*; /** @@ -15,7 +34,7 @@ public class DictonaryReader { private String fileEncoding = "windows-1251"; private List> wordsFlexias = new ArrayList>(); private List> wordPrefixes = new ArrayList>(); - private Set ingnoredForm = new HashSet(); + private Set ingnoredForm = new HashSet(); public DictonaryReader(String fileName, Set ingnoredForm) { this.fileName = fileName; @@ -35,11 +54,11 @@ public class DictonaryReader { sckipBlock(bufferedReader); sckipBlock(bufferedReader); readPrefix(bufferedReader); - readWords(bufferedReader,wordProccessor); + readWords(bufferedReader, wordProccessor); } - private void readWords(BufferedReader reader,WordProccessor wordProccessor) throws IOException { + private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException { String s = reader.readLine(); int count = Integer.valueOf(s); for (int i = 0; i < count; i++) { @@ -54,15 +73,15 @@ public class DictonaryReader { if (models.size() > 0 && !ingnoredForm.contains(models.get(0).getCode())) { WordCard card = new WordCard(cleanString(models.get(0).create(word))); for (FlexiaModel fm : models) { - card.addFrom(cleanString(fm.create(word))); + card.addFrom(cleanString(fm.create(word))); } wordProccessor.proccess(card); } } } - private String cleanString(String s){ - return s.replace((char)(34 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET),(char)(6 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET)); + private String cleanString(String s) { + return s.replace((char) (34 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (6 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET)); } private void sckipBlock(BufferedReader reader) throws IOException { @@ -99,8 +118,8 @@ public class DictonaryReader { private void addFlexia(ArrayList flexiaModelArrayList, String line) { String[] fl = line.split("\\*"); // we inored all forms thats - // if (fl.length == 3) - // flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase())); + // if (fl.length == 3) + // flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase())); if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), "")); } diff --git a/src/main/java/org/apache/lucene/russian/morphology/dictonary/FlexiaModel.java b/src/main/java/org/apache/lucene/russian/morphology/dictonary/FlexiaModel.java index e983311..3ae24e1 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/FlexiaModel.java +++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/FlexiaModel.java @@ -1,3 +1,19 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.lucene.russian.morphology.dictonary; /** diff --git a/src/main/java/org/apache/lucene/russian/morphology/dictonary/IgnoredFormReader.java b/src/main/java/org/apache/lucene/russian/morphology/dictonary/IgnoredFormReader.java index a6568c0..28100b5 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/IgnoredFormReader.java +++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/IgnoredFormReader.java @@ -1,11 +1,27 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.lucene.russian.morphology.dictonary; -import java.util.Set; -import java.util.HashSet; import java.io.BufferedReader; -import java.io.InputStreamReader; import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStreamReader; +import java.util.HashSet; +import java.util.Set; public class IgnoredFormReader { diff --git a/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordCard.java b/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordCard.java index a37b107..770bca3 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordCard.java +++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordCard.java @@ -1,7 +1,23 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.lucene.russian.morphology.dictonary; -import java.util.List; import java.util.ArrayList; +import java.util.List; /** * Represent word and all it forms. @@ -14,7 +30,7 @@ public class WordCard { this.canonicalFrom = canonicalFrom; } - protected void addFrom(String word){ + protected void addFrom(String word) { wordsFroms.add(word); } diff --git a/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordProccessor.java b/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordProccessor.java index 3f93f43..37e769a 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordProccessor.java +++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordProccessor.java @@ -1,9 +1,25 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.lucene.russian.morphology.dictonary; import java.io.IOException; /** - * Interface allows get information from + * Interface allows get information from * {@org.apache.lucene.russian.morphology.dictonary.DirtonaryReader}. */ public interface WordProccessor { diff --git a/src/main/java/org/apache/lucene/russian/morphology/evristics/Evristic.java b/src/main/java/org/apache/lucene/russian/morphology/evristics/Evristic.java index 79ce5f4..2ef8a15 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/evristics/Evristic.java +++ b/src/main/java/org/apache/lucene/russian/morphology/evristics/Evristic.java @@ -1,9 +1,28 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.lucene.russian.morphology.evristics; import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder; -import java.util.*; -import java.io.*; +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.util.TreeMap; public class Evristic { @@ -35,7 +54,7 @@ public class Evristic { String s = reader.readLine(); while (s != null) { String[] sfns = s.split(" "); - if(sfns.length == 2){ + if (sfns.length == 2) { encodedSuffixesPairs.put(Long.valueOf(sfns[0]), Long.valueOf(sfns[0])); } s = reader.readLine(); @@ -45,8 +64,8 @@ public class Evristic { public void writeToFile(String file) throws IOException { FileWriter writer = new FileWriter(file); - writer.write(encodedSuffixesPairs.size()+"\n"); - for(Long k:encodedSuffixesPairs.keySet()){ + writer.write(encodedSuffixesPairs.size() + "\n"); + for (Long k : encodedSuffixesPairs.keySet()) { writer.write("" + k + " " + encodedSuffixesPairs.get(k) + "\n"); } writer.close(); diff --git a/src/main/java/org/apache/lucene/russian/morphology/evristics/StatiticsCollectors.java b/src/main/java/org/apache/lucene/russian/morphology/evristics/StatiticsCollectors.java index e3271ff..55f6cae 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/evristics/StatiticsCollectors.java +++ b/src/main/java/org/apache/lucene/russian/morphology/evristics/StatiticsCollectors.java @@ -1,26 +1,42 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.lucene.russian.morphology.evristics; -import org.apache.lucene.russian.morphology.dictonary.WordProccessor; -import org.apache.lucene.russian.morphology.dictonary.WordCard; import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder; +import org.apache.lucene.russian.morphology.dictonary.WordCard; +import org.apache.lucene.russian.morphology.dictonary.WordProccessor; -import java.util.Map; import java.util.HashMap; +import java.util.Map; -public class StatiticsCollectors implements WordProccessor{ - Map statititics = new HashMap(); +public class StatiticsCollectors implements WordProccessor { + Map statititics = new HashMap(); private Integer ignoredCount = 0; public void proccess(WordCard wordCard) { - for(String form:wordCard.getWordsFroms()){ + for (String form : wordCard.getWordsFroms()) { SuffixEvristic suffixEvristic = createEvristic(wordCard.getCanonicalFrom(), form); if (suffixEvristic == null) continue; SuffixCounter suffixCounter = statititics.get(suffixEvristic); - if(suffixCounter == null){ + if (suffixCounter == null) { suffixCounter = new SuffixCounter(suffixEvristic); - statititics.put(suffixEvristic,suffixCounter); + statititics.put(suffixEvristic, suffixCounter); } suffixCounter.incrementAmount(); } @@ -30,19 +46,19 @@ public class StatiticsCollectors implements WordProccessor{ return statititics; } - private SuffixEvristic createEvristic(String word,String form){ + private SuffixEvristic createEvristic(String word, String form) { int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0; String formSuffix = form.substring(startSymbol); - if(word.length() < startSymbol){ + if (word.length() < startSymbol) { ignoredCount++; - return null; + return null; } String wordSuffix = word.length() > startSymbol ? word.substring(startSymbol) : ""; - if (wordSuffix.length() > 12){ + if (wordSuffix.length() > 12) { System.out.println(word + " " + form); return null; } - return new SuffixEvristic(formSuffix,wordSuffix); + return new SuffixEvristic(formSuffix, wordSuffix); } diff --git a/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixCounter.java b/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixCounter.java index 65462a8..11401a8 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixCounter.java +++ b/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixCounter.java @@ -1,10 +1,26 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.lucene.russian.morphology.evristics; /** * Conains information of freqency of suffix evristic - * in dictionary. + * in dictionary. */ -public class SuffixCounter implements Comparable{ +public class SuffixCounter implements Comparable { private SuffixEvristic suffixEvristic; private Double amnout = 0.0; @@ -12,7 +28,7 @@ public class SuffixCounter implements Comparable{ this.suffixEvristic = suffixEvristic; } - public void incrementAmount(){ + public void incrementAmount() { amnout++; } @@ -33,12 +49,12 @@ public class SuffixCounter implements Comparable{ } public int compareTo(Object o) { - if(o instanceof SuffixCounter) return (int) Math.round(Math.signum(((SuffixCounter)o).amnout - amnout)); + if (o instanceof SuffixCounter) return (int) Math.round(Math.signum(((SuffixCounter) o).amnout - amnout)); return -1; } @Override public String toString() { - return ""+amnout + " " + suffixEvristic.toString(); + return "" + amnout + " " + suffixEvristic.toString(); } } diff --git a/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixEvristic.java b/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixEvristic.java index ab5e1df..cc4621d 100644 --- a/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixEvristic.java +++ b/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixEvristic.java @@ -1,3 +1,19 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.lucene.russian.morphology.evristics; /** diff --git a/src/test/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoderTest.java b/src/test/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoderTest.java index bac6fc7..bf77d12 100644 --- a/src/test/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoderTest.java +++ b/src/test/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoderTest.java @@ -1,39 +1,54 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.lucene.russian.morphology; -import org.junit.Test; -import static org.junit.Assert.assertThat; import static org.hamcrest.core.IsEqual.equalTo; -import org.apache.lucene.russian.morphology.SuffixToLongException; +import static org.junit.Assert.assertThat; +import org.junit.Test; -import java.io.InputStream; import java.io.BufferedReader; -import java.io.InputStreamReader; import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; public class RussianSuffixDecoderEncoderTest { @Test public void testShouldCorretDecodeEncode() throws IOException { - InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/decoder-test-data.txt"); - BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream,"UTF-8")); - String s = bufferedReader.readLine(); - while(s != null){ + InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/decoder-test-data.txt"); + BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); + String s = bufferedReader.readLine(); + while (s != null) { String[] qa = s.trim().split(" "); Long ecodedSuffix = RussianSuffixDecoderEncoder.encode(qa[0]); - assertThat(RussianSuffixDecoderEncoder.decode(ecodedSuffix),equalTo(qa[1])); + assertThat(RussianSuffixDecoderEncoder.decode(ecodedSuffix), equalTo(qa[1])); s = bufferedReader.readLine(); } } @Test(expected = SuffixToLongException.class) - public void shouldThrownExeptionIfSuffixToLong(){ - RussianSuffixDecoderEncoder.encode("1234567890123"); + public void shouldThrownExeptionIfSuffixToLong() { + RussianSuffixDecoderEncoder.encode("1234567890123"); } @Test(expected = WrongCharaterException.class) - public void shouldThrownExeptionIfSuffixContainWrongCharater(){ - RussianSuffixDecoderEncoder.encode("1"); - } - + public void shouldThrownExeptionIfSuffixContainWrongCharater() { + RussianSuffixDecoderEncoder.encode("1"); + } + } diff --git a/src/test/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzerTest.java b/src/test/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzerTest.java index 899d65c..61c8d59 100644 --- a/src/test/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzerTest.java +++ b/src/test/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzerTest.java @@ -1,13 +1,28 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.lucene.russian.morphology.analayzer; -import junit.framework.TestCase; -import org.junit.Test; -import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.junit.Test; +import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; -import java.io.BufferedReader; import java.io.InputStreamReader; @@ -17,24 +32,22 @@ public class RussianMorphlogyAnalayzerTest { public void shouldCorrectProccessText() throws IOException { RussianMorphlogyAnalayzer morphlogyAnalayzer = new RussianMorphlogyAnalayzer(); InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/russian-text.txt"); - BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream,"UTF-8")); + BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); final Token reusableToken = new Token(); - Token nextToken; + Token nextToken; TokenStream in = morphlogyAnalayzer.tokenStream(null, bufferedReader); - for (;;) - { - nextToken = in.next(reusableToken); + for (; ;) { + nextToken = in.next(reusableToken); - if (nextToken == null) - { - break; - } + if (nextToken == null) { + break; + } - System.out.println(nextToken.term()); + System.out.println(nextToken.term()); // nextSampleToken = sample.next(reusableSampleToken); // assertEquals( // "Unicode", @@ -42,7 +55,7 @@ public class RussianMorphlogyAnalayzerTest { // nextSampleToken == null // ? null // : nextSampleToken.term()); - } + } } } diff --git a/src/test/java/org/apache/lucene/russian/morphology/analayzer/SuffixEvristicsTest.java b/src/test/java/org/apache/lucene/russian/morphology/analayzer/SuffixEvristicsTest.java index 0d6e367..7191853 100644 --- a/src/test/java/org/apache/lucene/russian/morphology/analayzer/SuffixEvristicsTest.java +++ b/src/test/java/org/apache/lucene/russian/morphology/analayzer/SuffixEvristicsTest.java @@ -1,24 +1,42 @@ +/** + * Copyright 2009 Alexander Kuznetsov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.lucene.russian.morphology.analayzer; -import org.junit.Test; -import static org.junit.Assert.assertThat; -import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder; import static org.hamcrest.core.IsEqual.equalTo; +import static org.junit.Assert.assertThat; +import org.junit.Test; -import java.io.*; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; public class SuffixEvristicsTest { @Test public void testShouldDefineCorretCononicalWordForm() throws IOException { - SuffixEvristics suffixEvristics = new SuffixEvristics(); - InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt"); - BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream,"UTF-8")); - String s = bufferedReader.readLine(); - while(s != null){ + SuffixEvristics suffixEvristics = new SuffixEvristics(); + InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt"); + BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); + String s = bufferedReader.readLine(); + while (s != null) { String[] qa = s.trim().split(" "); - assertThat(suffixEvristics.getCanonicalForm(qa[0]),equalTo(qa[1])); + assertThat(suffixEvristics.getCanonicalForm(qa[0]), equalTo(qa[1])); s = bufferedReader.readLine(); } } diff --git a/src/test/resources/org/apache/lucene/russian/morphology/analayzer/russian-text.txt b/src/test/resources/org/apache/lucene/russian/morphology/analayzer/russian-text.txt index ce77ba2..88e3e54 100644 --- a/src/test/resources/org/apache/lucene/russian/morphology/analayzer/russian-text.txt +++ b/src/test/resources/org/apache/lucene/russian/morphology/analayzer/russian-text.txt @@ -1,5 +1,3 @@ -В условиях нарастающей пурги было сделано 4 успешных захода на посадку. После завершения облета и демонтажа оборудования -Рубен Есаян дал устную оценку эксперимента:"Все нормально, будем рекомендовать систему к внедрению". -Летом - с ноября по март - рейсы из Кейптауна (ЮАР) на станцию "Новолазаревская" (Антарктида) совершаются -примерно один раз в две недели. Туда привозят людей, питание, оборудование, ГСМ и т.д. -что-то \ No newline at end of file +В условиях нарастающей пурги было сделано 4 успешных захода на посадку. "Все нормально, будем рекомендовать систему к внедрению". +Рейсы из Кейптауна (ЮАР) на станцию "Новолазаревская" (Антарктида) совершаются +примерно один раз в две недели. \ No newline at end of file