diff --git a/etc/header.txt b/etc/header.txt
new file mode 100644
index 0000000..76f2dc1
--- /dev/null
+++ b/etc/header.txt
@@ -0,0 +1,13 @@
+Copyright 2009 Alexander Kuznetsov
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/pom.xml b/pom.xml
index 4a32899..2118b34 100644
--- a/pom.xml
+++ b/pom.xml
@@ -4,9 +4,27 @@
org.apache.lucene
russian-morpholgy
jar
- 1.0-SNAPSHOT
+ 0.5-SNAPSHOT
russian-morpholgy
http://maven.apache.org
+
+
+
+ russian-morpholgy
+
+ ../repo/releases
+
+
+
+ russian-morpholgy-snapshots
+
+ ../repo/snapshots
+
+ true
+
+
+
+
junit
@@ -29,8 +47,68 @@
-
+
+
+ maven2-repository.dev.java.net
+ Java.net Repository for Maven
+ http://download.java.net/maven/2/
+
+
+
+
+
+ mc-release
+ maven-license-plugin repository of releases
+ http://mc-repo.googlecode.com/svn/maven2/releases
+
+
+ false
+
+
+
+ true
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+
+
+
+
+ org.codehaus.mojo
+ cobertura-maven-plugin
+
+
+
+
+ org.apache.maven.plugins
+ maven-pmd-plugin
+
+ true
+ utf-8
+ 100
+ 1.5
+
+
+
+
+
+
+
+
+ org.jvnet.wagon-svn
+ wagon-svn
+ 1.8
+
+
+
+
+
org.apache.maven.plugins
maven-compiler-plugin
@@ -39,6 +117,34 @@
1.5
+
+
+ maven-license-plugin
+ com.mathieucarbou.mojo
+
+
+ ${project.parent.basedir}
+
+
+ **/*.txt
+
+
+ **/src/**
+ **/pom.xml
+
+
+
+
+
+ test
+
+ check
+
+
+
+
diff --git a/src/main/java/org/apache/lucene/App.java b/src/main/java/org/apache/lucene/App.java
deleted file mode 100644
index a566430..0000000
--- a/src/main/java/org/apache/lucene/App.java
+++ /dev/null
@@ -1,13 +0,0 @@
-package org.apache.lucene;
-
-/**
- * Hello world!
- *
- */
-public class App
-{
- public static void main( String[] args )
- {
- System.out.println( "Hello World!" );
- }
-}
diff --git a/src/main/java/org/apache/lucene/russian/morphology/EvristicBuilder.java b/src/main/java/org/apache/lucene/russian/morphology/EvristicBuilder.java
index 3633708..0f08828 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/EvristicBuilder.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/EvristicBuilder.java
@@ -1,14 +1,31 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.lucene.russian.morphology;
import org.apache.lucene.russian.morphology.dictonary.DictonaryReader;
import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
+import org.apache.lucene.russian.morphology.evristics.Evristic;
import org.apache.lucene.russian.morphology.evristics.StatiticsCollectors;
import org.apache.lucene.russian.morphology.evristics.SuffixCounter;
-import org.apache.lucene.russian.morphology.evristics.Evristic;
-import java.io.*;
-import java.util.*;
-import java.util.concurrent.atomic.AtomicInteger;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Set;
public class EvristicBuilder {
@@ -23,12 +40,12 @@ public class EvristicBuilder {
Object[] objects = counterCollection.toArray();
Arrays.sort(objects);
System.out.println("Length " + objects.length + " ingored words " + statiticsCollectors.getIgnoredCount());
- for(int i = 0; i < 10; i++){
+ for (int i = 0; i < 10; i++) {
System.out.println(objects[i]);
}
final Evristic evristic = new Evristic();
- for(int i = 0; i < objects.length; i++){
+ for (int i = 0; i < objects.length; i++) {
evristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic());
}
diff --git a/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java b/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java
index 0ff57af..985ce5e 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoder.java
@@ -1,3 +1,19 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.lucene.russian.morphology;
/**
@@ -24,7 +40,7 @@ public class RussianSuffixDecoderEncoder {
c = DASH_CODE;
}
if (c == EE_CHAR) c = E_CHAR;
- if (c < 0 || c > 33) throw new WrongCharaterException();
+ if (c < 0 || c > 33) throw new WrongCharaterException();
result = result * 35L + c;
}
return result;
@@ -44,12 +60,12 @@ public class RussianSuffixDecoderEncoder {
return result;
}
- static public boolean checkCharacter(char c){
- int code = 0 + c;
- if(code == 45) return true;
- code -= RUSSIAN_SMALL_LETTER_OFFSET;
- if(code == 34) return true;
- if(code > 0 && code < 33) return true;
- return false;
+ static public boolean checkCharacter(char c) {
+ int code = 0 + c;
+ if (code == 45) return true;
+ code -= RUSSIAN_SMALL_LETTER_OFFSET;
+ if (code == 34) return true;
+ if (code > 0 && code < 33) return true;
+ return false;
}
}
diff --git a/src/main/java/org/apache/lucene/russian/morphology/SuffixToLongException.java b/src/main/java/org/apache/lucene/russian/morphology/SuffixToLongException.java
index eaaebfe..568ba05 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/SuffixToLongException.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/SuffixToLongException.java
@@ -1,3 +1,19 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.lucene.russian.morphology;
diff --git a/src/main/java/org/apache/lucene/russian/morphology/WrongCharaterException.java b/src/main/java/org/apache/lucene/russian/morphology/WrongCharaterException.java
index 830fb4b..e37c690 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/WrongCharaterException.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/WrongCharaterException.java
@@ -1,7 +1,23 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.lucene.russian.morphology;
-public class WrongCharaterException extends RuntimeException{
+public class WrongCharaterException extends RuntimeException {
public WrongCharaterException() {
}
diff --git a/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzer.java b/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzer.java
index 3337239..d073ca5 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzer.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzer.java
@@ -1,15 +1,31 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.lucene.russian.morphology.analayzer;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.LowerCaseFilter;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
-import java.io.Reader;
import java.io.IOException;
+import java.io.Reader;
-public class RussianMorphlogyAnalayzer extends Analyzer {
+public class RussianMorphlogyAnalayzer extends Analyzer {
private SuffixEvristics suffixEvristics;
public RussianMorphlogyAnalayzer() throws IOException {
@@ -20,6 +36,6 @@ public class RussianMorphlogyAnalayzer extends Analyzer {
TokenStream result = new StandardTokenizer(reader);
result = new StandardFilter(result);
result = new LowerCaseFilter(result);
- return new RussianMorphlogyFilter(result,suffixEvristics);
+ return new RussianMorphlogyFilter(result, suffixEvristics);
}
}
diff --git a/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java b/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java
index b324064..28172c6 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyFilter.java
@@ -1,3 +1,19 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.lucene.russian.morphology.analayzer;
import org.apache.lucene.analysis.Token;
@@ -17,11 +33,11 @@ public class RussianMorphlogyFilter extends TokenFilter {
public Token next(final Token reusableToken) throws IOException {
Token nextToken = input.next(reusableToken);
- if(nextToken == null || nextToken.term().length() == 0) return nextToken;
+ if (nextToken == null || nextToken.term().length() == 0) return nextToken;
String word = nextToken.term();
Character testC = word.charAt(0);
- if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC){
- return nextToken;
+ if (Character.UnicodeBlock.of(testC) != Character.UnicodeBlock.CYRILLIC) {
+ return nextToken;
}
Token current = (Token) nextToken.clone();
return createToken(suffixEvristics.getCanonicalForm(word), current, reusableToken);
diff --git a/src/main/java/org/apache/lucene/russian/morphology/analayzer/SuffixEvristics.java b/src/main/java/org/apache/lucene/russian/morphology/analayzer/SuffixEvristics.java
index e593511..90c8c46 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/analayzer/SuffixEvristics.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/analayzer/SuffixEvristics.java
@@ -1,10 +1,25 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.lucene.russian.morphology.analayzer;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import java.io.*;
import java.util.Arrays;
-import java.util.HashSet;
public class SuffixEvristics {
@@ -46,23 +61,23 @@ public class SuffixEvristics {
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
String suffixS = form.substring(startSymbol);
- if(!chechSuffix(suffixS)) return form;
+ if (!chechSuffix(suffixS)) return form;
Long suffix = RussianSuffixDecoderEncoder.encode(suffixS);
- int index = Arrays.binarySearch(keys,suffix);
- if(index < -1){
+ int index = Arrays.binarySearch(keys, suffix);
+ if (index < -1) {
System.out.println(" " + form);
return form;
- }else{
+ } else {
String nSuffix = RussianSuffixDecoderEncoder.decode(values[index]);
return startSymbol > 0 ? form.substring(0, startSymbol) + nSuffix : nSuffix;
}
}
- private boolean chechSuffix(String suffix){
- for(int i = 0; i < suffix.length(); i++){
+ private boolean chechSuffix(String suffix) {
+ for (int i = 0; i < suffix.length(); i++) {
if (!RussianSuffixDecoderEncoder.checkCharacter(suffix.charAt(i))) return false;
}
return true;
diff --git a/src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java b/src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java
index bbf5679..42a2b11 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/DictonaryReader.java
@@ -1,9 +1,28 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.lucene.russian.morphology.dictonary;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
import java.util.*;
-import java.io.*;
/**
@@ -15,7 +34,7 @@ public class DictonaryReader {
private String fileEncoding = "windows-1251";
private List> wordsFlexias = new ArrayList>();
private List> wordPrefixes = new ArrayList>();
- private Set ingnoredForm = new HashSet();
+ private Set ingnoredForm = new HashSet();
public DictonaryReader(String fileName, Set ingnoredForm) {
this.fileName = fileName;
@@ -35,11 +54,11 @@ public class DictonaryReader {
sckipBlock(bufferedReader);
sckipBlock(bufferedReader);
readPrefix(bufferedReader);
- readWords(bufferedReader,wordProccessor);
+ readWords(bufferedReader, wordProccessor);
}
- private void readWords(BufferedReader reader,WordProccessor wordProccessor) throws IOException {
+ private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
String s = reader.readLine();
int count = Integer.valueOf(s);
for (int i = 0; i < count; i++) {
@@ -54,15 +73,15 @@ public class DictonaryReader {
if (models.size() > 0 && !ingnoredForm.contains(models.get(0).getCode())) {
WordCard card = new WordCard(cleanString(models.get(0).create(word)));
for (FlexiaModel fm : models) {
- card.addFrom(cleanString(fm.create(word)));
+ card.addFrom(cleanString(fm.create(word)));
}
wordProccessor.proccess(card);
}
}
}
- private String cleanString(String s){
- return s.replace((char)(34 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET),(char)(6 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET));
+ private String cleanString(String s) {
+ return s.replace((char) (34 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (6 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET));
}
private void sckipBlock(BufferedReader reader) throws IOException {
@@ -99,8 +118,8 @@ public class DictonaryReader {
private void addFlexia(ArrayList flexiaModelArrayList, String line) {
String[] fl = line.split("\\*");
// we inored all forms thats
- // if (fl.length == 3)
- // flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
+ // if (fl.length == 3)
+ // flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), fl[2].toLowerCase()));
if (fl.length == 2) flexiaModelArrayList.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
}
diff --git a/src/main/java/org/apache/lucene/russian/morphology/dictonary/FlexiaModel.java b/src/main/java/org/apache/lucene/russian/morphology/dictonary/FlexiaModel.java
index e983311..3ae24e1 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/FlexiaModel.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/FlexiaModel.java
@@ -1,3 +1,19 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.lucene.russian.morphology.dictonary;
/**
diff --git a/src/main/java/org/apache/lucene/russian/morphology/dictonary/IgnoredFormReader.java b/src/main/java/org/apache/lucene/russian/morphology/dictonary/IgnoredFormReader.java
index a6568c0..28100b5 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/IgnoredFormReader.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/IgnoredFormReader.java
@@ -1,11 +1,27 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.lucene.russian.morphology.dictonary;
-import java.util.Set;
-import java.util.HashSet;
import java.io.BufferedReader;
-import java.io.InputStreamReader;
import java.io.FileInputStream;
import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.HashSet;
+import java.util.Set;
public class IgnoredFormReader {
diff --git a/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordCard.java b/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordCard.java
index a37b107..770bca3 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordCard.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordCard.java
@@ -1,7 +1,23 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.lucene.russian.morphology.dictonary;
-import java.util.List;
import java.util.ArrayList;
+import java.util.List;
/**
* Represent word and all it forms.
@@ -14,7 +30,7 @@ public class WordCard {
this.canonicalFrom = canonicalFrom;
}
- protected void addFrom(String word){
+ protected void addFrom(String word) {
wordsFroms.add(word);
}
diff --git a/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordProccessor.java b/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordProccessor.java
index 3f93f43..37e769a 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordProccessor.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/dictonary/WordProccessor.java
@@ -1,9 +1,25 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.lucene.russian.morphology.dictonary;
import java.io.IOException;
/**
- * Interface allows get information from
+ * Interface that allows getting information from
* {@org.apache.lucene.russian.morphology.dictonary.DirtonaryReader}.
*/
public interface WordProccessor {
diff --git a/src/main/java/org/apache/lucene/russian/morphology/evristics/Evristic.java b/src/main/java/org/apache/lucene/russian/morphology/evristics/Evristic.java
index 79ce5f4..2ef8a15 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/evristics/Evristic.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/evristics/Evristic.java
@@ -1,9 +1,28 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.lucene.russian.morphology.evristics;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
-import java.util.*;
-import java.io.*;
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.TreeMap;
public class Evristic {
@@ -35,7 +54,7 @@ public class Evristic {
String s = reader.readLine();
while (s != null) {
String[] sfns = s.split(" ");
- if(sfns.length == 2){
+ if (sfns.length == 2) {
encodedSuffixesPairs.put(Long.valueOf(sfns[0]), Long.valueOf(sfns[0]));
}
s = reader.readLine();
@@ -45,8 +64,8 @@ public class Evristic {
public void writeToFile(String file) throws IOException {
FileWriter writer = new FileWriter(file);
- writer.write(encodedSuffixesPairs.size()+"\n");
- for(Long k:encodedSuffixesPairs.keySet()){
+ writer.write(encodedSuffixesPairs.size() + "\n");
+ for (Long k : encodedSuffixesPairs.keySet()) {
writer.write("" + k + " " + encodedSuffixesPairs.get(k) + "\n");
}
writer.close();
diff --git a/src/main/java/org/apache/lucene/russian/morphology/evristics/StatiticsCollectors.java b/src/main/java/org/apache/lucene/russian/morphology/evristics/StatiticsCollectors.java
index e3271ff..55f6cae 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/evristics/StatiticsCollectors.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/evristics/StatiticsCollectors.java
@@ -1,26 +1,42 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.lucene.russian.morphology.evristics;
-import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
-import org.apache.lucene.russian.morphology.dictonary.WordCard;
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
+import org.apache.lucene.russian.morphology.dictonary.WordCard;
+import org.apache.lucene.russian.morphology.dictonary.WordProccessor;
-import java.util.Map;
import java.util.HashMap;
+import java.util.Map;
-public class StatiticsCollectors implements WordProccessor{
- Map statititics = new HashMap();
+public class StatiticsCollectors implements WordProccessor {
+ Map statititics = new HashMap();
private Integer ignoredCount = 0;
public void proccess(WordCard wordCard) {
- for(String form:wordCard.getWordsFroms()){
+ for (String form : wordCard.getWordsFroms()) {
SuffixEvristic suffixEvristic = createEvristic(wordCard.getCanonicalFrom(), form);
if (suffixEvristic == null) continue;
SuffixCounter suffixCounter = statititics.get(suffixEvristic);
- if(suffixCounter == null){
+ if (suffixCounter == null) {
suffixCounter = new SuffixCounter(suffixEvristic);
- statititics.put(suffixEvristic,suffixCounter);
+ statititics.put(suffixEvristic, suffixCounter);
}
suffixCounter.incrementAmount();
}
@@ -30,19 +46,19 @@ public class StatiticsCollectors implements WordProccessor{
return statititics;
}
- private SuffixEvristic createEvristic(String word,String form){
+ private SuffixEvristic createEvristic(String word, String form) {
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
String formSuffix = form.substring(startSymbol);
- if(word.length() < startSymbol){
+ if (word.length() < startSymbol) {
ignoredCount++;
- return null;
+ return null;
}
String wordSuffix = word.length() > startSymbol ? word.substring(startSymbol) : "";
- if (wordSuffix.length() > 12){
+ if (wordSuffix.length() > 12) {
System.out.println(word + " " + form);
return null;
}
- return new SuffixEvristic(formSuffix,wordSuffix);
+ return new SuffixEvristic(formSuffix, wordSuffix);
}
diff --git a/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixCounter.java b/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixCounter.java
index 65462a8..11401a8 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixCounter.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixCounter.java
@@ -1,10 +1,26 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.lucene.russian.morphology.evristics;
/**
* Conains information of freqency of suffix evristic
- * in dictionary.
+ * in dictionary.
*/
-public class SuffixCounter implements Comparable{
+public class SuffixCounter implements Comparable {
private SuffixEvristic suffixEvristic;
private Double amnout = 0.0;
@@ -12,7 +28,7 @@ public class SuffixCounter implements Comparable{
this.suffixEvristic = suffixEvristic;
}
- public void incrementAmount(){
+ public void incrementAmount() {
amnout++;
}
@@ -33,12 +49,12 @@ public class SuffixCounter implements Comparable{
}
public int compareTo(Object o) {
- if(o instanceof SuffixCounter) return (int) Math.round(Math.signum(((SuffixCounter)o).amnout - amnout));
+ if (o instanceof SuffixCounter) return (int) Math.round(Math.signum(((SuffixCounter) o).amnout - amnout));
return -1;
}
@Override
public String toString() {
- return ""+amnout + " " + suffixEvristic.toString();
+ return "" + amnout + " " + suffixEvristic.toString();
}
}
diff --git a/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixEvristic.java b/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixEvristic.java
index ab5e1df..cc4621d 100644
--- a/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixEvristic.java
+++ b/src/main/java/org/apache/lucene/russian/morphology/evristics/SuffixEvristic.java
@@ -1,3 +1,19 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.lucene.russian.morphology.evristics;
/**
diff --git a/src/test/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoderTest.java b/src/test/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoderTest.java
index bac6fc7..bf77d12 100644
--- a/src/test/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoderTest.java
+++ b/src/test/java/org/apache/lucene/russian/morphology/RussianSuffixDecoderEncoderTest.java
@@ -1,39 +1,63 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.lucene.russian.morphology;
-import org.junit.Test;
-import static org.junit.Assert.assertThat;
import static org.hamcrest.core.IsEqual.equalTo;
-import org.apache.lucene.russian.morphology.SuffixToLongException;
+import static org.junit.Assert.assertThat;
+import org.junit.Test;
-import java.io.InputStream;
import java.io.BufferedReader;
-import java.io.InputStreamReader;
import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
public class RussianSuffixDecoderEncoderTest {
+ /**
+ * Reads space-separated pairs from decoder-test-data.txt and checks that encoding
+ * the first column and decoding the result yields the second column.
+ */
@Test
public void testShouldCorretDecodeEncode() throws IOException {
- InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/decoder-test-data.txt");
- BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream,"UTF-8"));
- String s = bufferedReader.readLine();
- while(s != null){
- String[] qa = s.trim().split(" ");
- Long ecodedSuffix = RussianSuffixDecoderEncoder.encode(qa[0]);
- assertThat(RussianSuffixDecoderEncoder.decode(ecodedSuffix),equalTo(qa[1]));
- s = bufferedReader.readLine();
- }
+ InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/decoder-test-data.txt");
+ BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
+ // Close the reader even when an assertion fails so the resource stream is not leaked.
+ try {
+ String s = bufferedReader.readLine();
+ while (s != null) {
+ String[] qa = s.trim().split(" ");
+ Long ecodedSuffix = RussianSuffixDecoderEncoder.encode(qa[0]);
+ assertThat(RussianSuffixDecoderEncoder.decode(ecodedSuffix), equalTo(qa[1]));
+ s = bufferedReader.readLine();
+ }
+ } finally {
+ bufferedReader.close();
+ }
}
@Test(expected = SuffixToLongException.class)
- public void shouldThrownExeptionIfSuffixToLong(){
- RussianSuffixDecoderEncoder.encode("1234567890123");
+ public void shouldThrownExeptionIfSuffixToLong() {
+ RussianSuffixDecoderEncoder.encode("1234567890123");
}
@Test(expected = WrongCharaterException.class)
- public void shouldThrownExeptionIfSuffixContainWrongCharater(){
- RussianSuffixDecoderEncoder.encode("1");
- }
-
+ public void shouldThrownExeptionIfSuffixContainWrongCharater() {
+ RussianSuffixDecoderEncoder.encode("1");
+ }
+
}
diff --git a/src/test/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzerTest.java b/src/test/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzerTest.java
index 899d65c..61c8d59 100644
--- a/src/test/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzerTest.java
+++ b/src/test/java/org/apache/lucene/russian/morphology/analayzer/RussianMorphlogyAnalayzerTest.java
@@ -1,13 +1,28 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.lucene.russian.morphology.analayzer;
-import junit.framework.TestCase;
-import org.junit.Test;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.junit.Test;
+import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
-import java.io.BufferedReader;
import java.io.InputStreamReader;
@@ -17,24 +32,22 @@ public class RussianMorphlogyAnalayzerTest {
public void shouldCorrectProccessText() throws IOException {
RussianMorphlogyAnalayzer morphlogyAnalayzer = new RussianMorphlogyAnalayzer();
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/russian-text.txt");
- BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream,"UTF-8"));
+ BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
final Token reusableToken = new Token();
- Token nextToken;
+ Token nextToken;
TokenStream in = morphlogyAnalayzer.tokenStream(null, bufferedReader);
- for (;;)
- {
- nextToken = in.next(reusableToken);
+ for (; ;) {
+ nextToken = in.next(reusableToken);
- if (nextToken == null)
- {
- break;
- }
+ if (nextToken == null) {
+ break;
+ }
- System.out.println(nextToken.term());
+ System.out.println(nextToken.term());
// nextSampleToken = sample.next(reusableSampleToken);
// assertEquals(
// "Unicode",
@@ -42,7 +55,7 @@ public class RussianMorphlogyAnalayzerTest {
// nextSampleToken == null
// ? null
// : nextSampleToken.term());
- }
+ }
}
}
diff --git a/src/test/java/org/apache/lucene/russian/morphology/analayzer/SuffixEvristicsTest.java b/src/test/java/org/apache/lucene/russian/morphology/analayzer/SuffixEvristicsTest.java
index 0d6e367..7191853 100644
--- a/src/test/java/org/apache/lucene/russian/morphology/analayzer/SuffixEvristicsTest.java
+++ b/src/test/java/org/apache/lucene/russian/morphology/analayzer/SuffixEvristicsTest.java
@@ -1,24 +1,51 @@
+/**
+ * Copyright 2009 Alexander Kuznetsov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.lucene.russian.morphology.analayzer;
-import org.junit.Test;
-import static org.junit.Assert.assertThat;
-import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
import static org.hamcrest.core.IsEqual.equalTo;
+import static org.junit.Assert.assertThat;
+import org.junit.Test;
-import java.io.*;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
public class SuffixEvristicsTest {
+ /**
+ * Reads space-separated pairs from suffix-evristics-test-data.txt and checks that
+ * getCanonicalForm of the first column equals the second column.
+ */
@Test
public void testShouldDefineCorretCononicalWordForm() throws IOException {
- SuffixEvristics suffixEvristics = new SuffixEvristics();
- InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt");
- BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream,"UTF-8"));
- String s = bufferedReader.readLine();
- while(s != null){
- String[] qa = s.trim().split(" ");
- assertThat(suffixEvristics.getCanonicalForm(qa[0]),equalTo(qa[1]));
- s = bufferedReader.readLine();
- }
+ SuffixEvristics suffixEvristics = new SuffixEvristics();
+ InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt");
+ BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
+ // Close the reader even when an assertion fails so the resource stream is not leaked.
+ try {
+ String s = bufferedReader.readLine();
+ while (s != null) {
+ String[] qa = s.trim().split(" ");
+ assertThat(suffixEvristics.getCanonicalForm(qa[0]), equalTo(qa[1]));
+ s = bufferedReader.readLine();
+ }
+ } finally {
+ bufferedReader.close();
+ }
}
diff --git a/src/test/resources/org/apache/lucene/russian/morphology/analayzer/russian-text.txt b/src/test/resources/org/apache/lucene/russian/morphology/analayzer/russian-text.txt
index ce77ba2..88e3e54 100644
--- a/src/test/resources/org/apache/lucene/russian/morphology/analayzer/russian-text.txt
+++ b/src/test/resources/org/apache/lucene/russian/morphology/analayzer/russian-text.txt
@@ -1,5 +1,3 @@
-В условиях нарастающей пурги было сделано 4 успешных захода на посадку. После завершения облета и демонтажа оборудования
-Рубен Есаян дал устную оценку эксперимента:"Все нормально, будем рекомендовать систему к внедрению".
-Летом - с ноября по март - рейсы из Кейптауна (ЮАР) на станцию "Новолазаревская" (Антарктида) совершаются
-примерно один раз в две недели. Туда привозят людей, питание, оборудование, ГСМ и т.д.
-что-то
\ No newline at end of file
+В условиях нарастающей пурги было сделано 4 успешных захода на посадку. "Все нормально, будем рекомендовать систему к внедрению".
+Рейсы из Кейптауна (ЮАР) на станцию "Новолазаревская" (Антарктида) совершаются
+примерно один раз в две недели.
\ No newline at end of file