Compare commits
	
		
			4 Commits
		
	
	
		
			morphology
			...
			ambiguousl
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|   | f095cbe7c0 | ||
|   | 3b2e48821a | ||
|   | 6ca2b27781 | ||
|   | 393665f08a | 
							
								
								
									
										70
									
								
								README.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										70
									
								
								README.md
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,70 @@ | |||||||
|  | # Russian Morphology for lucene | ||||||
|  |  | ||||||
|  | Russian and English morphology for Java and the Lucene 3.0 framework, based on the open-source dictionary from the [АОТ](http://aot.ru) site. It uses dictionary-based morphology with some heuristics for unknown words. It supports homonyms: for example, for the Russian word "вина" it gives two variants, "вино" and "вина". | ||||||
|  |  | ||||||
|  |  | ||||||
|  | ### How to use | ||||||
|  |  | ||||||
|  | First download  | ||||||
|  | [morph-1.1.jar](https://bintray.com/artifact/download/akuznetsov/russianmorphology/org/apache/lucene/morphology/morph/1.1/morph-1.1.jar)   | ||||||
|  | and add it to your class path. Then download the [Russian](https://bintray.com/artifact/download/akuznetsov/russianmorphology/org/apache/lucene/morphology/russian/1.1/russian-1.1.jar) or  | ||||||
|  | [English](https://bintray.com/artifact/download/akuznetsov/russianmorphology/org/apache/lucene/morphology/english/1.1/english-1.1.jar) package.  | ||||||
|  |  | ||||||
|  | If you use maven you can add dependency  | ||||||
|  |  | ||||||
|  |         <dependency> | ||||||
|  |             <groupId>org.apache.lucene.morphology</groupId> | ||||||
|  |             <artifactId>russian</artifactId> | ||||||
|  |             <version>1.1</version> | ||||||
|  |         </dependency> | ||||||
|  |  | ||||||
|  |  | ||||||
|  |         <dependency> | ||||||
|  |             <groupId>org.apache.lucene.morphology</groupId> | ||||||
|  |             <artifactId>english</artifactId> | ||||||
|  |             <version>1.1</version> | ||||||
|  |         </dependency> | ||||||
|  |  | ||||||
|  | Don't forget to add the repository to your pom | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     <repositories> | ||||||
|  |     ............... | ||||||
|  |       <repository> | ||||||
|  |         <snapshots> | ||||||
|  |           <enabled>false</enabled> | ||||||
|  |         </snapshots> | ||||||
|  |         <id>bintray-akuznetsov-russianmorphology</id> | ||||||
|  |         <name>bintray</name> | ||||||
|  |         <url>http://dl.bintray.com/akuznetsov/russianmorphology</url> | ||||||
|  |       </repository> | ||||||
|  |     </repositories> | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  | Now you can create a Lucene Analyzer  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |       RussianAnalyzer russian = new RussianAnalyzer(); | ||||||
|  |       EnglishAnalyzer english = new EnglishAnalyzer(); | ||||||
|  |  | ||||||
|  |  | ||||||
|  | You can write your own analyzer using a filter that converts each word into its correct forms.  | ||||||
|  |  | ||||||
|  |       LuceneMorphology luceneMorph = new EnglishLuceneMorphology(); | ||||||
|  |       TokenStream tokenStream = new MorphologyFilter(result, luceneMorph); | ||||||
|  |  | ||||||
|  | Because a LuceneMorphology instance usually holds a lot of data that it needs in order to work, it is better not to create a new one for each MorphologyFilter; create it once and reuse it. | ||||||
|  |  | ||||||
|  | Also, if you need to get a list of the base forms of a word, you can use the following example  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |      LuceneMorphology luceneMorph = new EnglishLuceneMorphology(); | ||||||
|  |      List<String> wordBaseForms = luceneMorph.getMorphInfo(word); | ||||||
|  |  | ||||||
|  |  | ||||||
|  | ### Restrictions | ||||||
|  |    | ||||||
|  |   * It works only with UTF-8. | ||||||
|  |   * It assumes that the letters е and ё are the same. | ||||||
|  |   * Word forms with prefixes, like "наибольший", are treated as separate words.  | ||||||
							
								
								
									
										36
									
								
								context/pom.xml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										36
									
								
								context/pom.xml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,36 @@ | |||||||
|  | <?xml version="1.0"?> | ||||||
|  | <project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" | ||||||
|  |          xmlns="http://maven.apache.org/POM/4.0.0" | ||||||
|  |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> | ||||||
|  |     <modelVersion>4.0.0</modelVersion> | ||||||
|  |     <parent> | ||||||
|  |         <groupId>org.apache.lucene.morphology</groupId> | ||||||
|  |         <artifactId>morphology</artifactId> | ||||||
|  |         <version>1.2-SNAPSHOT</version> | ||||||
|  |     </parent> | ||||||
|  |     <groupId>org.apache.lucene.morphology</groupId> | ||||||
|  |     <artifactId>context</artifactId> | ||||||
|  |     <version>1.0-SNAPSHOT</version> | ||||||
|  |     <name>context</name> | ||||||
|  |     <url>http://maven.apache.org</url> | ||||||
|  |     <dependencies> | ||||||
|  |         <dependency> | ||||||
|  |             <groupId>junit</groupId> | ||||||
|  |             <artifactId>junit</artifactId> | ||||||
|  |             <version>4.8.2</version> | ||||||
|  |             <scope>test</scope> | ||||||
|  |         </dependency> | ||||||
|  |         <dependency> | ||||||
|  |             <groupId>org.apache.lucene.morphology</groupId> | ||||||
|  |             <artifactId>russian</artifactId> | ||||||
|  |             <version>1.2-SNAPSHOT</version> | ||||||
|  |             <scope>test</scope> | ||||||
|  |         </dependency> | ||||||
|  |         <dependency> | ||||||
|  |             <groupId>org.apache.lucene.morphology</groupId> | ||||||
|  |             <artifactId>english</artifactId> | ||||||
|  |             <version>1.2-SNAPSHOT</version> | ||||||
|  |             <scope>test</scope> | ||||||
|  |         </dependency> | ||||||
|  |     </dependencies> | ||||||
|  | </project> | ||||||
| @@ -0,0 +1,52 @@ | |||||||
|  | /** | ||||||
|  |  * Copyright 2015 Alexander Kuznetsov | ||||||
|  |  * | ||||||
|  |  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  |  * you may not use this file except in compliance with the License. | ||||||
|  |  * You may obtain a copy of the License at | ||||||
|  |  * | ||||||
|  |  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  |  * | ||||||
|  |  * Unless required by applicable law or agreed to in writing, software | ||||||
|  |  * distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  |  * See the License for the specific language governing permissions and | ||||||
|  |  * limitations under the License. | ||||||
|  |  */ | ||||||
|  | package org.apache.lucene.morphology.context; | ||||||
|  |  | ||||||
|  | import org.apache.lucene.analysis.Analyzer; | ||||||
|  | import org.apache.lucene.analysis.TokenStream; | ||||||
|  | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||||||
|  | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | ||||||
|  |  | ||||||
|  | import java.io.ByteArrayInputStream; | ||||||
|  | import java.io.IOException; | ||||||
|  | import java.io.InputStreamReader; | ||||||
|  | import java.util.ArrayList; | ||||||
|  | import java.util.LinkedList; | ||||||
|  | import java.util.List; | ||||||
|  |  | ||||||
|  | public class CalculateContextItem { | ||||||
|  |  | ||||||
|  |     public List<ContextItem> createContextItems(String text) throws IOException { | ||||||
|  |         Analyzer statAnalyzer = new StatAnalyzer(); | ||||||
|  |         InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год? - и что я жду тебя, где вино".getBytes()), "UTF-8"); | ||||||
|  |  | ||||||
|  |  | ||||||
|  | //        new RussianMorphology(); | ||||||
|  |  | ||||||
|  |         TokenStream tokenStream = statAnalyzer.tokenStream(null, reader); | ||||||
|  |         tokenStream.reset(); | ||||||
|  |  | ||||||
|  |         List<List<String>> listedLink =  new LinkedList<>(); | ||||||
|  |         while (tokenStream.incrementToken()) { | ||||||
|  |             CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class); | ||||||
|  |             PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class); | ||||||
|  |  | ||||||
|  |  | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         return null; | ||||||
|  |     } | ||||||
|  | } | ||||||
| @@ -0,0 +1,80 @@ | |||||||
|  | /** | ||||||
|  |  * Copyright 2015 Alexander Kuznetsov | ||||||
|  |  * | ||||||
|  |  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  |  * you may not use this file except in compliance with the License. | ||||||
|  |  * You may obtain a copy of the License at | ||||||
|  |  * | ||||||
|  |  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  |  * | ||||||
|  |  * Unless required by applicable law or agreed to in writing, software | ||||||
|  |  * distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  |  * See the License for the specific language governing permissions and | ||||||
|  |  * limitations under the License. | ||||||
|  |  */ | ||||||
|  | package org.apache.lucene.morphology.context; | ||||||
|  |  | ||||||
|  | import java.util.Arrays; | ||||||
|  |  | ||||||
|  | /** | ||||||
|  |  * A context made of several rows of morphological info strings. | ||||||
|  |  * Two items are equal when they have the same number of rows and every pair of | ||||||
|  |  * rows is equal according to {@link Arrays#equals(Object[], Object[])}. | ||||||
|  |  */ | ||||||
|  | public class ContextItem implements Comparable<ContextItem> { | ||||||
|  |     String[][] morphInfo; | ||||||
|  |  | ||||||
|  |     public ContextItem(String[][] morphInfo) { | ||||||
|  |         this.morphInfo = morphInfo; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /** @return the backing array itself (not a copy) */ | ||||||
|  |     public String[][] getMorphInfo() { | ||||||
|  |         return morphInfo; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     public void setMorphInfo(String[][] morphInfo) { | ||||||
|  |         this.morphInfo = morphInfo; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /** | ||||||
|  |      * Hash over all strings in row order. Null rows and null strings are tolerated, | ||||||
|  |      * because {@link #equals(Object)} accepts them too; for data without nulls the | ||||||
|  |      * value is the same as before. | ||||||
|  |      */ | ||||||
|  |     @Override | ||||||
|  |     public int hashCode() { | ||||||
|  |         int h = 0; | ||||||
|  |         for (String[] row : morphInfo) { | ||||||
|  |             if (row == null) { | ||||||
|  |                 continue; | ||||||
|  |             } | ||||||
|  |             for (String s : row) { | ||||||
|  |                 h = 31 * h + (s == null ? 0 : s.hashCode()); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |         return h; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     @Override | ||||||
|  |     public boolean equals(Object o) { | ||||||
|  |         if (this == o) return true; | ||||||
|  |         if (o == null || getClass() != o.getClass()) return false; | ||||||
|  |  | ||||||
|  |         ContextItem that = (ContextItem) o; | ||||||
|  |  | ||||||
|  |         if (that.morphInfo.length != this.morphInfo.length) { | ||||||
|  |             return false; | ||||||
|  |         } | ||||||
|  |         for (int i = 0; i < morphInfo.length; i++) { | ||||||
|  |             if (!Arrays.equals(morphInfo[i], that.morphInfo[i])) { | ||||||
|  |                 return false; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         return true; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     /** | ||||||
|  |      * Orders items with more rows first, then rows with more strings first, | ||||||
|  |      * then by natural string order. Unlike {@link #equals(Object)}, this method | ||||||
|  |      * does not accept null rows or null strings. | ||||||
|  |      */ | ||||||
|  |     @Override | ||||||
|  |     public int compareTo(ContextItem o) { | ||||||
|  |         // Lengths are never negative, so the subtractions cannot overflow. | ||||||
|  |         int i = o.morphInfo.length - morphInfo.length; | ||||||
|  |         if (i != 0) return i; | ||||||
|  |         for (int j = 0; j < morphInfo.length; j++) { | ||||||
|  |             i = o.morphInfo[j].length - morphInfo[j].length; | ||||||
|  |             if (i != 0) return i; | ||||||
|  |             for (int k = 0; k < morphInfo[j].length; k++) { | ||||||
|  |                 i = morphInfo[j][k].compareTo(o.morphInfo[j][k]); | ||||||
|  |                 if (i != 0) return i; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |         return 0; | ||||||
|  |     } | ||||||
|  | } | ||||||
| @@ -0,0 +1,37 @@ | |||||||
|  | /** | ||||||
|  |  * Copyright 2015 Alexander Kuznetsov | ||||||
|  |  * | ||||||
|  |  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  |  * you may not use this file except in compliance with the License. | ||||||
|  |  * You may obtain a copy of the License at | ||||||
|  |  * | ||||||
|  |  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  |  * | ||||||
|  |  * Unless required by applicable law or agreed to in writing, software | ||||||
|  |  * distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  |  * See the License for the specific language governing permissions and | ||||||
|  |  * limitations under the License. | ||||||
|  |  */ | ||||||
|  | package org.apache.lucene.morphology.context; | ||||||
|  |  | ||||||
|  | /** | ||||||
|  |  * Mutable holder that pairs an array of morphological info strings with a probability. | ||||||
|  |  */ | ||||||
|  | public class ContextStats { | ||||||
|  |     String[] morphInfo; | ||||||
|  |     double prob; | ||||||
|  |  | ||||||
|  |     /** @return the stored array itself (not a copy) */ | ||||||
|  |     public String[] getMorphInfo() { | ||||||
|  |         return this.morphInfo; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /** @param morphInfo the array to store; it is kept by reference */ | ||||||
|  |     public void setMorphInfo(String[] morphInfo) { | ||||||
|  |         this.morphInfo = morphInfo; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /** @return the stored probability value */ | ||||||
|  |     public double getProb() { | ||||||
|  |         return this.prob; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /** @param prob the probability value to store */ | ||||||
|  |     public void setProb(double prob) { | ||||||
|  |         this.prob = prob; | ||||||
|  |     } | ||||||
|  | } | ||||||
| @@ -0,0 +1,21 @@ | |||||||
|  | /** | ||||||
|  |  * Copyright 2015 Alexander Kuznetsov | ||||||
|  |  * | ||||||
|  |  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  |  * you may not use this file except in compliance with the License. | ||||||
|  |  * You may obtain a copy of the License at | ||||||
|  |  * | ||||||
|  |  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  |  * | ||||||
|  |  * Unless required by applicable law or agreed to in writing, software | ||||||
|  |  * distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  |  * See the License for the specific language governing permissions and | ||||||
|  |  * limitations under the License. | ||||||
|  |  */ | ||||||
|  | package org.apache.lucene.morphology.context; | ||||||
|  |  | ||||||
|  | /** | ||||||
|  |  * Empty placeholder; it has no members yet. | ||||||
|  |  */ | ||||||
|  | public class ProbClalucator { | ||||||
|  | } | ||||||
| @@ -0,0 +1,116 @@ | |||||||
|  | /** | ||||||
|  |  * Copyright 2015 Alexander Kuznetsov | ||||||
|  |  * | ||||||
|  |  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  |  * you may not use this file except in compliance with the License. | ||||||
|  |  * You may obtain a copy of the License at | ||||||
|  |  * | ||||||
|  |  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  |  * | ||||||
|  |  * Unless required by applicable law or agreed to in writing, software | ||||||
|  |  * distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  |  * See the License for the specific language governing permissions and | ||||||
|  |  * limitations under the License. | ||||||
|  |  */ | ||||||
|  | package org.apache.lucene.morphology.context; | ||||||
|  |  | ||||||
|  | import org.apache.lucene.analysis.Tokenizer; | ||||||
|  | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||||||
|  | import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl; | ||||||
|  | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | ||||||
|  | import org.apache.lucene.util.AttributeFactory; | ||||||
|  |  | ||||||
|  | import java.io.BufferedReader; | ||||||
|  | import java.io.IOException; | ||||||
|  | import java.util.Arrays; | ||||||
|  | import java.util.HashSet; | ||||||
|  | import java.util.LinkedList; | ||||||
|  | import java.util.Set; | ||||||
|  |  | ||||||
|  | /** | ||||||
|  |  * Tokenizer that reads the whole input, splits it on the characters in | ||||||
|  |  * {@link #SEPARATION_LETTERS} and emits the words in between as terms. | ||||||
|  |  * Separators that are also listed in {@link #MEANING_CHARS} are emitted as | ||||||
|  |  * one-character terms of their own. A '.' separates only when it is followed | ||||||
|  |  * by a separator; otherwise it stays part of the current word. | ||||||
|  |  */ | ||||||
|  | public class SimpleTokenizer extends Tokenizer { | ||||||
|  |  | ||||||
|  |     private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); | ||||||
|  |     private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class); | ||||||
|  |     // Terms still to be emitted; null until the input has been read after reset(). | ||||||
|  |     LinkedList<String> terms; | ||||||
|  |  | ||||||
|  |     public final static Set<Character> SEPARATION_LETTERS = new HashSet<>(Arrays.asList(' ', '(', ')', ',', '|', '\t', | ||||||
|  |             '\n', '"', ':', '!', '?', ',', ';', '•')); | ||||||
|  |  | ||||||
|  |     public final static Set<Character> MEANING_CHARS = new HashSet<>(Arrays.asList('(', ')', ',', '|', | ||||||
|  |             '"', ':', '!', '?', ',', ';', '•', '.')); | ||||||
|  |  | ||||||
|  |     public SimpleTokenizer() { | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     public SimpleTokenizer(AttributeFactory factory) { | ||||||
|  |         super(factory); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /** | ||||||
|  |      * Emits the next term, reading and splitting the whole input on the first call. | ||||||
|  |      * | ||||||
|  |      * @return {@code true} if a term was emitted, {@code false} when no terms are left | ||||||
|  |      * @throws IOException if reading the input fails | ||||||
|  |      */ | ||||||
|  |     @Override | ||||||
|  |     final public boolean incrementToken() throws IOException { | ||||||
|  |         if (terms == null) { | ||||||
|  |             createTeams(); | ||||||
|  |         } | ||||||
|  |         if (terms.isEmpty()) { | ||||||
|  |             return false; | ||||||
|  |         } | ||||||
|  |         // Token producers must clear all attributes before filling in a new token. | ||||||
|  |         clearAttributes(); | ||||||
|  |         termAtt.setEmpty(); | ||||||
|  |         termAtt.append(terms.poll()); | ||||||
|  |         posAtt.setPositionIncrement(1); | ||||||
|  |         return true; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /** Reads the entire input (lines joined with a space) and fills {@link #terms} in text order. */ | ||||||
|  |     private void createTeams() throws IOException { | ||||||
|  |         terms = new LinkedList<>(); | ||||||
|  |  | ||||||
|  |         BufferedReader br = new BufferedReader(input); | ||||||
|  |         StringBuilder sb = new StringBuilder(); | ||||||
|  |         String line; | ||||||
|  |         while ((line = br.readLine()) != null) { | ||||||
|  |             // The trailing space also guarantees that the last word gets flushed below. | ||||||
|  |             sb.append(line).append(" "); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         String s = sb.toString(); | ||||||
|  |         StringBuilder currentTerm = new StringBuilder(); | ||||||
|  |         for (int i = 0; i < s.length(); i++) { | ||||||
|  |             if (checkIsCharSepartor(s, i)) { | ||||||
|  |                 // Flush the word first so that it precedes the punctuation that follows it. | ||||||
|  |                 if (currentTerm.length() > 0) { | ||||||
|  |                     terms.add(currentTerm.toString()); | ||||||
|  |                     currentTerm.setLength(0); | ||||||
|  |                 } | ||||||
|  |                 if (checkIsCharHasMeaning(s, i)) { | ||||||
|  |                     terms.add(s.substring(i, i + 1)); | ||||||
|  |                 } | ||||||
|  |             } else { | ||||||
|  |                 currentTerm.append(s.charAt(i)); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /** @return whether the character at {@code i} is emitted as a term of its own */ | ||||||
|  |     private boolean checkIsCharHasMeaning(String s, int i) { | ||||||
|  |         return MEANING_CHARS.contains(s.charAt(i)); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /** @return whether the character at {@code i} ends the current word */ | ||||||
|  |     private boolean checkIsCharSepartor(String s, int i) { | ||||||
|  |         char c = s.charAt(i); | ||||||
|  |         if (SEPARATION_LETTERS.contains(c)) { | ||||||
|  |             return true; | ||||||
|  |         } | ||||||
|  |         // A dot separates only when a separator follows it. | ||||||
|  |         return '.' == c | ||||||
|  |                 && s.length() > i + 1 | ||||||
|  |                 && SEPARATION_LETTERS.contains(s.charAt(i + 1)); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /** Drops the buffered terms so that the next input is read from scratch. */ | ||||||
|  |     @Override | ||||||
|  |     public void reset() throws IOException { | ||||||
|  |         this.terms = null; | ||||||
|  |         super.reset(); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  | } | ||||||
| @@ -0,0 +1,34 @@ | |||||||
|  | package org.apache.lucene.morphology.context; | ||||||
|  |  | ||||||
|  | import org.apache.lucene.analysis.Analyzer; | ||||||
|  | import org.apache.lucene.analysis.TokenFilter; | ||||||
|  | import org.apache.lucene.analysis.core.LowerCaseFilter; | ||||||
|  | import org.apache.lucene.analysis.standard.StandardFilter; | ||||||
|  |  | ||||||
|  | import java.io.IOException; | ||||||
|  | import java.io.Reader; | ||||||
|  |  | ||||||
|  | /** | ||||||
|  |  * Analyzer that tokenizes with {@link SimpleTokenizer} and then applies | ||||||
|  |  * {@link StandardFilter} and {@link LowerCaseFilter}. | ||||||
|  |  */ | ||||||
|  | public class StatAnalyzer extends Analyzer { | ||||||
|  |  | ||||||
|  |     /** | ||||||
|  |      * Builds the analysis chain; the field name is not used. | ||||||
|  |      * | ||||||
|  |      * @param s the field name (ignored) | ||||||
|  |      * @return the tokenizer together with the end of the filter chain | ||||||
|  |      */ | ||||||
|  |     @Override | ||||||
|  |     protected TokenStreamComponents createComponents(String s) { | ||||||
|  |         SimpleTokenizer src = new SimpleTokenizer(); | ||||||
|  |         TokenFilter filter = new StandardFilter(src); | ||||||
|  |         filter = new LowerCaseFilter(filter); | ||||||
|  |  | ||||||
|  |         // No subclass needed: the previous setReader override only delegated to super. | ||||||
|  |         return new TokenStreamComponents(src, filter); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  | } | ||||||
| @@ -0,0 +1,32 @@ | |||||||
|  | package org.apache.lucene.morphology.context; | ||||||
|  |  | ||||||
|  |  | ||||||
|  | import org.apache.lucene.analysis.Analyzer; | ||||||
|  | import org.apache.lucene.analysis.TokenStream; | ||||||
|  | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||||||
|  | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | ||||||
|  | import org.junit.Test; | ||||||
|  |  | ||||||
|  | import java.io.ByteArrayInputStream; | ||||||
|  | import java.io.IOException; | ||||||
|  | import java.io.InputStreamReader; | ||||||
|  |  | ||||||
|  | /** | ||||||
|  |  * Checks that {@link StatAnalyzer} splits a Russian sentence into words and punctuation. | ||||||
|  |  */ | ||||||
|  | public class SimpleTokenizerTest { | ||||||
|  |  | ||||||
|  |     @Test | ||||||
|  |     public void testSimpleTokenizer() throws IOException { | ||||||
|  |         String text = "принеси мне вина на новый год? - и что я жду тебя, где вино"; | ||||||
|  |         // Encode with the same charset the reader decodes with, regardless of the platform default. | ||||||
|  |         InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(text.getBytes("UTF-8")), "UTF-8"); | ||||||
|  |  | ||||||
|  |         java.util.List<String> tokens = new java.util.ArrayList<>(); | ||||||
|  |         try (Analyzer statAnalyzer = new StatAnalyzer(); | ||||||
|  |              TokenStream tokenStream = statAnalyzer.tokenStream(null, reader)) { | ||||||
|  |             CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class); | ||||||
|  |             tokenStream.reset(); | ||||||
|  |             while (tokenStream.incrementToken()) { | ||||||
|  |                 tokens.add(charTerm.toString()); | ||||||
|  |             } | ||||||
|  |             tokenStream.end(); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         // Order-independent checks: words are kept and punctuation becomes its own token. | ||||||
|  |         org.junit.Assert.assertTrue(tokens.contains("принеси")); | ||||||
|  |         org.junit.Assert.assertTrue(tokens.contains("вина")); | ||||||
|  |         org.junit.Assert.assertTrue(tokens.contains("год")); | ||||||
|  |         org.junit.Assert.assertTrue(tokens.contains("?")); | ||||||
|  |         org.junit.Assert.assertTrue(tokens.contains(",")); | ||||||
|  |         org.junit.Assert.assertTrue(tokens.contains("вино")); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  | } | ||||||
| @@ -3,27 +3,27 @@ | |||||||
|     <parent> |     <parent> | ||||||
|         <artifactId>morphology</artifactId> |         <artifactId>morphology</artifactId> | ||||||
|         <groupId>org.apache.lucene.morphology</groupId> |         <groupId>org.apache.lucene.morphology</groupId> | ||||||
|         <version>1.1</version> |         <version>1.2-SNAPSHOT</version> | ||||||
|     </parent> |     </parent> | ||||||
|     <modelVersion>4.0.0</modelVersion> |     <modelVersion>4.0.0</modelVersion> | ||||||
|     <groupId>org.apache.lucene.morphology</groupId> |     <groupId>org.apache.lucene.morphology</groupId> | ||||||
|     <artifactId>dictionary-reader</artifactId> |     <artifactId>dictionary-reader</artifactId> | ||||||
|     <name>dictionary-reader</name> |     <name>dictionary-reader</name> | ||||||
|     <version>1.1</version> |     <version>1.2-SNAPSHOT</version> | ||||||
|     <url>http://maven.apache.org</url> |     <url>http://maven.apache.org</url> | ||||||
|  |  | ||||||
|     <dependencies> |     <dependencies> | ||||||
|         <dependency> |         <dependency> | ||||||
|             <groupId>org.apache.lucene.morphology</groupId> |             <groupId>org.apache.lucene.morphology</groupId> | ||||||
|             <artifactId>russian</artifactId> |             <artifactId>russian</artifactId> | ||||||
|             <version>1.1</version> |             <version>1.2-SNAPSHOT</version> | ||||||
|         </dependency> |         </dependency> | ||||||
|  |  | ||||||
|  |  | ||||||
|         <dependency> |         <dependency> | ||||||
|             <groupId>org.apache.lucene.morphology</groupId> |             <groupId>org.apache.lucene.morphology</groupId> | ||||||
|             <artifactId>english</artifactId> |             <artifactId>english</artifactId> | ||||||
|             <version>1.1</version> |             <version>1.2-SNAPSHOT</version> | ||||||
|         </dependency> |         </dependency> | ||||||
|     </dependencies> |     </dependencies> | ||||||
|  |  | ||||||
|   | |||||||
| @@ -3,20 +3,20 @@ | |||||||
|     <parent> |     <parent> | ||||||
|         <artifactId>morphology</artifactId> |         <artifactId>morphology</artifactId> | ||||||
|         <groupId>org.apache.lucene.morphology</groupId> |         <groupId>org.apache.lucene.morphology</groupId> | ||||||
|         <version>1.1</version> |         <version>1.2-SNAPSHOT</version> | ||||||
|     </parent> |     </parent> | ||||||
|     <modelVersion>4.0.0</modelVersion> |     <modelVersion>4.0.0</modelVersion> | ||||||
|     <groupId>org.apache.lucene.morphology</groupId> |     <groupId>org.apache.lucene.morphology</groupId> | ||||||
|     <artifactId>english</artifactId> |     <artifactId>english</artifactId> | ||||||
|     <name>english</name> |     <name>english</name> | ||||||
|     <version>1.1</version> |     <version>1.2-SNAPSHOT</version> | ||||||
|     <url>http://maven.apache.org</url> |     <url>http://maven.apache.org</url> | ||||||
|     <dependencies> |     <dependencies> | ||||||
|  |  | ||||||
|         <dependency> |         <dependency> | ||||||
|             <groupId>org.apache.lucene.morphology</groupId> |             <groupId>org.apache.lucene.morphology</groupId> | ||||||
|             <artifactId>morph</artifactId> |             <artifactId>morph</artifactId> | ||||||
|             <version>1.1</version> |             <version>1.2-SNAPSHOT</version> | ||||||
|         </dependency> |         </dependency> | ||||||
|  |  | ||||||
|         <dependency> |         <dependency> | ||||||
|   | |||||||
| @@ -1,4 +1,4 @@ | |||||||
| Copyright 2009 Alexander Kuznetsov  | Copyright ${project.inceptionYear} ${owner} | ||||||
|  |  | ||||||
| Licensed under the Apache License, Version 2.0 (the "License"); | Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
| you may not use this file except in compliance with the License. | you may not use this file except in compliance with the License. | ||||||
|   | |||||||
| @@ -3,13 +3,13 @@ | |||||||
|     <parent> |     <parent> | ||||||
|         <artifactId>morphology</artifactId> |         <artifactId>morphology</artifactId> | ||||||
|         <groupId>org.apache.lucene.morphology</groupId> |         <groupId>org.apache.lucene.morphology</groupId> | ||||||
|         <version>1.1</version> |         <version>1.2-SNAPSHOT</version> | ||||||
|     </parent> |     </parent> | ||||||
|     <modelVersion>4.0.0</modelVersion> |     <modelVersion>4.0.0</modelVersion> | ||||||
|     <groupId>org.apache.lucene.morphology</groupId> |     <groupId>org.apache.lucene.morphology</groupId> | ||||||
|     <artifactId>morph</artifactId> |     <artifactId>morph</artifactId> | ||||||
|     <name>morph</name> |     <name>morph</name> | ||||||
|     <version>1.1</version> |     <version>1.2-SNAPSHOT</version> | ||||||
|     <url>http://maven.apache.org</url> |     <url>http://maven.apache.org</url> | ||||||
|  |  | ||||||
| </project> | </project> | ||||||
|   | |||||||
| @@ -51,25 +51,6 @@ public class MorphologyAnalyzer extends Analyzer { | |||||||
|     protected TokenStreamComponents createComponents(String s) { |     protected TokenStreamComponents createComponents(String s) { | ||||||
|  |  | ||||||
|         StandardTokenizer src = new StandardTokenizer(); |         StandardTokenizer src = new StandardTokenizer(); | ||||||
|         final PayloadEncoder encoder = new PayloadEncoder() { |  | ||||||
|             @Override |  | ||||||
|             public BytesRef encode(char[] buffer) { |  | ||||||
|                 final Float payload = Float.valueOf(new String(buffer)); |  | ||||||
|                 System.out.println(payload); |  | ||||||
|                 final byte[] bytes = PayloadHelper.encodeFloat(payload); |  | ||||||
|                 return new BytesRef(bytes, 0, bytes.length); |  | ||||||
|             } |  | ||||||
|  |  | ||||||
|             @Override |  | ||||||
|             public BytesRef encode(char[] buffer, int offset, int length) { |  | ||||||
|  |  | ||||||
|                 final Float payload = Float.valueOf(new String(buffer, offset, length)); |  | ||||||
|                 System.out.println(payload); |  | ||||||
|                 final byte[] bytes = PayloadHelper.encodeFloat(payload); |  | ||||||
|  |  | ||||||
|                 return new BytesRef(bytes, 0, bytes.length); |  | ||||||
|             } |  | ||||||
|         }; |  | ||||||
|         TokenFilter filter = new StandardFilter(src); |         TokenFilter filter = new StandardFilter(src); | ||||||
|         filter = new LowerCaseFilter(filter); |         filter = new LowerCaseFilter(filter); | ||||||
|         filter = new MorphologyFilter(filter, luceneMorph); |         filter = new MorphologyFilter(filter, luceneMorph); | ||||||
|   | |||||||
							
								
								
									
										81
									
								
								pom.xml
									
									
									
									
									
								
							
							
						
						
									
										81
									
								
								pom.xml
									
									
									
									
									
								
							| @@ -1,10 +1,11 @@ | |||||||
| <?xml version="1.0" encoding="UTF-8"?> | <?xml version="1.0" encoding="UTF-8"?> | ||||||
| <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||||||
|  |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> | ||||||
|     <modelVersion>4.0.0</modelVersion> |     <modelVersion>4.0.0</modelVersion> | ||||||
|     <groupId>org.apache.lucene.morphology</groupId> |     <groupId>org.apache.lucene.morphology</groupId> | ||||||
|     <artifactId>morphology</artifactId> |     <artifactId>morphology</artifactId> | ||||||
|     <packaging>pom</packaging> |     <packaging>pom</packaging> | ||||||
|     <version>1.1</version> |     <version>1.2-SNAPSHOT</version> | ||||||
|     <name>morphology</name> |     <name>morphology</name> | ||||||
|     <url>http://maven.apache.org</url> |     <url>http://maven.apache.org</url> | ||||||
|  |  | ||||||
| @@ -12,7 +13,7 @@ | |||||||
|         <connection>scm:git:https://github.com/AKuznetsov/russianmorphology.git</connection> |         <connection>scm:git:https://github.com/AKuznetsov/russianmorphology.git</connection> | ||||||
|         <developerConnection>scm:git:git@github.com:AKuznetsov/russianmorphology.git</developerConnection> |         <developerConnection>scm:git:git@github.com:AKuznetsov/russianmorphology.git</developerConnection> | ||||||
|         <url>https://github.com/AKuznetsov/russianmorphology</url> |         <url>https://github.com/AKuznetsov/russianmorphology</url> | ||||||
|         <tag>morphology-1.1</tag> |         <tag>HEAD</tag> | ||||||
|     </scm> |     </scm> | ||||||
|  |  | ||||||
|     <distributionManagement> |     <distributionManagement> | ||||||
| @@ -65,19 +66,7 @@ | |||||||
|         </repository> |         </repository> | ||||||
|     </repositories> |     </repositories> | ||||||
|  |  | ||||||
|     <pluginRepositories> |  | ||||||
|         <pluginRepository> |  | ||||||
|             <id>mc-release</id> |  | ||||||
|             <name>maven-license-plugin repository of releases</name> |  | ||||||
|             <url>http://mc-repo.googlecode.com/svn/maven2/releases</url> |  | ||||||
|             <snapshots> |  | ||||||
|                 <enabled>false</enabled> |  | ||||||
|             </snapshots> |  | ||||||
|             <releases> |  | ||||||
|                 <enabled>true</enabled> |  | ||||||
|             </releases> |  | ||||||
|         </pluginRepository> |  | ||||||
|     </pluginRepositories> |  | ||||||
|     <build> |     <build> | ||||||
|         <plugins> |         <plugins> | ||||||
|             <plugin> |             <plugin> | ||||||
| @@ -94,34 +83,41 @@ | |||||||
|                 <groupId>org.apache.maven.plugins</groupId> |                 <groupId>org.apache.maven.plugins</groupId> | ||||||
|                 <artifactId>maven-compiler-plugin</artifactId> |                 <artifactId>maven-compiler-plugin</artifactId> | ||||||
|                 <configuration> |                 <configuration> | ||||||
|                     <source>1.5</source> |                     <source>1.7</source> | ||||||
|                     <target>1.5</target> |                     <target>1.7</target> | ||||||
|                 </configuration> |                 </configuration> | ||||||
|             </plugin> |             </plugin> | ||||||
|             <plugin>                <!--                 usage: http://code.google.com/p/maven-license-plugin/wiki/HowTo                --> |             <!--<plugin>                <!–                 usage: http://code.google.com/p/maven-license-plugin/wiki/HowTo                –>--> | ||||||
|                 <artifactId>maven-license-plugin</artifactId> |  | ||||||
|                 <groupId>com.mathieucarbou.mojo</groupId> |                 <!--<groupId>com.mycila</groupId>--> | ||||||
|                 <configuration> |                 <!--<artifactId>license-maven-plugin</artifactId>--> | ||||||
|                     <basedir>${project.parent.basedir}</basedir> |                 <!--<version>2.11</version>--> | ||||||
|                     <header>etc/header.txt</header> |  | ||||||
|                     <excludes> |                 <!--<configuration>--> | ||||||
|                         <exclude>**/*.txt</exclude> |                     <!--<properties>--> | ||||||
|                         <exclude>**/*.info</exclude> |                         <!--<owner>Alexander Kuznetsov</owner>--> | ||||||
|                         <exclude>**/pom.xml</exclude> |                         <!--<!–<email>mathieu.carbou@gmail.com</email>–>--> | ||||||
|                     </excludes> |                     <!--</properties>--> | ||||||
|                     <includes> |                     <!--<basedir>${project.parent.basedir}</basedir>--> | ||||||
|                         <include>**/src/**</include> |                     <!--<header>etc/header.txt</header>--> | ||||||
|                     </includes> |                     <!--<excludes>--> | ||||||
|                 </configuration> |                         <!--<exclude>**/*.txt</exclude>--> | ||||||
|                 <executions> |                         <!--<exclude>**/*.info</exclude>--> | ||||||
|                     <execution> |                         <!--<exclude>**/pom.xml</exclude>--> | ||||||
|                         <phase>test</phase> |                     <!--</excludes>--> | ||||||
|                         <goals> |                     <!--<includes>--> | ||||||
|                             <goal>check</goal> |                         <!--<include>**/src/**</include>--> | ||||||
|                         </goals> |                     <!--</includes>--> | ||||||
|                     </execution> |                 <!--</configuration>--> | ||||||
|                 </executions> |                 <!--<executions>--> | ||||||
|             </plugin> |                     <!--<execution>--> | ||||||
|  |                         <!--<phase>test</phase>--> | ||||||
|  |                         <!--<goals>--> | ||||||
|  |                             <!--<goal>check</goal>--> | ||||||
|  |                         <!--</goals>--> | ||||||
|  |                     <!--</execution>--> | ||||||
|  |                 <!--</executions>--> | ||||||
|  |             <!--</plugin>--> | ||||||
|         </plugins> |         </plugins> | ||||||
|     </build> |     </build> | ||||||
|     <profiles> |     <profiles> | ||||||
| @@ -160,5 +156,6 @@ | |||||||
|         <module>dictionary-reader</module> |         <module>dictionary-reader</module> | ||||||
|         <module>russian</module> |         <module>russian</module> | ||||||
|         <module>english</module> |         <module>english</module> | ||||||
|  |         <module>context</module> | ||||||
|     </modules> |     </modules> | ||||||
| </project> | </project> | ||||||
| @@ -3,13 +3,13 @@ | |||||||
|     <parent> |     <parent> | ||||||
|         <artifactId>morphology</artifactId> |         <artifactId>morphology</artifactId> | ||||||
|         <groupId>org.apache.lucene.morphology</groupId> |         <groupId>org.apache.lucene.morphology</groupId> | ||||||
|         <version>1.1</version> |         <version>1.2-SNAPSHOT</version> | ||||||
|     </parent> |     </parent> | ||||||
|     <modelVersion>4.0.0</modelVersion> |     <modelVersion>4.0.0</modelVersion> | ||||||
|     <groupId>org.apache.lucene.morphology</groupId> |     <groupId>org.apache.lucene.morphology</groupId> | ||||||
|     <artifactId>russian</artifactId> |     <artifactId>russian</artifactId> | ||||||
|     <name>russian</name> |     <name>russian</name> | ||||||
|     <version>1.1</version> |     <version>1.2-SNAPSHOT</version> | ||||||
|     <url>http://maven.apache.org</url> |     <url>http://maven.apache.org</url> | ||||||
|     <dependencies> |     <dependencies> | ||||||
|  |  | ||||||
| @@ -17,7 +17,7 @@ | |||||||
|         <dependency> |         <dependency> | ||||||
|             <groupId>org.apache.lucene.morphology</groupId> |             <groupId>org.apache.lucene.morphology</groupId> | ||||||
|             <artifactId>morph</artifactId> |             <artifactId>morph</artifactId> | ||||||
|             <version>1.1</version> |             <version>1.2-SNAPSHOT</version> | ||||||
|         </dependency> |         </dependency> | ||||||
|  |  | ||||||
|         <dependency> |         <dependency> | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user