Compare commits
	
		
			2 Commits
		
	
	
		
			master
			...
			ambiguousl
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|   | f095cbe7c0 | ||
|   | 3b2e48821a | 
							
								
								
									
										35
									
								
								.github/workflows/main.yaml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										35
									
								
								.github/workflows/main.yaml
									
									
									
									
										vendored
									
									
								
							| @@ -1,35 +0,0 @@ | ||||
| name: Java CI | ||||
|  | ||||
| on: [push, pull_request] | ||||
|  | ||||
| jobs: | ||||
|   tests: | ||||
|     runs-on: ubuntu-latest | ||||
|  | ||||
|     steps: | ||||
|       - uses: actions/checkout@v2 | ||||
|       - name: Set up JDK 11 | ||||
|         uses: actions/setup-java@v2 | ||||
|         with: | ||||
|           java-version: '11' | ||||
|           distribution: 'adopt' | ||||
|       - name: Build with Maven | ||||
|         run: mvn --batch-mode --update-snapshots verify | ||||
|            | ||||
|   pack-artifacts: | ||||
|     runs-on: ubuntu-latest | ||||
|     needs: tests | ||||
|     if: github.ref == 'refs/heads/master' | ||||
|     steps: | ||||
|       - uses: actions/checkout@v2 | ||||
|       - name: Set up JDK 11 | ||||
|         uses: actions/setup-java@v2 | ||||
|         with: | ||||
|           java-version: '11' | ||||
|           distribution: 'adopt' | ||||
|       - name: Build with Maven | ||||
|         run: mvn --batch-mode --update-snapshots verify | ||||
|       - uses: actions/upload-artifact@v2 | ||||
|         with: | ||||
|           name: artifacts | ||||
|           path: ${{ github.workspace }}/*/target/*.jar | ||||
							
								
								
									
										202
									
								
								LICENSE
									
									
									
									
									
								
							
							
						
						
									
										202
									
								
								LICENSE
									
									
									
									
									
								
							| @@ -1,202 +0,0 @@ | ||||
|  | ||||
|                                  Apache License | ||||
|                            Version 2.0, January 2004 | ||||
|                         http://www.apache.org/licenses/ | ||||
|  | ||||
|    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION | ||||
|  | ||||
|    1. Definitions. | ||||
|  | ||||
|       "License" shall mean the terms and conditions for use, reproduction, | ||||
|       and distribution as defined by Sections 1 through 9 of this document. | ||||
|  | ||||
|       "Licensor" shall mean the copyright owner or entity authorized by | ||||
|       the copyright owner that is granting the License. | ||||
|  | ||||
|       "Legal Entity" shall mean the union of the acting entity and all | ||||
|       other entities that control, are controlled by, or are under common | ||||
|       control with that entity. For the purposes of this definition, | ||||
|       "control" means (i) the power, direct or indirect, to cause the | ||||
|       direction or management of such entity, whether by contract or | ||||
|       otherwise, or (ii) ownership of fifty percent (50%) or more of the | ||||
|       outstanding shares, or (iii) beneficial ownership of such entity. | ||||
|  | ||||
|       "You" (or "Your") shall mean an individual or Legal Entity | ||||
|       exercising permissions granted by this License. | ||||
|  | ||||
|       "Source" form shall mean the preferred form for making modifications, | ||||
|       including but not limited to software source code, documentation | ||||
|       source, and configuration files. | ||||
|  | ||||
|       "Object" form shall mean any form resulting from mechanical | ||||
|       transformation or translation of a Source form, including but | ||||
|       not limited to compiled object code, generated documentation, | ||||
|       and conversions to other media types. | ||||
|  | ||||
|       "Work" shall mean the work of authorship, whether in Source or | ||||
|       Object form, made available under the License, as indicated by a | ||||
|       copyright notice that is included in or attached to the work | ||||
|       (an example is provided in the Appendix below). | ||||
|  | ||||
|       "Derivative Works" shall mean any work, whether in Source or Object | ||||
|       form, that is based on (or derived from) the Work and for which the | ||||
|       editorial revisions, annotations, elaborations, or other modifications | ||||
|       represent, as a whole, an original work of authorship. For the purposes | ||||
|       of this License, Derivative Works shall not include works that remain | ||||
|       separable from, or merely link (or bind by name) to the interfaces of, | ||||
|       the Work and Derivative Works thereof. | ||||
|  | ||||
|       "Contribution" shall mean any work of authorship, including | ||||
|       the original version of the Work and any modifications or additions | ||||
|       to that Work or Derivative Works thereof, that is intentionally | ||||
|       submitted to Licensor for inclusion in the Work by the copyright owner | ||||
|       or by an individual or Legal Entity authorized to submit on behalf of | ||||
|       the copyright owner. For the purposes of this definition, "submitted" | ||||
|       means any form of electronic, verbal, or written communication sent | ||||
|       to the Licensor or its representatives, including but not limited to | ||||
|       communication on electronic mailing lists, source code control systems, | ||||
|       and issue tracking systems that are managed by, or on behalf of, the | ||||
|       Licensor for the purpose of discussing and improving the Work, but | ||||
|       excluding communication that is conspicuously marked or otherwise | ||||
|       designated in writing by the copyright owner as "Not a Contribution." | ||||
|  | ||||
|       "Contributor" shall mean Licensor and any individual or Legal Entity | ||||
|       on behalf of whom a Contribution has been received by Licensor and | ||||
|       subsequently incorporated within the Work. | ||||
|  | ||||
|    2. Grant of Copyright License. Subject to the terms and conditions of | ||||
|       this License, each Contributor hereby grants to You a perpetual, | ||||
|       worldwide, non-exclusive, no-charge, royalty-free, irrevocable | ||||
|       copyright license to reproduce, prepare Derivative Works of, | ||||
|       publicly display, publicly perform, sublicense, and distribute the | ||||
|       Work and such Derivative Works in Source or Object form. | ||||
|  | ||||
|    3. Grant of Patent License. Subject to the terms and conditions of | ||||
|       this License, each Contributor hereby grants to You a perpetual, | ||||
|       worldwide, non-exclusive, no-charge, royalty-free, irrevocable | ||||
|       (except as stated in this section) patent license to make, have made, | ||||
|       use, offer to sell, sell, import, and otherwise transfer the Work, | ||||
|       where such license applies only to those patent claims licensable | ||||
|       by such Contributor that are necessarily infringed by their | ||||
|       Contribution(s) alone or by combination of their Contribution(s) | ||||
|       with the Work to which such Contribution(s) was submitted. If You | ||||
|       institute patent litigation against any entity (including a | ||||
|       cross-claim or counterclaim in a lawsuit) alleging that the Work | ||||
|       or a Contribution incorporated within the Work constitutes direct | ||||
|       or contributory patent infringement, then any patent licenses | ||||
|       granted to You under this License for that Work shall terminate | ||||
|       as of the date such litigation is filed. | ||||
|  | ||||
|    4. Redistribution. You may reproduce and distribute copies of the | ||||
|       Work or Derivative Works thereof in any medium, with or without | ||||
|       modifications, and in Source or Object form, provided that You | ||||
|       meet the following conditions: | ||||
|  | ||||
|       (a) You must give any other recipients of the Work or | ||||
|           Derivative Works a copy of this License; and | ||||
|  | ||||
|       (b) You must cause any modified files to carry prominent notices | ||||
|           stating that You changed the files; and | ||||
|  | ||||
|       (c) You must retain, in the Source form of any Derivative Works | ||||
|           that You distribute, all copyright, patent, trademark, and | ||||
|           attribution notices from the Source form of the Work, | ||||
|           excluding those notices that do not pertain to any part of | ||||
|           the Derivative Works; and | ||||
|  | ||||
|       (d) If the Work includes a "NOTICE" text file as part of its | ||||
|           distribution, then any Derivative Works that You distribute must | ||||
|           include a readable copy of the attribution notices contained | ||||
|           within such NOTICE file, excluding those notices that do not | ||||
|           pertain to any part of the Derivative Works, in at least one | ||||
|           of the following places: within a NOTICE text file distributed | ||||
|           as part of the Derivative Works; within the Source form or | ||||
|           documentation, if provided along with the Derivative Works; or, | ||||
|           within a display generated by the Derivative Works, if and | ||||
|           wherever such third-party notices normally appear. The contents | ||||
|           of the NOTICE file are for informational purposes only and | ||||
|           do not modify the License. You may add Your own attribution | ||||
|           notices within Derivative Works that You distribute, alongside | ||||
|           or as an addendum to the NOTICE text from the Work, provided | ||||
|           that such additional attribution notices cannot be construed | ||||
|           as modifying the License. | ||||
|  | ||||
|       You may add Your own copyright statement to Your modifications and | ||||
|       may provide additional or different license terms and conditions | ||||
|       for use, reproduction, or distribution of Your modifications, or | ||||
|       for any such Derivative Works as a whole, provided Your use, | ||||
|       reproduction, and distribution of the Work otherwise complies with | ||||
|       the conditions stated in this License. | ||||
|  | ||||
|    5. Submission of Contributions. Unless You explicitly state otherwise, | ||||
|       any Contribution intentionally submitted for inclusion in the Work | ||||
|       by You to the Licensor shall be under the terms and conditions of | ||||
|       this License, without any additional terms or conditions. | ||||
|       Notwithstanding the above, nothing herein shall supersede or modify | ||||
|       the terms of any separate license agreement you may have executed | ||||
|       with Licensor regarding such Contributions. | ||||
|  | ||||
|    6. Trademarks. This License does not grant permission to use the trade | ||||
|       names, trademarks, service marks, or product names of the Licensor, | ||||
|       except as required for reasonable and customary use in describing the | ||||
|       origin of the Work and reproducing the content of the NOTICE file. | ||||
|  | ||||
|    7. Disclaimer of Warranty. Unless required by applicable law or | ||||
|       agreed to in writing, Licensor provides the Work (and each | ||||
|       Contributor provides its Contributions) on an "AS IS" BASIS, | ||||
|       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | ||||
|       implied, including, without limitation, any warranties or conditions | ||||
|       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A | ||||
|       PARTICULAR PURPOSE. You are solely responsible for determining the | ||||
|       appropriateness of using or redistributing the Work and assume any | ||||
|       risks associated with Your exercise of permissions under this License. | ||||
|  | ||||
|    8. Limitation of Liability. In no event and under no legal theory, | ||||
|       whether in tort (including negligence), contract, or otherwise, | ||||
|       unless required by applicable law (such as deliberate and grossly | ||||
|       negligent acts) or agreed to in writing, shall any Contributor be | ||||
|       liable to You for damages, including any direct, indirect, special, | ||||
|       incidental, or consequential damages of any character arising as a | ||||
|       result of this License or out of the use or inability to use the | ||||
|       Work (including but not limited to damages for loss of goodwill, | ||||
|       work stoppage, computer failure or malfunction, or any and all | ||||
|       other commercial damages or losses), even if such Contributor | ||||
|       has been advised of the possibility of such damages. | ||||
|  | ||||
|    9. Accepting Warranty or Additional Liability. While redistributing | ||||
|       the Work or Derivative Works thereof, You may choose to offer, | ||||
|       and charge a fee for, acceptance of support, warranty, indemnity, | ||||
|       or other liability obligations and/or rights consistent with this | ||||
|       License. However, in accepting such obligations, You may act only | ||||
|       on Your own behalf and on Your sole responsibility, not on behalf | ||||
|       of any other Contributor, and only if You agree to indemnify, | ||||
|       defend, and hold each Contributor harmless for any liability | ||||
|       incurred by, or claims asserted against, such Contributor by reason | ||||
|       of your accepting any such warranty or additional liability. | ||||
|  | ||||
|    END OF TERMS AND CONDITIONS | ||||
|  | ||||
|    APPENDIX: How to apply the Apache License to your work. | ||||
|  | ||||
|       To apply the Apache License to your work, attach the following | ||||
|       boilerplate notice, with the fields enclosed by brackets "[]" | ||||
|       replaced with your own identifying information. (Don't include | ||||
|       the brackets!)  The text should be enclosed in the appropriate | ||||
|       comment syntax for the file format. We also recommend that a | ||||
|       file or class name and description of purpose be included on the | ||||
|       same "printed page" as the copyright notice for easier | ||||
|       identification within third-party archives. | ||||
|  | ||||
|    Copyright [yyyy] [name of copyright owner] | ||||
|  | ||||
|    Licensed under the Apache License, Version 2.0 (the "License"); | ||||
|    you may not use this file except in compliance with the License. | ||||
|    You may obtain a copy of the License at | ||||
|  | ||||
|        http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
|    Unless required by applicable law or agreed to in writing, software | ||||
|    distributed under the License is distributed on an "AS IS" BASIS, | ||||
|    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|    See the License for the specific language governing permissions and | ||||
|    limitations under the License. | ||||
							
								
								
									
										64
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										64
									
								
								README.md
									
									
									
									
									
								
							| @@ -1,17 +1,54 @@ | ||||
| # Russian Morphology for Apache Lucene | ||||
| # Russian Morphology for lucene | ||||
|  | ||||
| Russian and English morphology for Java and [Apache Lucene](http://lucene.apache.org) 9.3 framework based on open source dictionary from site [АОТ](http://aot.ru). It uses dictionary base morphology with some heuristics for unknown words. It supports a homonym for example for a Russian word "вина" it gives two variants "вино" and "вина". | ||||
| Russian and English morphology for java and lucene 3.0 framework based on open source dictionary from site [АОТ](http://aot.ru). It uses dictionary-based morphology with some heuristics for unknown words. It supports homonyms: for example, for the Russian word "вина" it gives two variants, "вино" and "вина".  | ||||
|  | ||||
|  | ||||
| ### How to use | ||||
|  | ||||
| Build project, by running `mvn clean package`, this will provide you the latest versions of the artifacts - 1.5, add it to your classpath. You could select which version to use - Russian or English. | ||||
| First download  | ||||
| [morph-1.1.jar](https://bintray.com/artifact/download/akuznetsov/russianmorphology/org/apache/lucene/morphology/morph/1.1/morph-1.1.jar)   | ||||
| and add it to your class path. Then download the [Russian](https://bintray.com/artifact/download/akuznetsov/russianmorphology/org/apache/lucene/morphology/russian/1.1/russian-1.1.jar) or  | ||||
| [English](https://bintray.com/artifact/download/akuznetsov/russianmorphology/org/apache/lucene/morphology/english/1.1/english-1.1.jar) package.  | ||||
|  | ||||
| If you use maven you can add dependency  | ||||
|  | ||||
|         <dependency> | ||||
|             <groupId>org.apache.lucene.morphology</groupId> | ||||
|             <artifactId>russian</artifactId> | ||||
|             <version>1.1</version> | ||||
|         </dependency> | ||||
|  | ||||
|  | ||||
|         <dependency> | ||||
|             <groupId>org.apache.lucene.morphology</groupId> | ||||
|             <artifactId>english</artifactId> | ||||
|             <version>1.1</version> | ||||
|         </dependency> | ||||
|  | ||||
| Don't forget to add a link to the repository: | ||||
|  | ||||
|  | ||||
|     <repositories> | ||||
|     ............... | ||||
|       <repository> | ||||
|         <snapshots> | ||||
|           <enabled>false</enabled> | ||||
|         </snapshots> | ||||
|         <id>bintray-akuznetsov-russianmorphology</id> | ||||
|         <name>bintray</name> | ||||
|         <url>http://dl.bintray.com/akuznetsov/russianmorphology</url> | ||||
|       </repository> | ||||
|     </repositories> | ||||
|  | ||||
|  | ||||
|  | ||||
| Now you can create a Lucene Analyzer  | ||||
|  | ||||
| Now you can create a Lucene Analyzer: | ||||
|  | ||||
|       RussianAnalayzer russian = new RussianAnalayzer(); | ||||
|       EnglishAnalayzer english = new EnglishAnalayzer(); | ||||
|  | ||||
|  | ||||
| You can write your own analyzer using a filter that converts a word into its right forms.  | ||||
|  | ||||
|       LuceneMorphology luceneMorph = new EnglishLuceneMorphology(); | ||||
| @@ -25,28 +62,9 @@ Also if you need get a list of base forms of word, you can use following example | ||||
|      LuceneMorphology luceneMorph = new EnglishLuceneMorphology(); | ||||
|      List<String> wordBaseForms = luceneMorph.getMorphInfo(word); | ||||
|  | ||||
| ### Solr | ||||
|  | ||||
| You can use the LuceneMorphology as morphology filter in a Solr _schema.xml_ using a **MorphologyFilterFactory:** | ||||
|  | ||||
| ```xml | ||||
| <fieldType name="content" class="solr.TextField" positionIncrementGap="100"> | ||||
|       <analyzer> | ||||
|         <tokenizer class="solr.StandardTokenizerFactory"/> | ||||
| 		<filter class="org.apache.lucene.analysis.morphology.MorphologyFilterFactory" language="Russian"/> | ||||
| 		<filter class="org.apache.lucene.analysis.morphology.MorphologyFilterFactory" language="English"/> | ||||
|       </analyzer> | ||||
| </fieldType> | ||||
| ``` | ||||
|  | ||||
| Just add _morphology-1.5.jar_ in your Solr lib-directories | ||||
|  | ||||
| ### Restrictions | ||||
|    | ||||
|   * It works only with UTF-8. | ||||
|   * It assumes that the letters е and ё are the same. | ||||
|   * Word forms with prefixes like "наибольший" are treated as separate words.  | ||||
|  | ||||
| ### License | ||||
|  | ||||
| Apache License, Version 2.0 | ||||
|   | ||||
| @@ -1,40 +1,36 @@ | ||||
| <?xml version="1.0" encoding="UTF-8"?> | ||||
| <project xmlns="http://maven.apache.org/POM/4.0.0" | ||||
|          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||||
|          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||||
|     <parent> | ||||
|         <artifactId>morphology</artifactId> | ||||
|         <groupId>org.apache.lucene.morphology</groupId> | ||||
|         <version>1.5</version> | ||||
|     </parent> | ||||
| <?xml version="1.0"?> | ||||
| <project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" | ||||
|          xmlns="http://maven.apache.org/POM/4.0.0" | ||||
|          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> | ||||
|     <modelVersion>4.0.0</modelVersion> | ||||
| 
 | ||||
|     <groupId>org.apache.lucene.analysis</groupId> | ||||
|     <parent> | ||||
|         <groupId>org.apache.lucene.morphology</groupId> | ||||
|         <artifactId>morphology</artifactId> | ||||
|     <name>solr-morphology-analysis</name> | ||||
|     <version>${morphology.version}</version> | ||||
|         <version>1.2-SNAPSHOT</version> | ||||
|     </parent> | ||||
|     <groupId>org.apache.lucene.morphology</groupId> | ||||
|     <artifactId>context</artifactId> | ||||
|     <version>1.0-SNAPSHOT</version> | ||||
|     <name>context</name> | ||||
|     <url>http://maven.apache.org</url> | ||||
| 
 | ||||
|     <dependencies> | ||||
| 
 | ||||
|         <dependency> | ||||
|             <groupId>junit</groupId> | ||||
|             <artifactId>junit</artifactId> | ||||
|             <version>4.8.2</version> | ||||
|             <scope>test</scope> | ||||
|         </dependency> | ||||
|         <dependency> | ||||
|             <groupId>org.apache.lucene.morphology</groupId> | ||||
|             <artifactId>russian</artifactId> | ||||
|             <version>${morphology.version}</version> | ||||
|             <version>1.2-SNAPSHOT</version> | ||||
|             <scope>test</scope> | ||||
|         </dependency> | ||||
|         <dependency> | ||||
|             <groupId>org.apache.lucene.morphology</groupId> | ||||
|             <artifactId>english</artifactId> | ||||
|             <version>${morphology.version}</version> | ||||
|         </dependency> | ||||
| 
 | ||||
|         <dependency> | ||||
|             <groupId>junit</groupId> | ||||
|             <artifactId>junit</artifactId> | ||||
|             <version>${junit.version}</version> | ||||
|             <version>1.2-SNAPSHOT</version> | ||||
|             <scope>test</scope> | ||||
|         </dependency> | ||||
| 
 | ||||
|     </dependencies> | ||||
| 
 | ||||
| </project> | ||||
| @@ -0,0 +1,52 @@ | ||||
| /** | ||||
|  * Copyright 2015 Alexander Kuznetsov | ||||
|  * | ||||
|  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
|  * you may not use this file except in compliance with the License. | ||||
|  * You may obtain a copy of the License at | ||||
|  * | ||||
|  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an "AS IS" BASIS, | ||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|  * See the License for the specific language governing permissions and | ||||
|  * limitations under the License. | ||||
|  */ | ||||
| package org.apache.lucene.morphology.context; | ||||
|  | ||||
| import org.apache.lucene.analysis.Analyzer; | ||||
| import org.apache.lucene.analysis.TokenStream; | ||||
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||||
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | ||||
|  | ||||
| import java.io.ByteArrayInputStream; | ||||
| import java.io.IOException; | ||||
| import java.io.InputStreamReader; | ||||
| import java.util.ArrayList; | ||||
| import java.util.LinkedList; | ||||
| import java.util.List; | ||||
|  | ||||
| public class CalculateContextItem { | ||||
|  | ||||
|     public List<ContextItem> createContextItems(String text) throws IOException { | ||||
|         Analyzer statAnalyzer = new StatAnalyzer(); | ||||
|         InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год? - и что я жду тебя, где вино".getBytes()), "UTF-8"); | ||||
|  | ||||
|  | ||||
| //        new RussianMorphology(); | ||||
|  | ||||
|         TokenStream tokenStream = statAnalyzer.tokenStream(null, reader); | ||||
|         tokenStream.reset(); | ||||
|  | ||||
|         List<List<String>> listedLink =  new LinkedList<>(); | ||||
|         while (tokenStream.incrementToken()) { | ||||
|             CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class); | ||||
|             PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class); | ||||
|  | ||||
|  | ||||
|         } | ||||
|  | ||||
|         return null; | ||||
|     } | ||||
| } | ||||
| @@ -0,0 +1,80 @@ | ||||
| /** | ||||
|  * Copyright 2015 Alexander Kuznetsov | ||||
|  * | ||||
|  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
|  * you may not use this file except in compliance with the License. | ||||
|  * You may obtain a copy of the License at | ||||
|  * | ||||
|  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an "AS IS" BASIS, | ||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|  * See the License for the specific language governing permissions and | ||||
|  * limitations under the License. | ||||
|  */ | ||||
| package org.apache.lucene.morphology.context; | ||||
|  | ||||
| import java.util.Arrays; | ||||
|  | ||||
| public class ContextItem implements Comparable<ContextItem> { | ||||
|     String[][] morphInfo; | ||||
|  | ||||
|     public ContextItem(String[][] morphInfo) { | ||||
|         this.morphInfo = morphInfo; | ||||
|     } | ||||
|  | ||||
|     public String[][] getMorphInfo() { | ||||
|         return morphInfo; | ||||
|     } | ||||
|  | ||||
|     public void setMorphInfo(String[][] morphInfo) { | ||||
|         this.morphInfo = morphInfo; | ||||
|     } | ||||
|  | ||||
|     public int hashCode() { | ||||
|         int h = 0; | ||||
|         for (String[] m : morphInfo) { | ||||
|             for (String s : m) { | ||||
|                 h = 31 * h + s.hashCode(); | ||||
|             } | ||||
|         } | ||||
|         return h; | ||||
|     } | ||||
|  | ||||
|  | ||||
|     @Override | ||||
|     public boolean equals(Object o) { | ||||
|         if (this == o) return true; | ||||
|         if (o == null || getClass() != o.getClass()) return false; | ||||
|  | ||||
|         ContextItem that = (ContextItem) o; | ||||
|  | ||||
|         if (that.morphInfo.length != this.morphInfo.length) { | ||||
|             return false; | ||||
|         } | ||||
|         for (int i = 0; i < morphInfo.length; i++) { | ||||
|             if (!Arrays.equals(morphInfo[i], that.morphInfo[i])) { | ||||
|                 return false; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         return true; | ||||
|     } | ||||
|  | ||||
|  | ||||
|     @Override | ||||
|     public int compareTo(ContextItem o) { | ||||
|         int i = o.morphInfo.length - morphInfo.length; | ||||
|         if (i != 0) return i; | ||||
|         for (int j = 0; j < morphInfo.length; j++) { | ||||
|             i = o.morphInfo[j].length - morphInfo[j].length; | ||||
|             if (i != 0) return i; | ||||
|             for (int k = 0; k < morphInfo[j].length; k++) { | ||||
|                 i = morphInfo[j][k].compareTo(o.morphInfo[j][k]); | ||||
|                 if (i != 0) return i; | ||||
|             } | ||||
|         } | ||||
|         return 0; | ||||
|     } | ||||
| } | ||||
| @@ -0,0 +1,37 @@ | ||||
| /** | ||||
|  * Copyright 2015 Alexander Kuznetsov | ||||
|  * | ||||
|  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
|  * you may not use this file except in compliance with the License. | ||||
|  * You may obtain a copy of the License at | ||||
|  * | ||||
|  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an "AS IS" BASIS, | ||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|  * See the License for the specific language governing permissions and | ||||
|  * limitations under the License. | ||||
|  */ | ||||
| package org.apache.lucene.morphology.context; | ||||
|  | ||||
/**
 * Mutable pair of a string array ({@code morphInfo}) and a {@code double}
 * ({@code prob}). Both values start at their Java defaults and are stored and
 * returned without copying.
 */
public class ContextStats {
    double prob;
    String[] morphInfo;

    /** @return the current {@code prob} value */
    public double getProb() {
        return this.prob;
    }

    /** @param prob the new {@code prob} value */
    public void setProb(double prob) {
        this.prob = prob;
    }

    /** @return the stored array itself, not a copy */
    public String[] getMorphInfo() {
        return this.morphInfo;
    }

    /** @param morphInfo the array to store; kept by reference */
    public void setMorphInfo(String[] morphInfo) {
        this.morphInfo = morphInfo;
    }
}
| @@ -0,0 +1,21 @@ | ||||
| /** | ||||
|  * Copyright 2015 Alexander Kuznetsov | ||||
|  * | ||||
|  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
|  * you may not use this file except in compliance with the License. | ||||
|  * You may obtain a copy of the License at | ||||
|  * | ||||
|  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an "AS IS" BASIS, | ||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|  * See the License for the specific language governing permissions and | ||||
|  * limitations under the License. | ||||
|  */ | ||||
| package org.apache.lucene.morphology.context; | ||||
|  | ||||
/**
 * Empty class: it declares no fields or methods yet.
 *
 * <p>NOTE(review): the name looks like a misspelling of "ProbCalculator"; it is
 * kept as is because renaming a public class would break existing references.
 */
public class ProbClalucator {
}
| @@ -0,0 +1,116 @@ | ||||
| /** | ||||
|  * Copyright 2015 Alexander Kuznetsov | ||||
|  * | ||||
|  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
|  * you may not use this file except in compliance with the License. | ||||
|  * You may obtain a copy of the License at | ||||
|  * | ||||
|  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an "AS IS" BASIS, | ||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|  * See the License for the specific language governing permissions and | ||||
|  * limitations under the License. | ||||
|  */ | ||||
| package org.apache.lucene.morphology.context; | ||||
|  | ||||
| import org.apache.lucene.analysis.Tokenizer; | ||||
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||||
| import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl; | ||||
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | ||||
| import org.apache.lucene.util.AttributeFactory; | ||||
|  | ||||
| import java.io.BufferedReader; | ||||
| import java.io.IOException; | ||||
| import java.util.Arrays; | ||||
| import java.util.HashSet; | ||||
| import java.util.LinkedList; | ||||
| import java.util.Set; | ||||
|  | ||||
| /** | ||||
|  * Tokenizer that reads its whole input eagerly and splits it on the characters listed in | ||||
|  * {@link #SEPARATION_LETTERS}. Separators that are also listed in {@link #MEANING_CHARS} are | ||||
|  * emitted as single-character tokens of their own. A '.' counts as a separator only when the | ||||
|  * character after it is a separator; otherwise it stays inside the surrounding term. | ||||
|  * Every token is emitted with a position increment of 1. | ||||
|  */ | ||||
| public class SimpleTokenizer extends Tokenizer { | ||||
|  | ||||
|     private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); | ||||
|     private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class); | ||||
|     /** Tokens that were split off but not returned yet; {@code null} until the input is read. */ | ||||
|     LinkedList<String> terms; | ||||
|  | ||||
|     /** Characters that terminate the term being collected. */ | ||||
|     public static final Set<Character> SEPARATION_LETTERS = new HashSet<>(Arrays.asList(' ', '(', ')', ',', '|', '\t', | ||||
|             '\n', '"', ':', '!', '?', ';', '•')); | ||||
|  | ||||
|     /** Separator characters that are themselves emitted as one-character tokens. */ | ||||
|     public static final Set<Character> MEANING_CHARS = new HashSet<>(Arrays.asList('(', ')', ',', '|', | ||||
|             '"', ':', '!', '?', ';', '•', '.')); | ||||
|  | ||||
|     public SimpleTokenizer() { | ||||
|     } | ||||
|  | ||||
|     public SimpleTokenizer(AttributeFactory factory) { | ||||
|         super(factory); | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Moves to the next token, reading and splitting the whole input on the first call. | ||||
|      * | ||||
|      * @return {@code true} if a token was produced, {@code false} once all tokens are consumed | ||||
|      * @throws IOException if reading the input fails | ||||
|      */ | ||||
|     @Override | ||||
|     public final boolean incrementToken() throws IOException { | ||||
|         // A tokenizer has to wipe the attribute state left over from the previous token. | ||||
|         clearAttributes(); | ||||
|         if (terms == null) { | ||||
|             createTerms(); | ||||
|         } | ||||
|         String next = terms.poll(); | ||||
|         if (next == null) { | ||||
|             return false; | ||||
|         } | ||||
|         termAtt.setEmpty(); | ||||
|         termAtt.append(next); | ||||
|         posAtt.setPositionIncrement(1); | ||||
|         return true; | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Reads the complete input and fills {@link #terms} with the tokens in text order. | ||||
|      * | ||||
|      * @throws IOException if reading the input fails | ||||
|      */ | ||||
|     private void createTerms() throws IOException { | ||||
|         terms = new LinkedList<>(); | ||||
|  | ||||
|         // Every line gets a trailing space, so the text always ends with a separator and the | ||||
|         // last term is flushed by the loop below. | ||||
|         BufferedReader br = new BufferedReader(input); | ||||
|         StringBuilder text = new StringBuilder(); | ||||
|         String line; | ||||
|         while ((line = br.readLine()) != null) { | ||||
|             text.append(line).append(' '); | ||||
|         } | ||||
|  | ||||
|         String s = text.toString(); | ||||
|         StringBuilder currentTerm = new StringBuilder(); | ||||
|         for (int i = 0; i < s.length(); i++) { | ||||
|             if (!isSeparator(s, i)) { | ||||
|                 currentTerm.append(s.charAt(i)); | ||||
|                 continue; | ||||
|             } | ||||
|             // Flush the pending term before the separator so tokens keep their text order. | ||||
|             if (currentTerm.length() > 0) { | ||||
|                 terms.add(currentTerm.toString()); | ||||
|                 currentTerm.setLength(0); | ||||
|             } | ||||
|             if (hasMeaning(s, i)) { | ||||
|                 terms.add(s.substring(i, i + 1)); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /** Tells whether the character at index {@code i} must be emitted as a token of its own. */ | ||||
|     private boolean hasMeaning(String s, int i) { | ||||
|         return MEANING_CHARS.contains(s.charAt(i)); | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Tells whether the character at index {@code i} ends the current term: either it is listed | ||||
|      * in {@link #SEPARATION_LETTERS}, or it is a '.' directly followed by such a character. | ||||
|      */ | ||||
|     private boolean isSeparator(String s, int i) { | ||||
|         char c = s.charAt(i); | ||||
|         if (SEPARATION_LETTERS.contains(c)) { | ||||
|             return true; | ||||
|         } | ||||
|         return '.' == c | ||||
|                 && s.length() > i + 1 | ||||
|                 && SEPARATION_LETTERS.contains(s.charAt(i + 1)); | ||||
|     } | ||||
|  | ||||
|     /** Drops the buffered tokens so the next {@link #incrementToken()} re-reads the input. */ | ||||
|     @Override | ||||
|     public void reset() throws IOException { | ||||
|         this.terms = null; | ||||
|         super.reset(); | ||||
|     } | ||||
|  | ||||
| } | ||||
| @@ -0,0 +1,34 @@ | ||||
| package org.apache.lucene.morphology.context; | ||||
|  | ||||
| import org.apache.lucene.analysis.Analyzer; | ||||
| import org.apache.lucene.analysis.TokenFilter; | ||||
| import org.apache.lucene.analysis.core.LowerCaseFilter; | ||||
| import org.apache.lucene.analysis.standard.StandardFilter; | ||||
|  | ||||
| import java.io.IOException; | ||||
| import java.io.Reader; | ||||
|  | ||||
| /** | ||||
|  * Analyzer that tokenizes with {@link SimpleTokenizer} and then passes the tokens through | ||||
|  * {@link StandardFilter} and {@link LowerCaseFilter}. | ||||
|  */ | ||||
| public class StatAnalyzer extends Analyzer { | ||||
|  | ||||
|     /** | ||||
|      * Builds the chain SimpleTokenizer -> StandardFilter -> LowerCaseFilter. | ||||
|      * | ||||
|      * @param s not used; the same chain is built for every value | ||||
|      * @return the tokenizer together with the end of the filter chain | ||||
|      */ | ||||
|     @Override | ||||
|     protected TokenStreamComponents createComponents(String s) { | ||||
|         SimpleTokenizer src = new SimpleTokenizer(); | ||||
|         TokenFilter filter = new LowerCaseFilter(new StandardFilter(src)); | ||||
|  | ||||
|         // Plain TokenStreamComponents is enough: nothing here customizes setReader. | ||||
|         return new TokenStreamComponents(src, filter); | ||||
|     } | ||||
|  | ||||
| } | ||||
| @@ -0,0 +1,32 @@ | ||||
| package org.apache.lucene.morphology.context; | ||||
|  | ||||
|  | ||||
| import org.apache.lucene.analysis.Analyzer; | ||||
| import org.apache.lucene.analysis.TokenStream; | ||||
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||||
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | ||||
| import org.junit.Test; | ||||
|  | ||||
| import java.io.ByteArrayInputStream; | ||||
| import java.io.IOException; | ||||
| import java.io.InputStreamReader; | ||||
|  | ||||
| public class SimpleTokenizerTest { | ||||
|  | ||||
|     @Test | ||||
|     public void testSimpleTokenizer() throws IOException { | ||||
|         Analyzer statAnalyzer = new StatAnalyzer(); | ||||
|         InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год? - и что я жду тебя, где вино".getBytes()), "UTF-8"); | ||||
|  | ||||
|         TokenStream tokenStream = statAnalyzer.tokenStream(null, reader); | ||||
|         tokenStream.reset(); | ||||
|  | ||||
|         boolean wordSeen = false; | ||||
|         while (tokenStream.incrementToken()) { | ||||
|             CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class); | ||||
|             PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class); | ||||
|             System.out.println(charTerm.toString()); | ||||
|         } | ||||
|     } | ||||
|  | ||||
| } | ||||
| @@ -3,26 +3,27 @@ | ||||
|     <parent> | ||||
|         <artifactId>morphology</artifactId> | ||||
|         <groupId>org.apache.lucene.morphology</groupId> | ||||
|         <version>1.5</version> | ||||
|         <version>1.2-SNAPSHOT</version> | ||||
|     </parent> | ||||
|     <modelVersion>4.0.0</modelVersion> | ||||
|     <groupId>org.apache.lucene.morphology</groupId> | ||||
|     <artifactId>dictionary-reader</artifactId> | ||||
|     <name>dictionary-reader</name> | ||||
|     <version>1.5</version> | ||||
|     <version>1.2-SNAPSHOT</version> | ||||
|     <url>http://maven.apache.org</url> | ||||
|  | ||||
|     <dependencies> | ||||
|         <dependency> | ||||
|             <groupId>org.apache.lucene.morphology</groupId> | ||||
|             <artifactId>russian</artifactId> | ||||
|             <version>1.5</version> | ||||
|             <version>1.2-SNAPSHOT</version> | ||||
|         </dependency> | ||||
|  | ||||
|  | ||||
|         <dependency> | ||||
|             <groupId>org.apache.lucene.morphology</groupId> | ||||
|             <artifactId>english</artifactId> | ||||
|             <version>1.5</version> | ||||
|             <version>1.2-SNAPSHOT</version> | ||||
|         </dependency> | ||||
|     </dependencies> | ||||
|  | ||||
|   | ||||
| @@ -22,19 +22,20 @@ import java.io.FileInputStream; | ||||
| import java.io.IOException; | ||||
| import java.io.InputStreamReader; | ||||
| import java.util.ArrayList; | ||||
| import java.util.HashSet; | ||||
| import java.util.List; | ||||
| import java.util.Set; | ||||
|  | ||||
|  | ||||
| /** | ||||
|  * This class contain logic how read | ||||
|  * dictionary and produce word with it all forms. | ||||
|  * dictonary and produce word with it all forms. | ||||
|  */ | ||||
| public class DictionaryReader { | ||||
|     private String fileName; | ||||
|     private String fileEncoding = "windows-1251"; | ||||
|     private List<List<FlexiaModel>> wordsFlexias = new ArrayList<>(); | ||||
|     private Set<String> ignoredForm; | ||||
|     private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>(); | ||||
|     private Set<String> ignoredForm = new HashSet<String>(); | ||||
|  | ||||
|     public DictionaryReader(String fileName, Set<String> ignoredForm) { | ||||
|         this.fileName = fileName; | ||||
| @@ -54,7 +55,7 @@ public class DictionaryReader { | ||||
|  | ||||
|     private void readWords(BufferedReader reader, WordProcessor wordProcessor) throws IOException { | ||||
|         String s = reader.readLine(); | ||||
|         int count = Integer.parseInt(s); | ||||
|         int count = Integer.valueOf(s); | ||||
|         int actual = 0; | ||||
|         for (int i = 0; i < count; i++) { | ||||
|             s = reader.readLine(); | ||||
| @@ -78,7 +79,7 @@ public class DictionaryReader { | ||||
|         String wordBase = wd[0].toLowerCase(); | ||||
|         if (wordBase.startsWith("-")) return null; | ||||
|         wordBase = "#".equals(wordBase) ? "" : wordBase; | ||||
|         List<FlexiaModel> models = wordsFlexias.get(Integer.parseInt(wd[1])); | ||||
|         List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1])); | ||||
|         FlexiaModel flexiaModel = models.get(0); | ||||
|         if (models.size() == 0 || ignoredForm.contains(flexiaModel.getCode())) { | ||||
|             return null; | ||||
| @@ -95,7 +96,7 @@ public class DictionaryReader { | ||||
|  | ||||
|     private void skipBlock(BufferedReader reader) throws IOException { | ||||
|         String s = reader.readLine(); | ||||
|         int count = Integer.parseInt(s); | ||||
|         int count = Integer.valueOf(s); | ||||
|         for (int i = 0; i < count; i++) { | ||||
|             reader.readLine(); | ||||
|         } | ||||
| @@ -104,7 +105,7 @@ public class DictionaryReader { | ||||
|  | ||||
|     private void readPrefix(BufferedReader reader) throws IOException { | ||||
|         String s = reader.readLine(); | ||||
|         int count = Integer.parseInt(s); | ||||
|         int count = Integer.valueOf(s); | ||||
|         for (int i = 0; i < count; i++) { | ||||
|             reader.readLine(); | ||||
|         } | ||||
| @@ -112,10 +113,10 @@ public class DictionaryReader { | ||||
|  | ||||
|     private void readFlexias(BufferedReader reader) throws IOException { | ||||
|         String s = reader.readLine(); | ||||
|         int count = Integer.parseInt(s); | ||||
|         int count = Integer.valueOf(s); | ||||
|         for (int i = 0; i < count; i++) { | ||||
|             s = reader.readLine(); | ||||
|             ArrayList<FlexiaModel> flexiaModelArrayList = new ArrayList<>(); | ||||
|             ArrayList<FlexiaModel> flexiaModelArrayList = new ArrayList<FlexiaModel>(); | ||||
|             wordsFlexias.add(flexiaModelArrayList); | ||||
|             for (String line : s.split("%")) { | ||||
|                 addFlexia(flexiaModelArrayList, line); | ||||
|   | ||||
| @@ -16,8 +16,6 @@ | ||||
|  | ||||
| package org.apache.lucene.morphology.dictionary; | ||||
|  | ||||
| import java.util.Objects; | ||||
|  | ||||
| /** | ||||
|  * Represent information of how word form created form it imutible part. | ||||
|  */ | ||||
| @@ -76,9 +74,11 @@ public class FlexiaModel { | ||||
|  | ||||
|         FlexiaModel that = (FlexiaModel) o; | ||||
|  | ||||
|         if (!Objects.equals(code, that.code)) return false; | ||||
|         if (!Objects.equals(prefix, that.prefix)) return false; | ||||
|         return Objects.equals(suffix, that.suffix); | ||||
|         if (code != null ? !code.equals(that.code) : that.code != null) return false; | ||||
|         if (prefix != null ? !prefix.equals(that.prefix) : that.prefix != null) return false; | ||||
|         if (suffix != null ? !suffix.equals(that.suffix) : that.suffix != null) return false; | ||||
|  | ||||
|         return true; | ||||
|     } | ||||
|  | ||||
|     @Override | ||||
|   | ||||
| @@ -29,8 +29,8 @@ import java.util.Map; | ||||
| public class GrammarReader { | ||||
|     private String fileName; | ||||
|     private String fileEncoding = "windows-1251"; | ||||
|     private List<String> grammarInfo = new ArrayList<>(); | ||||
|     private Map<String, Integer> inverseIndex = new HashMap<>(); | ||||
|     private List<String> grammarInfo = new ArrayList<String>(); | ||||
|     private Map<String, Integer> inverseIndex = new HashMap<String, Integer>(); | ||||
|  | ||||
|     public GrammarReader(String fileName) throws IOException { | ||||
|         this.fileName = fileName; | ||||
| @@ -50,7 +50,7 @@ public class GrammarReader { | ||||
|             line = line.trim(); | ||||
|             if (!line.startsWith("//") && line.length() > 0) { | ||||
|                 String[] strings = line.split(" ", 2); | ||||
|                 int i = grammarInfo.size(); | ||||
|                 Integer i = grammarInfo.size(); | ||||
|                 inverseIndex.put(strings[0], i); | ||||
|                 grammarInfo.add(i, strings[1]); | ||||
|             } | ||||
| @@ -63,7 +63,7 @@ public class GrammarReader { | ||||
|     } | ||||
|  | ||||
|     public String[] getGrammarInfoAsArray() { | ||||
|         return grammarInfo.toArray(new String[0]); | ||||
|         return grammarInfo.toArray(new String[grammarInfo.size()]); | ||||
|     } | ||||
|  | ||||
|     public Map<String, Integer> getGrammarInverseIndex() { | ||||
|   | ||||
| @@ -15,7 +15,7 @@ | ||||
|  */ | ||||
| package org.apache.lucene.morphology.dictionary; | ||||
|  | ||||
| import java.util.Collections; | ||||
| import java.util.Arrays; | ||||
| import java.util.LinkedList; | ||||
| import java.util.List; | ||||
|  | ||||
| @@ -29,7 +29,7 @@ public class RemoveFlexiaWithPrefixes extends WordFilter { | ||||
|     @Override | ||||
|     public List<WordCard> transform(WordCard wordCard) { | ||||
|  | ||||
|         List<FlexiaModel> flexiaModelsToRemove = new LinkedList<>(); | ||||
|         List<FlexiaModel> flexiaModelsToRemove = new LinkedList<FlexiaModel>(); | ||||
|         for (FlexiaModel fm : wordCard.getWordsForms()) { | ||||
|             if (fm.getPrefix().length() > 0) { | ||||
|                 flexiaModelsToRemove.add(fm); | ||||
| @@ -39,6 +39,6 @@ public class RemoveFlexiaWithPrefixes extends WordFilter { | ||||
|             wordCard.removeFlexia(fm); | ||||
|         } | ||||
|  | ||||
|         return new LinkedList<>(Collections.singletonList(wordCard)); | ||||
|         return new LinkedList<WordCard>(Arrays.asList(wordCard)); | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -32,13 +32,13 @@ public class RussianAdvSplitterFilter extends WordFilter { | ||||
|  | ||||
|     @Override | ||||
|     public List<WordCard> transform(WordCard wordCard) { | ||||
|         LinkedList<WordCard> result = new LinkedList<>(); | ||||
|         LinkedList<WordCard> result = new LinkedList<WordCard>(); | ||||
|         result.add(wordCard); | ||||
|  | ||||
|         String baseWord = ""; | ||||
|         String canonicalForm = ""; | ||||
|         String canonicalSuffix = ""; | ||||
|         List<FlexiaModel> flexiaModels = new LinkedList<>(); | ||||
|         List<FlexiaModel> flexiaModels = new LinkedList<FlexiaModel>(); | ||||
|         for (FlexiaModel flexiaModel : wordCard.getWordsForms()) { | ||||
|             if (flexiaModel.getPrefix().length() > 0) { | ||||
|                 flexiaModels.add(new FlexiaModel(flexiaModel.getCode(), flexiaModel.getSuffix(), "")); | ||||
|   | ||||
| @@ -27,9 +27,9 @@ import java.util.*; | ||||
|  | ||||
| //todo made refactoring this class | ||||
| public class StatisticsCollector implements WordProcessor { | ||||
|     private TreeMap<String, Set<Heuristic>> inverseIndex = new TreeMap<>(); | ||||
|     private Map<Set<Heuristic>, Integer> ruleInverseIndex = new HashMap<>(); | ||||
|     private List<Set<Heuristic>> rules = new ArrayList<>(); | ||||
|     private TreeMap<String, Set<Heuristic>> inverseIndex = new TreeMap<String, Set<Heuristic>>(); | ||||
|     private Map<Set<Heuristic>, Integer> ruleInverseIndex = new HashMap<Set<Heuristic>, Integer>(); | ||||
|     private List<Set<Heuristic>> rules = new ArrayList<Set<Heuristic>>(); | ||||
|     private GrammarReader grammarReader; | ||||
|     private LetterDecoderEncoder decoderEncoder; | ||||
|  | ||||
| @@ -39,14 +39,18 @@ public class StatisticsCollector implements WordProcessor { | ||||
|         this.decoderEncoder = decoderEncoder; | ||||
|     } | ||||
|  | ||||
|     public void process(WordCard wordCard) { | ||||
|     public void process(WordCard wordCard) throws IOException { | ||||
|         cleanWordCard(wordCard); | ||||
|         String normalStringMorph = wordCard.getWordsForms().get(0).getCode(); | ||||
|  | ||||
|         for (FlexiaModel fm : wordCard.getWordsForms()) { | ||||
|             Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph); | ||||
|             String form = revertWord(fm.create(wordCard.getBase())); | ||||
|             Set<Heuristic> suffixHeuristics = inverseIndex.computeIfAbsent(form, k -> new HashSet<>()); | ||||
|             Set<Heuristic> suffixHeuristics = inverseIndex.get(form); | ||||
|             if (suffixHeuristics == null) { | ||||
|                 suffixHeuristics = new HashSet<Heuristic>(); | ||||
|                 inverseIndex.put(form, suffixHeuristics); | ||||
|             } | ||||
|             suffixHeuristics.add(heuristic); | ||||
|         } | ||||
|     } | ||||
| @@ -65,7 +69,7 @@ public class StatisticsCollector implements WordProcessor { | ||||
|  | ||||
|     public void saveHeuristic(String fileName) throws IOException { | ||||
|  | ||||
|         Map<Integer, Integer> dist = new TreeMap<>(); | ||||
|         Map<Integer, Integer> dist = new TreeMap<Integer, Integer>(); | ||||
|         Set<Heuristic> prevSet = null; | ||||
|         int count = 0; | ||||
|         for (String key : inverseIndex.keySet()) { | ||||
| @@ -116,11 +120,11 @@ public class StatisticsCollector implements WordProcessor { | ||||
|     } | ||||
|  | ||||
|     private String revertWord(String s) { | ||||
|         StringBuilder result = new StringBuilder(); | ||||
|         String result = ""; | ||||
|         for (int i = 1; i <= s.length(); i++) { | ||||
|             result.append(s.charAt(s.length() - i)); | ||||
|             result += s.charAt(s.length() - i); | ||||
|         } | ||||
|         return result.toString(); | ||||
|         return result; | ||||
|     } | ||||
|  | ||||
|  | ||||
| @@ -128,15 +132,15 @@ public class StatisticsCollector implements WordProcessor { | ||||
|         String form = fm.create(wordBase); | ||||
|         String normalForm = wordBase + canonicalSuffix; | ||||
|         Integer length = getCommonLength(form, normalForm); | ||||
|         int actualSuffixLengh = form.length() - length; | ||||
|         Integer actualSuffixLengh = form.length() - length; | ||||
|         String actualNormalSuffix = normalForm.substring(length); | ||||
|         Integer integer = grammarReader.getGrammarInverseIndex().get(fm.getCode()); | ||||
|         Integer nf = grammarReader.getGrammarInverseIndex().get(normalSuffixForm); | ||||
|         return new Heuristic((byte) actualSuffixLengh, actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue()); | ||||
|         return new Heuristic((byte) actualSuffixLengh.intValue(), actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue()); | ||||
|     } | ||||
|  | ||||
|     public static Integer getCommonLength(String s1, String s2) { | ||||
|         int length = Math.min(s1.length(), s2.length()); | ||||
|         Integer length = Math.min(s1.length(), s2.length()); | ||||
|         for (int i = 0; i < length; i++) { | ||||
|             if (s1.charAt(i) != s2.charAt(i)) return i; | ||||
|         } | ||||
|   | ||||
| @@ -26,7 +26,7 @@ public class WordCard { | ||||
|     private String canonicalForm; | ||||
|     private String base; | ||||
|     private String canonicalSuffix; | ||||
|     private List<FlexiaModel> wordsForms = new ArrayList<>(); | ||||
|     private List<FlexiaModel> wordsForms = new ArrayList<FlexiaModel>(); | ||||
|  | ||||
|     public WordCard(String canonicalForm, String base, String canonicalSuffix) { | ||||
|         this.canonicalForm = canonicalForm; | ||||
|   | ||||
| @@ -17,6 +17,7 @@ package org.apache.lucene.morphology.dictionary; | ||||
|  | ||||
| import org.apache.lucene.morphology.LetterDecoderEncoder; | ||||
|  | ||||
| import java.util.Arrays; | ||||
| import java.util.Collections; | ||||
| import java.util.LinkedList; | ||||
| import java.util.List; | ||||
| @@ -37,7 +38,7 @@ public class WordCleaner extends WordFilter { | ||||
|         if (word.contains("-")) return Collections.emptyList(); | ||||
|         if (!decoderEncoder.checkString(word)) return Collections.emptyList(); | ||||
|  | ||||
|         List<FlexiaModel> flexiaModelsToRemove = new LinkedList<>(); | ||||
|         List<FlexiaModel> flexiaModelsToRemove = new LinkedList<FlexiaModel>(); | ||||
|         for (FlexiaModel fm : wordCard.getWordsForms()) { | ||||
|             if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) { | ||||
|                 flexiaModelsToRemove.add(fm); | ||||
| @@ -47,6 +48,6 @@ public class WordCleaner extends WordFilter { | ||||
|             wordCard.removeFlexia(fm); | ||||
|         } | ||||
|  | ||||
|         return new LinkedList<>(Collections.singletonList(wordCard)); | ||||
|         return new LinkedList<WordCard>(Arrays.asList(wordCard)); | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -23,5 +23,5 @@ import java.io.IOException; | ||||
|  */ | ||||
| public interface WordProcessor { | ||||
|  | ||||
|     void process(WordCard wordCard) throws IOException; | ||||
|     public void process(WordCard wordCard) throws IOException; | ||||
| } | ||||
|   | ||||
| @@ -17,7 +17,7 @@ package org.apache.lucene.morphology.dictionary; | ||||
|  | ||||
| import org.apache.lucene.morphology.LetterDecoderEncoder; | ||||
|  | ||||
| import java.util.Collections; | ||||
| import java.util.Arrays; | ||||
| import java.util.LinkedList; | ||||
| import java.util.List; | ||||
|  | ||||
| @@ -42,7 +42,7 @@ public class WordStringCleaner extends WordFilter { | ||||
|             //made correct code | ||||
|             m.setCode(m.getCode().substring(0, 2)); | ||||
|         } | ||||
|         return new LinkedList<>(Collections.singletonList(wordCard)); | ||||
|         return new LinkedList<WordCard>(Arrays.asList(wordCard)); | ||||
|     } | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -29,7 +29,7 @@ public class EnglishHeuristicBuilder { | ||||
|         GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/egramtab.tab"); | ||||
|         EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder(); | ||||
|  | ||||
|         DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<>()); | ||||
|         DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>()); | ||||
|  | ||||
|         StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder); | ||||
|         WordCleaner wordCleaner = new WordCleaner(decoderEncoder, statisticsCollector); | ||||
|   | ||||
| @@ -28,7 +28,7 @@ public class RussianHeuristicBuilder { | ||||
|         GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/rgramtab.tab"); | ||||
|         RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder(); | ||||
|  | ||||
|         DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<>()); | ||||
|         DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>()); | ||||
|  | ||||
|         StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder); | ||||
|         WordCleaner wordCleaner = new WordCleaner(decoderEncoder, statisticsCollector); | ||||
|   | ||||
| @@ -23,7 +23,6 @@ import org.apache.lucene.morphology.english.EnglishMorphology; | ||||
| import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder; | ||||
| import org.apache.lucene.morphology.russian.RussianLuceneMorphology; | ||||
| import org.apache.lucene.morphology.russian.RussianMorphology; | ||||
| import org.hamcrest.MatcherAssert; | ||||
| import org.junit.Before; | ||||
| import org.junit.Test; | ||||
|  | ||||
| @@ -34,6 +33,7 @@ import java.util.Map; | ||||
| import java.util.concurrent.atomic.AtomicLong; | ||||
|  | ||||
| import static org.hamcrest.Matchers.hasItem; | ||||
| import static org.junit.Assert.assertThat; | ||||
|  | ||||
|  | ||||
| public class TestAllWords { | ||||
| @@ -73,20 +73,22 @@ public class TestAllWords { | ||||
|         final List<String> morphInfo = grammarInfo.getGrammarInfo(); | ||||
|         final Map<String, Integer> inversIndex = grammarInfo.getGrammarInverseIndex(); | ||||
|  | ||||
|         DictionaryReader dictionaryReader = new DictionaryReader(pathToDict, new HashSet<>()); | ||||
|         DictionaryReader dictionaryReader = new DictionaryReader(pathToDict, new HashSet<String>()); | ||||
|  | ||||
|         final AtomicLong wordCount = new AtomicLong(0); | ||||
|         long startTime = System.currentTimeMillis(); | ||||
|         Long startTime = System.currentTimeMillis(); | ||||
|  | ||||
|         WordProcessor wordProcessor = wordCard -> { | ||||
|         WordProcessor wordProcessor = new WordProcessor() { | ||||
|             public void process(WordCard wordCard) throws IOException { | ||||
|                 String word = wordCard.getBase() + wordCard.getCanonicalSuffix(); | ||||
|                 for (FlexiaModel fm : wordCard.getWordsForms()) { | ||||
|                     String wordForm = wordCard.getBase() + fm.getSuffix(); | ||||
|                     String morph = morphInfo.get(inversIndex.get(fm.getCode())); | ||||
|                 MatcherAssert.assertThat(morphology.getMorphInfo(wordForm), hasItem(word + "|" + morph)); | ||||
|                 MatcherAssert.assertThat(morphology.getNormalForms(wordForm), hasItem(word)); | ||||
|                     assertThat(morphology.getMorphInfo(wordForm), hasItem(word + "|" + morph)); | ||||
|                     assertThat(morphology.getNormalForms(wordForm), hasItem(word)); | ||||
|                     wordCount.set(2L + wordCount.get()); | ||||
|                 } | ||||
|             } | ||||
|         }; | ||||
|  | ||||
|         WordCleaner wordCleaner = new WordCleaner(decoderEncoder, wordProcessor); | ||||
| @@ -121,16 +123,18 @@ public class TestAllWords { | ||||
|  | ||||
|     private void testAllWordForLucene(final LuceneMorphology morphology, LetterDecoderEncoder decoderEncoder, String pathToDic) throws IOException { | ||||
|         final AtomicLong wordCount = new AtomicLong(0); | ||||
|         long startTime = System.currentTimeMillis(); | ||||
|         Long startTime = System.currentTimeMillis(); | ||||
|  | ||||
|         DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet<>()); | ||||
|         WordProcessor wordProcessor = wordCard -> { | ||||
|         DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet<String>()); | ||||
|         WordProcessor wordProcessor = new WordProcessor() { | ||||
|             public void process(WordCard wordCard) throws IOException { | ||||
|                 String word = wordCard.getBase() + wordCard.getCanonicalSuffix(); | ||||
|                 for (FlexiaModel fm : wordCard.getWordsForms()) { | ||||
|                     String wordForm = wordCard.getBase() + fm.getSuffix(); | ||||
|                 MatcherAssert.assertThat(morphology.getNormalForms(wordForm), hasItem(word)); | ||||
|                     assertThat(morphology.getNormalForms(wordForm), hasItem(word)); | ||||
|                     wordCount.set(1L + wordCount.get()); | ||||
|                 } | ||||
|             } | ||||
|         }; | ||||
|  | ||||
|         WordCleaner wordCleaner = new WordCleaner(decoderEncoder, wordProcessor); | ||||
|   | ||||
| @@ -16,12 +16,6 @@ | ||||
| package org.apache.lucene.morphology; | ||||
| 
 | ||||
| import org.apache.lucene.analysis.Analyzer; | ||||
| import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; | ||||
| import org.apache.lucene.analysis.CharArraySet; | ||||
| import org.apache.lucene.analysis.LowerCaseFilter; | ||||
| import org.apache.lucene.analysis.TokenFilter; | ||||
| import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; | ||||
| import org.apache.lucene.analysis.standard.StandardTokenizer; | ||||
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||||
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | ||||
| import org.apache.lucene.morphology.analyzer.MorphologyAnalyzer; | ||||
| @@ -31,17 +25,16 @@ import org.apache.lucene.analysis.TokenStream; | ||||
| import org.apache.lucene.morphology.english.EnglishLuceneMorphology; | ||||
| import org.apache.lucene.morphology.russian.RussianAnalyzer; | ||||
| import org.apache.lucene.morphology.russian.RussianLuceneMorphology; | ||||
| import org.hamcrest.MatcherAssert; | ||||
| import org.junit.Test; | ||||
| 
 | ||||
| import java.io.*; | ||||
| import java.nio.charset.StandardCharsets; | ||||
| import java.util.*; | ||||
| 
 | ||||
| import static org.hamcrest.Matchers.equalTo; | ||||
| import static org.junit.Assert.assertThat; | ||||
| 
 | ||||
| 
 | ||||
| public class TestAnalyzers extends BaseTokenStreamTestCase { | ||||
| public class AnalyzersTest { | ||||
| 
 | ||||
|     @Test | ||||
|     public void shouldGiveCorrectWordsForEnglish() throws IOException { | ||||
| @@ -67,24 +60,24 @@ public class TestAnalyzers extends BaseTokenStreamTestCase { | ||||
|         LuceneMorphology englishLuceneMorphology = new EnglishLuceneMorphology(); | ||||
| 
 | ||||
|         MorphologyAnalyzer russianAnalyzer = new MorphologyAnalyzer(russianLuceneMorphology); | ||||
|         InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("тест пм тест".getBytes()), StandardCharsets.UTF_8); | ||||
|         InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("тест пм тест".getBytes()), "UTF-8"); | ||||
|         TokenStream stream = russianAnalyzer.tokenStream(null, reader); | ||||
|         MorphologyFilter englishFilter = new MorphologyFilter(stream, englishLuceneMorphology); | ||||
| 
 | ||||
|         englishFilter.reset(); | ||||
|         while (englishFilter.incrementToken()) { | ||||
|             System.out.println(englishFilter); | ||||
|             System.out.println(englishFilter.toString()); | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     @Test | ||||
|     public void shouldProvideCorrectIndentForWordWithMelitaForm() throws IOException { | ||||
|         Analyzer morphlogyAnalyzer = new RussianAnalyzer(); | ||||
|         InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год".getBytes()), StandardCharsets.UTF_8); | ||||
|         InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год".getBytes()), "UTF-8"); | ||||
| 
 | ||||
|         TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader); | ||||
|         tokenStream.reset(); | ||||
|         Set<String> foromsOfWine = new HashSet<>(); | ||||
|         Set<String> foromsOfWine = new HashSet<String>(); | ||||
|         foromsOfWine.add("вина"); | ||||
|         foromsOfWine.add("винo"); | ||||
|         boolean wordSeen = false; | ||||
| @@ -92,7 +85,7 @@ public class TestAnalyzers extends BaseTokenStreamTestCase { | ||||
|             CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class); | ||||
|             PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class); | ||||
|             if(foromsOfWine.contains(charTerm.toString()) && wordSeen){ | ||||
|                 MatcherAssert.assertThat(position.getPositionIncrement(),equalTo(0)); | ||||
|                 assertThat(position.getPositionIncrement(),equalTo(0)); | ||||
|             } | ||||
|             if(foromsOfWine.contains(charTerm.toString())){ | ||||
|                 wordSeen = true; | ||||
| @@ -102,18 +95,18 @@ public class TestAnalyzers extends BaseTokenStreamTestCase { | ||||
| 
 | ||||
|     private void testAnalayzer(Analyzer morphlogyAnalyzer, String answerPath, String testPath) throws IOException { | ||||
|         InputStream stream = this.getClass().getResourceAsStream(answerPath); | ||||
|         BufferedReader breader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)); | ||||
|         BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); | ||||
|         String[] strings = breader.readLine().replaceAll(" +", " ").trim().split(" "); | ||||
|         HashSet<String> answer = new HashSet<>(Arrays.asList(strings)); | ||||
|         HashSet<String> answer = new HashSet<String>(Arrays.asList(strings)); | ||||
|         stream.close(); | ||||
| 
 | ||||
|         stream = this.getClass().getResourceAsStream(testPath); | ||||
| 
 | ||||
|         InputStreamReader reader = new InputStreamReader(stream, StandardCharsets.UTF_8); | ||||
|         InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); | ||||
| 
 | ||||
|         TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader); | ||||
|         tokenStream.reset(); | ||||
|         HashSet<String> result = new HashSet<>(); | ||||
|         HashSet<String> result = new HashSet<String>(); | ||||
|         while (tokenStream.incrementToken()) { | ||||
|             CharTermAttribute attribute1 = tokenStream.getAttribute(CharTermAttribute.class); | ||||
|             result.add(attribute1.toString()); | ||||
| @@ -121,45 +114,6 @@ public class TestAnalyzers extends BaseTokenStreamTestCase { | ||||
| 
 | ||||
|         stream.close(); | ||||
| 
 | ||||
|         MatcherAssert.assertThat(result, equalTo(answer)); | ||||
|     } | ||||
| 
 | ||||
|     @Test | ||||
|     public void testPositionIncrement() throws IOException { | ||||
|         EnglishAnalyzer englishAnalyzer = new EnglishAnalyzer(); | ||||
|         assertTokenStreamContents( | ||||
|                 englishAnalyzer.tokenStream("test", "There are tests!"), | ||||
|                 new String[]{"there", "are", "be", "test"}, | ||||
|                 new int[]{0, 6, 6, 10}, | ||||
|                 new int[]{5, 9, 9, 15}, | ||||
|                 new String[]{"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>"}, | ||||
|                 new int[]{1, 1, 0, 1} | ||||
|         ); | ||||
|     } | ||||
| 
 | ||||
|     @Test | ||||
|     public void testKeywordHandling() throws IOException { | ||||
|         Analyzer analyzer = new EnglishKeywordTestAnalyzer(); | ||||
|         assertTokenStreamContents( | ||||
|                 analyzer.tokenStream("test", "Tests shouldn't be stemmed, but tests should!"), | ||||
|                 new String[]{"tests", "shouldn't", "be", "stem", "but", "test", "shall"} | ||||
|         ); | ||||
|     } | ||||
| 
 | ||||
|     private static class EnglishKeywordTestAnalyzer extends Analyzer { | ||||
|         @Override | ||||
|         protected TokenStreamComponents createComponents(String s) { | ||||
|             StandardTokenizer src = new StandardTokenizer(); | ||||
|             CharArraySet dontStem = new CharArraySet(1, false); | ||||
|             dontStem.add("Tests"); | ||||
|             TokenFilter filter = new SetKeywordMarkerFilter(src, dontStem); | ||||
|             filter = new LowerCaseFilter(filter); | ||||
|             try { | ||||
|                 filter = new MorphologyFilter(filter, new EnglishLuceneMorphology()); | ||||
|             } catch (IOException ex) { | ||||
|                 throw new RuntimeException("cannot create EnglishLuceneMorphology", ex); | ||||
|             } | ||||
|             return new TokenStreamComponents(src, filter); | ||||
|         } | ||||
|         assertThat(result, equalTo(answer)); | ||||
|     } | ||||
| } | ||||
| @@ -17,23 +17,22 @@ package org.apache.lucene.morphology; | ||||
| 
 | ||||
| import org.apache.lucene.morphology.russian.RussianLuceneMorphology; | ||||
| import org.apache.lucene.morphology.english.EnglishLuceneMorphology; | ||||
| import org.hamcrest.MatcherAssert; | ||||
| import org.junit.Test; | ||||
| 
 | ||||
| import java.io.BufferedReader; | ||||
| import java.io.IOException; | ||||
| import java.io.InputStream; | ||||
| import java.io.InputStreamReader; | ||||
| import java.nio.charset.StandardCharsets; | ||||
| import java.util.Arrays; | ||||
| import java.util.HashSet; | ||||
| import java.util.List; | ||||
| import java.util.Set; | ||||
| 
 | ||||
| import static org.hamcrest.CoreMatchers.equalTo; | ||||
| import static org.junit.Assert.assertThat; | ||||
| 
 | ||||
| 
 | ||||
| public class TestLuceneMorph { | ||||
| public class LuceneMorphTest { | ||||
| 
 | ||||
|     @Test | ||||
|     public void englishMorphologyShouldGetCorrectNormalForm() throws IOException { | ||||
| @@ -53,13 +52,14 @@ public class TestLuceneMorph { | ||||
| 
 | ||||
|     private void testMorphology(LuceneMorphology luceneMorph, String pathToTestData) throws IOException { | ||||
|         InputStream stream = this.getClass().getResourceAsStream(pathToTestData); | ||||
|         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)); | ||||
|         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); | ||||
|         String s = bufferedReader.readLine(); | ||||
|         while (s != null) { | ||||
|             String[] qa = s.trim().split(" "); | ||||
|             Set<String> result = new HashSet<>(Arrays.asList(qa).subList(1, qa.length)); | ||||
|             Set<String> stringList = new HashSet<>(luceneMorph.getNormalForms(qa[0])); | ||||
|             MatcherAssert.assertThat(stringList, equalTo(result)); | ||||
|             Set<String> result = new HashSet<String>(); | ||||
|             result.addAll(Arrays.asList(qa).subList(1, qa.length)); | ||||
|             Set<String> stringList = new HashSet<String>(luceneMorph.getNormalForms(qa[0])); | ||||
|             assertThat(stringList, equalTo(result)); | ||||
|             s = bufferedReader.readLine(); | ||||
|         } | ||||
|     } | ||||
| @@ -3,20 +3,27 @@ | ||||
|     <parent> | ||||
|         <artifactId>morphology</artifactId> | ||||
|         <groupId>org.apache.lucene.morphology</groupId> | ||||
|         <version>1.5</version> | ||||
|         <version>1.2-SNAPSHOT</version> | ||||
|     </parent> | ||||
|     <modelVersion>4.0.0</modelVersion> | ||||
|     <groupId>org.apache.lucene.morphology</groupId> | ||||
|     <artifactId>english</artifactId> | ||||
|     <name>english</name> | ||||
|     <version>1.5</version> | ||||
|     <version>1.2-SNAPSHOT</version> | ||||
|     <url>http://maven.apache.org</url> | ||||
|     <dependencies> | ||||
|  | ||||
|         <dependency> | ||||
|             <groupId>org.apache.lucene.morphology</groupId> | ||||
|             <artifactId>morph</artifactId> | ||||
|             <version>1.5</version> | ||||
|             <version>1.2-SNAPSHOT</version> | ||||
|         </dependency> | ||||
|  | ||||
|         <dependency> | ||||
|             <groupId>junit</groupId> | ||||
|             <artifactId>junit</artifactId> | ||||
|             <version>4.8.2</version> | ||||
|             <scope>test</scope> | ||||
|         </dependency> | ||||
|     </dependencies> | ||||
| </project> | ||||
| @@ -32,7 +32,7 @@ public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder { | ||||
|         if (string.length() > 6) throw new SuffixToLongException("Suffix length should not be greater then " + 12); | ||||
|         int result = 0; | ||||
|         for (int i = 0; i < string.length(); i++) { | ||||
|             int c = string.charAt(i) - ENGLISH_SMALL_LETTER_OFFSET; | ||||
|             int c = 0 + string.charAt(i) - ENGLISH_SMALL_LETTER_OFFSET; | ||||
|             if (c == 45 - ENGLISH_SMALL_LETTER_OFFSET) { | ||||
|                 c = DASH_CODE; | ||||
|             } | ||||
| @@ -48,7 +48,7 @@ public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder { | ||||
|  | ||||
|     public int[] encodeToArray(String s) { | ||||
|  | ||||
|         ArrayList<Integer> integers = new ArrayList<>(); | ||||
|         ArrayList<Integer> integers = new ArrayList<Integer>(); | ||||
|         while (s.length() > 6) { | ||||
|             integers.add(encode(s.substring(0, 6))); | ||||
|             s = s.substring(6); | ||||
| @@ -64,16 +64,16 @@ public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder { | ||||
|     } | ||||
|  | ||||
|     public String decodeArray(int[] array) { | ||||
|         StringBuilder result = new StringBuilder(); | ||||
|         String result = ""; | ||||
|         for (int i : array) { | ||||
|             result.append(decode(i)); | ||||
|             result += decode(i); | ||||
|         } | ||||
|         return result.toString(); | ||||
|         return result; | ||||
|     } | ||||
|  | ||||
|  | ||||
|     public String decode(Integer suffixN) { | ||||
|         StringBuilder result = new StringBuilder(); | ||||
|         String result = ""; | ||||
|         while (suffixN > 27) { | ||||
|             int c = suffixN % 28 + ENGLISH_SMALL_LETTER_OFFSET; | ||||
|             if (c == ENGLISH_SMALL_LETTER_OFFSET) { | ||||
| @@ -81,20 +81,21 @@ public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder { | ||||
|                 continue; | ||||
|             } | ||||
|             if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR; | ||||
|             result.insert(0, (char) c); | ||||
|             result = (char) c + result; | ||||
|             suffixN /= 28; | ||||
|         } | ||||
|         long c = suffixN + ENGLISH_SMALL_LETTER_OFFSET; | ||||
|         if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR; | ||||
|         result.insert(0, (char) c); | ||||
|         return result.toString(); | ||||
|         result = (char) c + result; | ||||
|         return result; | ||||
|     } | ||||
|  | ||||
|     public boolean checkCharacter(char c) { | ||||
|         int code = c; | ||||
|         int code = 0 + c; | ||||
|         if (code == 45) return true; | ||||
|         code -= ENGLISH_SMALL_LETTER_OFFSET; | ||||
|         return code > 0 && code < 27; | ||||
|         if (code > 0 && code < 27) return true; | ||||
|         return false; | ||||
|     } | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -16,8 +16,7 @@ | ||||
| package org.apache.lucene.morphology.english; | ||||
|  | ||||
| import static org.hamcrest.core.IsEqual.equalTo; | ||||
|  | ||||
| import org.hamcrest.MatcherAssert; | ||||
| import static org.junit.Assert.assertThat; | ||||
| import org.junit.Before; | ||||
|  | ||||
|  | ||||
| @@ -31,11 +30,11 @@ public class EnglishLetterDecoderEncoderTest { | ||||
|  | ||||
|     @org.junit.Test | ||||
|     public void testDecodeEncodeToArray() { | ||||
|         MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("abcdefghijklmnopqrstuvwxyz")), equalTo("abcdefghijklmnopqrstuvwxyz")); | ||||
|         MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("xyz")), equalTo("xyz")); | ||||
|         MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrty")), equalTo("ytrrty")); | ||||
|         MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrtyz")), equalTo("ytrrtyz")); | ||||
|         MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrtyzqwqwe")), equalTo("ytrrtyzqwqwe")); | ||||
|         assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("abcdefghijklmnopqrstuvwxyz")), equalTo("abcdefghijklmnopqrstuvwxyz")); | ||||
|         assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("xyz")), equalTo("xyz")); | ||||
|         assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrty")), equalTo("ytrrty")); | ||||
|         assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrtyz")), equalTo("ytrrtyz")); | ||||
|         assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrtyzqwqwe")), equalTo("ytrrtyzqwqwe")); | ||||
|  | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -16,9 +16,9 @@ | ||||
| package org.apache.lucene.morphology.english.stemmer; | ||||
|  | ||||
| import org.apache.lucene.morphology.english.EnglishLuceneMorphology; | ||||
| import org.hamcrest.MatcherAssert; | ||||
| import org.junit.Test; | ||||
| import static org.hamcrest.core.IsEqual.equalTo; | ||||
| import static org.junit.Assert.assertThat; | ||||
|  | ||||
|  | ||||
| public class EnglishStemmerTest { | ||||
| @@ -26,24 +26,24 @@ public class EnglishStemmerTest { | ||||
|     public void testGetStemmedWord() throws Exception { | ||||
|         EnglishLuceneMorphology englishLuceneMorphology = new EnglishLuceneMorphology(); | ||||
|         EnglishStemmer englishStemmer = new EnglishStemmer(englishLuceneMorphology); | ||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("running"),equalTo("run")); | ||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("run"),equalTo("run")); | ||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("killed"),equalTo("kill")); | ||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("kill"),equalTo("kill")); | ||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("networking"),equalTo("network")); | ||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("network"),equalTo("network")); | ||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("statistics"),equalTo("statistic")); | ||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("statistic"),equalTo("statistic")); | ||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("stats"),equalTo("stat")); | ||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("stat"),equalTo("stat")); | ||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("countries"),equalTo("country")); | ||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("country"),equalTo("country")); | ||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("delete"),equalTo("delete")); | ||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("ended"),equalTo("end")); | ||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("end"),equalTo("end")); | ||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("ends"),equalTo("end")); | ||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("given"),equalTo("give")); | ||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("give"),equalTo("give")); | ||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("log4j"),equalTo("log4j")); | ||||
|         assertThat(englishStemmer.getStemmedWord("running"),equalTo("run")); | ||||
|         assertThat(englishStemmer.getStemmedWord("run"),equalTo("run")); | ||||
|         assertThat(englishStemmer.getStemmedWord("killed"),equalTo("kill")); | ||||
|         assertThat(englishStemmer.getStemmedWord("kill"),equalTo("kill")); | ||||
|         assertThat(englishStemmer.getStemmedWord("networking"),equalTo("network")); | ||||
|         assertThat(englishStemmer.getStemmedWord("network"),equalTo("network")); | ||||
|         assertThat(englishStemmer.getStemmedWord("statistics"),equalTo("statistic")); | ||||
|         assertThat(englishStemmer.getStemmedWord("statistic"),equalTo("statistic")); | ||||
|         assertThat(englishStemmer.getStemmedWord("stats"),equalTo("stat")); | ||||
|         assertThat(englishStemmer.getStemmedWord("stat"),equalTo("stat")); | ||||
|         assertThat(englishStemmer.getStemmedWord("countries"),equalTo("country")); | ||||
|         assertThat(englishStemmer.getStemmedWord("country"),equalTo("country")); | ||||
|         assertThat(englishStemmer.getStemmedWord("delete"),equalTo("delete")); | ||||
|         assertThat(englishStemmer.getStemmedWord("ended"),equalTo("end")); | ||||
|         assertThat(englishStemmer.getStemmedWord("end"),equalTo("end")); | ||||
|         assertThat(englishStemmer.getStemmedWord("ends"),equalTo("end")); | ||||
|         assertThat(englishStemmer.getStemmedWord("given"),equalTo("give")); | ||||
|         assertThat(englishStemmer.getStemmedWord("give"),equalTo("give")); | ||||
|         assertThat(englishStemmer.getStemmedWord("log4j"),equalTo("log4j")); | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
| Copyright 2009 Alexander Kuznetsov  | ||||
| Copyright ${project.inceptionYear} ${owner} | ||||
|  | ||||
| Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| you may not use this file except in compliance with the License. | ||||
|   | ||||
| @@ -3,12 +3,13 @@ | ||||
|     <parent> | ||||
|         <artifactId>morphology</artifactId> | ||||
|         <groupId>org.apache.lucene.morphology</groupId> | ||||
|         <version>1.5</version> | ||||
|         <version>1.2-SNAPSHOT</version> | ||||
|     </parent> | ||||
|     <modelVersion>4.0.0</modelVersion> | ||||
|     <groupId>org.apache.lucene.morphology</groupId> | ||||
|     <artifactId>morph</artifactId> | ||||
|     <name>morph</name> | ||||
|     <version>1.5</version> | ||||
|     <version>1.2-SNAPSHOT</version> | ||||
|     <url>http://maven.apache.org</url> | ||||
|  | ||||
| </project> | ||||
|   | ||||
| @@ -21,7 +21,7 @@ import java.util.ArrayList; | ||||
|  | ||||
| public abstract class BaseLetterDecoderEncoder implements LetterDecoderEncoder { | ||||
|     public int[] encodeToArray(String s) { | ||||
|         ArrayList<Integer> integers = new ArrayList<>(); | ||||
|         ArrayList<Integer> integers = new ArrayList<Integer>(); | ||||
|         while (s.length() > 6) { | ||||
|             integers.add(encode(s.substring(0, 6))); | ||||
|             s = s.substring(6); | ||||
| @@ -37,11 +37,11 @@ public abstract class BaseLetterDecoderEncoder implements LetterDecoderEncoder { | ||||
|     } | ||||
|  | ||||
|     public String decodeArray(int[] array) { | ||||
|         StringBuilder result = new StringBuilder(); | ||||
|         String result = ""; | ||||
|         for (int i : array) { | ||||
|             result.append(decode(i)); | ||||
|             result += decode(i); | ||||
|         } | ||||
|         return result.toString(); | ||||
|         return result; | ||||
|     } | ||||
|  | ||||
|     public boolean checkString(String word) { | ||||
|   | ||||
| @@ -16,7 +16,6 @@ | ||||
| package org.apache.lucene.morphology; | ||||
|  | ||||
| import java.io.Serializable; | ||||
| import java.util.Objects; | ||||
|  | ||||
|  | ||||
| public class Heuristic implements Serializable { | ||||
| @@ -27,10 +26,10 @@ public class Heuristic implements Serializable { | ||||
|  | ||||
|     public Heuristic(String s) { | ||||
|         String[] strings = s.split("\\|"); | ||||
|         actualSuffixLength = Byte.parseByte(strings[0]); | ||||
|         actualSuffixLength = Byte.valueOf(strings[0]); | ||||
|         actualNormalSuffix = strings[1]; | ||||
|         formMorphInfo = Short.parseShort(strings[2]); | ||||
|         normalFormMorphInfo = Short.parseShort(strings[3]); | ||||
|         formMorphInfo = Short.valueOf(strings[2]); | ||||
|         normalFormMorphInfo = Short.valueOf(strings[3]); | ||||
|     } | ||||
|  | ||||
|     public Heuristic(byte actualSuffixLength, String actualNormalSuffix, short formMorphInfo, short normalFormMorphInfo) { | ||||
| @@ -71,12 +70,15 @@ public class Heuristic implements Serializable { | ||||
|         if (actualSuffixLength != heuristic.actualSuffixLength) return false; | ||||
|         if (formMorphInfo != heuristic.formMorphInfo) return false; | ||||
|         if (normalFormMorphInfo != heuristic.normalFormMorphInfo) return false; | ||||
|         return Objects.equals(actualNormalSuffix, heuristic.actualNormalSuffix); | ||||
|         if (actualNormalSuffix != null ? !actualNormalSuffix.equals(heuristic.actualNormalSuffix) : heuristic.actualNormalSuffix != null) | ||||
|             return false; | ||||
|  | ||||
|         return true; | ||||
|     } | ||||
|  | ||||
|     @Override | ||||
|     public int hashCode() { | ||||
|         int result = actualSuffixLength; | ||||
|         int result = (int) actualSuffixLength; | ||||
|         result = 31 * result + (actualNormalSuffix != null ? actualNormalSuffix.hashCode() : 0); | ||||
|         result = 31 * result + (int) formMorphInfo; | ||||
|         result = 31 * result + (int) normalFormMorphInfo; | ||||
|   | ||||
| @@ -17,17 +17,17 @@ package org.apache.lucene.morphology; | ||||
|  | ||||
|  | ||||
| public interface LetterDecoderEncoder { | ||||
|     Integer encode(String string); | ||||
|     public Integer encode(String string); | ||||
|  | ||||
|     int[] encodeToArray(String s); | ||||
|     public int[] encodeToArray(String s); | ||||
|  | ||||
|     String decodeArray(int[] array); | ||||
|     public String decodeArray(int[] array); | ||||
|  | ||||
|     String decode(Integer suffixN); | ||||
|     public String decode(Integer suffixN); | ||||
|  | ||||
|     boolean checkCharacter(char c); | ||||
|     public boolean checkCharacter(char c); | ||||
|  | ||||
|     boolean checkString(String word); | ||||
|     public boolean checkString(String word); | ||||
|  | ||||
|     String cleanString(String s); | ||||
|     public String cleanString(String s); | ||||
| } | ||||
|   | ||||
| @@ -34,13 +34,13 @@ public class LuceneMorphology extends MorphologyImpl { | ||||
|  | ||||
|     protected void readRules(BufferedReader bufferedReader) throws IOException { | ||||
|         String s; | ||||
|         int amount; | ||||
|         Integer amount; | ||||
|         s = bufferedReader.readLine(); | ||||
|         amount = Integer.parseInt(s); | ||||
|         amount = Integer.valueOf(s); | ||||
|         rules = new Heuristic[amount][]; | ||||
|         for (int i = 0; i < amount; i++) { | ||||
|             String s1 = bufferedReader.readLine(); | ||||
|             int ruleLenght = Integer.parseInt(s1); | ||||
|             Integer ruleLenght = Integer.valueOf(s1); | ||||
|             Heuristic[] heuristics = new Heuristic[ruleLenght]; | ||||
|             for (int j = 0; j < ruleLenght; j++) { | ||||
|                 heuristics[j] = new Heuristic(bufferedReader.readLine()); | ||||
| @@ -51,7 +51,7 @@ public class LuceneMorphology extends MorphologyImpl { | ||||
|  | ||||
|  | ||||
|     private Heuristic[] modeifyHeuristic(Heuristic[] heuristics) { | ||||
|         ArrayList<Heuristic> result = new ArrayList<>(); | ||||
|         ArrayList<Heuristic> result = new ArrayList<Heuristic>(); | ||||
|         for (Heuristic heuristic : heuristics) { | ||||
|             boolean isAdded = true; | ||||
|             for (Heuristic ch : result) { | ||||
| @@ -61,7 +61,7 @@ public class LuceneMorphology extends MorphologyImpl { | ||||
|                 result.add(heuristic); | ||||
|             } | ||||
|         } | ||||
|         return result.toArray(new Heuristic[0]); | ||||
|         return result.toArray(new Heuristic[result.size()]); | ||||
|     } | ||||
|  | ||||
|     public boolean checkString(String s) { | ||||
|   | ||||
| @@ -17,7 +17,6 @@ package org.apache.lucene.morphology; | ||||
|  | ||||
|  | ||||
| import java.io.*; | ||||
| import java.nio.charset.StandardCharsets; | ||||
| import java.util.ArrayList; | ||||
| import java.util.List; | ||||
|  | ||||
| @@ -48,7 +47,7 @@ public class MorphologyImpl implements Morphology { | ||||
|     } | ||||
|  | ||||
|     public List<String> getNormalForms(String s) { | ||||
|         ArrayList<String> result = new ArrayList<>(); | ||||
|         ArrayList<String> result = new ArrayList<String>(); | ||||
|         int[] ints = decoderEncoder.encodeToArray(revertWord(s)); | ||||
|         int ruleId = findRuleId(ints); | ||||
|         boolean notSeenEmptyString = true; | ||||
| @@ -65,7 +64,7 @@ public class MorphologyImpl implements Morphology { | ||||
|     } | ||||
|  | ||||
|     public List<String> getMorphInfo(String s) { | ||||
|         ArrayList<String> result = new ArrayList<>(); | ||||
|         ArrayList<String> result = new ArrayList<String>(); | ||||
|         int[] ints = decoderEncoder.encodeToArray(revertWord(s)); | ||||
|         int ruleId = findRuleId(ints); | ||||
|         for (Heuristic h : rules[rulesId[ruleId]]) { | ||||
| @@ -101,14 +100,14 @@ public class MorphologyImpl implements Morphology { | ||||
|     private int compareToInts(int[] i1, int[] i2) { | ||||
|         int minLength = Math.min(i1.length, i2.length); | ||||
|         for (int i = 0; i < minLength; i++) { | ||||
|             int i3 = Integer.compare(i1[i], i2[i]); | ||||
|             int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1); | ||||
|             if (i3 != 0) return i3; | ||||
|         } | ||||
|         return i1.length - i2.length; | ||||
|     } | ||||
|  | ||||
|     public void writeToFile(String fileName) throws IOException { | ||||
|         OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8); | ||||
|         OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8"); | ||||
|         writer.write(separators.length + "\n"); | ||||
|         for (int[] i : separators) { | ||||
|             writer.write(i.length + "\n"); | ||||
| @@ -139,7 +138,7 @@ public class MorphologyImpl implements Morphology { | ||||
|     } | ||||
|  | ||||
|     private void readFromInputStream(InputStream inputStream) throws IOException { | ||||
|         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8)); | ||||
|         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8")); | ||||
|         String s = bufferedReader.readLine(); | ||||
|         Integer amount = Integer.valueOf(s); | ||||
|  | ||||
| @@ -154,9 +153,9 @@ public class MorphologyImpl implements Morphology { | ||||
|  | ||||
|     private void readGrammaInfo(BufferedReader bufferedReader) throws IOException { | ||||
|         String s; | ||||
|         int amount; | ||||
|         Integer amount; | ||||
|         s = bufferedReader.readLine(); | ||||
|         amount = Integer.parseInt(s); | ||||
|         amount = Integer.valueOf(s); | ||||
|         grammarInfo = new String[amount]; | ||||
|         for (int i = 0; i < amount; i++) { | ||||
|             grammarInfo[i] = bufferedReader.readLine(); | ||||
| @@ -165,13 +164,13 @@ public class MorphologyImpl implements Morphology { | ||||
|  | ||||
|     protected void readRules(BufferedReader bufferedReader) throws IOException { | ||||
|         String s; | ||||
|         int amount; | ||||
|         Integer amount; | ||||
|         s = bufferedReader.readLine(); | ||||
|         amount = Integer.parseInt(s); | ||||
|         amount = Integer.valueOf(s); | ||||
|         rules = new Heuristic[amount][]; | ||||
|         for (int i = 0; i < amount; i++) { | ||||
|             String s1 = bufferedReader.readLine(); | ||||
|             int ruleLength = Integer.parseInt(s1); | ||||
|             Integer ruleLength = Integer.valueOf(s1); | ||||
|             rules[i] = new Heuristic[ruleLength]; | ||||
|             for (int j = 0; j < ruleLength; j++) { | ||||
|                 rules[i][j] = new Heuristic(bufferedReader.readLine()); | ||||
| @@ -183,7 +182,7 @@ public class MorphologyImpl implements Morphology { | ||||
|         rulesId = new short[amount]; | ||||
|         for (int i = 0; i < amount; i++) { | ||||
|             String s1 = bufferedReader.readLine(); | ||||
|             rulesId[i] = Short.parseShort(s1); | ||||
|             rulesId[i] = Short.valueOf(s1); | ||||
|         } | ||||
|     } | ||||
|  | ||||
| @@ -191,10 +190,10 @@ public class MorphologyImpl implements Morphology { | ||||
|         separators = new int[amount][]; | ||||
|         for (int i = 0; i < amount; i++) { | ||||
|             String s1 = bufferedReader.readLine(); | ||||
|             int wordLenght = Integer.parseInt(s1); | ||||
|             Integer wordLenght = Integer.valueOf(s1); | ||||
|             separators[i] = new int[wordLenght]; | ||||
|             for (int j = 0; j < wordLenght; j++) { | ||||
|                 separators[i][j] = Integer.parseInt(bufferedReader.readLine()); | ||||
|                 separators[i][j] = Integer.valueOf(bufferedReader.readLine()); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|   | ||||
| @@ -17,10 +17,11 @@ | ||||
| package org.apache.lucene.morphology.analyzer; | ||||
|  | ||||
| import org.apache.lucene.analysis.Analyzer; | ||||
| import org.apache.lucene.analysis.LowerCaseFilter; | ||||
| import org.apache.lucene.analysis.TokenFilter; | ||||
| import org.apache.lucene.analysis.core.LowerCaseFilter; | ||||
| import org.apache.lucene.analysis.payloads.PayloadEncoder; | ||||
| import org.apache.lucene.analysis.payloads.PayloadHelper; | ||||
| import org.apache.lucene.analysis.standard.StandardFilter; | ||||
| import org.apache.lucene.analysis.standard.StandardTokenizer; | ||||
| import org.apache.lucene.morphology.LetterDecoderEncoder; | ||||
| import org.apache.lucene.morphology.LuceneMorphology; | ||||
| @@ -28,7 +29,7 @@ import org.apache.lucene.util.BytesRef; | ||||
|  | ||||
| import java.io.IOException; | ||||
| import java.io.InputStream; | ||||
|  | ||||
| import java.io.Reader; | ||||
|  | ||||
| public class MorphologyAnalyzer extends Analyzer { | ||||
|     private LuceneMorphology luceneMorph; | ||||
| @@ -50,29 +51,17 @@ public class MorphologyAnalyzer extends Analyzer { | ||||
|     protected TokenStreamComponents createComponents(String s) { | ||||
|  | ||||
|         StandardTokenizer src = new StandardTokenizer(); | ||||
|         final PayloadEncoder encoder = new PayloadEncoder() { | ||||
|             @Override | ||||
|             public BytesRef encode(char[] buffer) { | ||||
|                 final Float payload = Float.valueOf(new String(buffer)); | ||||
|                 System.out.println(payload); | ||||
|                 final byte[] bytes = PayloadHelper.encodeFloat(payload); | ||||
|                 return new BytesRef(bytes, 0, bytes.length); | ||||
|             } | ||||
|  | ||||
|             @Override | ||||
|             public BytesRef encode(char[] buffer, int offset, int length) { | ||||
|  | ||||
|                 final Float payload = Float.valueOf(new String(buffer, offset, length)); | ||||
|                 System.out.println(payload); | ||||
|                 final byte[] bytes = PayloadHelper.encodeFloat(payload); | ||||
|  | ||||
|                 return new BytesRef(bytes, 0, bytes.length); | ||||
|             } | ||||
|         }; | ||||
|  | ||||
|         TokenFilter filter = new LowerCaseFilter(src); | ||||
|         TokenFilter filter = new StandardFilter(src); | ||||
|         filter = new LowerCaseFilter(filter); | ||||
|         filter = new MorphologyFilter(filter, luceneMorph); | ||||
|  | ||||
|         return new TokenStreamComponents(src::setReader, filter); | ||||
|         return new TokenStreamComponents(src, filter) { | ||||
|             @Override | ||||
|             protected void setReader(final Reader reader) throws IOException { | ||||
|                 super.setReader(reader); | ||||
|             } | ||||
|         }; | ||||
|     } | ||||
|  | ||||
|  | ||||
| } | ||||
|   | ||||
| @@ -19,22 +19,18 @@ package org.apache.lucene.morphology.analyzer; | ||||
| import org.apache.lucene.analysis.TokenFilter; | ||||
| import org.apache.lucene.analysis.TokenStream; | ||||
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||||
| import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; | ||||
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | ||||
| import org.apache.lucene.morphology.LuceneMorphology; | ||||
|  | ||||
| import java.io.IOException; | ||||
| import java.util.Iterator; | ||||
| import java.util.List; | ||||
|  | ||||
|  | ||||
| public class MorphologyFilter extends TokenFilter { | ||||
|     private LuceneMorphology luceneMorph; | ||||
|     private Iterator<String> iterator; | ||||
|     private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); | ||||
|     private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); | ||||
|     private final PositionIncrementAttribute position = addAttribute(PositionIncrementAttribute.class); | ||||
|     private State state = null; | ||||
|  | ||||
|     public MorphologyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) { | ||||
|         super(tokenStream); | ||||
| @@ -43,45 +39,27 @@ public class MorphologyFilter extends TokenFilter { | ||||
|  | ||||
|  | ||||
|     final public boolean incrementToken() throws IOException { | ||||
|         if (iterator != null) { | ||||
|             if (iterator.hasNext()) { | ||||
|                 restoreState(state); | ||||
|                 position.setPositionIncrement(0); | ||||
|                 termAtt.setEmpty().append(iterator.next()); | ||||
|                 return true; | ||||
|             } else { | ||||
|                 state = null; | ||||
|                 iterator = null; | ||||
|             } | ||||
|         } | ||||
|         while (true) { | ||||
|         boolean oldToken = true; | ||||
|         while (iterator == null || !iterator.hasNext()) { | ||||
|             boolean b = input.incrementToken(); | ||||
|             if (!b) { | ||||
|                 return false; | ||||
|             } | ||||
|             if (!keywordAttr.isKeyword() && termAtt.length() > 0) { | ||||
|             String s = new String(termAtt.buffer(), 0, termAtt.length()); | ||||
|             if (luceneMorph.checkString(s)) { | ||||
|                     List<String> forms = luceneMorph.getNormalForms(s); | ||||
|                     if (forms.isEmpty()) { | ||||
|                         continue; | ||||
|                     } else if (forms.size() == 1) { | ||||
|                         termAtt.setEmpty().append(forms.get(0)); | ||||
|                 oldToken = false; | ||||
|                 iterator = luceneMorph.getNormalForms(s).iterator(); | ||||
|             } else { | ||||
|                         state = captureState(); | ||||
|                         iterator = forms.iterator(); | ||||
|                         termAtt.setEmpty().append(iterator.next()); | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|                 return true; | ||||
|             } | ||||
|         } | ||||
|         String s = iterator.next(); | ||||
|         termAtt.setEmpty(); | ||||
|         termAtt.append(s); | ||||
|         if (oldToken) { | ||||
|             position.setPositionIncrement(0); | ||||
|         } | ||||
|         return true; | ||||
|     } | ||||
|  | ||||
|     @Override | ||||
|     public void reset() throws IOException { | ||||
|         super.reset(); | ||||
|         state = null; | ||||
|         iterator = null; | ||||
|     } | ||||
| } | ||||
|   | ||||
							
								
								
									
										118
									
								
								pom.xml
									
									
									
									
									
								
							
							
						
						
									
										118
									
								
								pom.xml
									
									
									
									
									
								
							| @@ -1,10 +1,11 @@ | ||||
| <?xml version="1.0" encoding="UTF-8"?> | ||||
| <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> | ||||
| <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||||
|          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> | ||||
|     <modelVersion>4.0.0</modelVersion> | ||||
|     <groupId>org.apache.lucene.morphology</groupId> | ||||
|     <artifactId>morphology</artifactId> | ||||
|     <packaging>pom</packaging> | ||||
|     <version>1.5</version> | ||||
|     <version>1.2-SNAPSHOT</version> | ||||
|     <name>morphology</name> | ||||
|     <url>http://maven.apache.org</url> | ||||
|  | ||||
| @@ -15,12 +16,6 @@ | ||||
|         <tag>HEAD</tag> | ||||
|     </scm> | ||||
|  | ||||
|     <properties> | ||||
|         <lucene.version>9.3.0</lucene.version> | ||||
|         <morphology.version>1.5</morphology.version> | ||||
|         <junit.version>4.13</junit.version> | ||||
|     </properties> | ||||
|  | ||||
|     <distributionManagement> | ||||
|         <repository> | ||||
|             <id>bintray</id> | ||||
| @@ -28,36 +23,28 @@ | ||||
|         </repository> | ||||
|     </distributionManagement> | ||||
|  | ||||
|     <licenses> | ||||
|         <license> | ||||
|             <name>Apache License, Version 2.0</name> | ||||
|             <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url> | ||||
|             <distribution>repo</distribution> | ||||
|         </license> | ||||
|     </licenses> | ||||
|  | ||||
|     <dependencies> | ||||
|         <dependency> | ||||
|             <groupId>org.apache.lucene</groupId> | ||||
|             <artifactId>lucene-test-framework</artifactId> | ||||
|             <version>${lucene.version}</version> | ||||
|             <groupId>junit</groupId> | ||||
|             <artifactId>junit</artifactId> | ||||
|             <version>4.8.2</version> | ||||
|             <scope>test</scope> | ||||
|         </dependency> | ||||
|         <dependency> | ||||
|             <groupId>org.hamcrest</groupId> | ||||
|             <artifactId>hamcrest-all</artifactId> | ||||
|             <version>1.3</version> | ||||
|             <version>1.1</version> | ||||
|             <scope>test</scope> | ||||
|         </dependency> | ||||
|         <dependency> | ||||
|             <groupId>org.apache.lucene</groupId> | ||||
|             <artifactId>lucene-core</artifactId> | ||||
|             <version>${lucene.version}</version> | ||||
|             <version>5.1.0</version> | ||||
|         </dependency> | ||||
|         <dependency> | ||||
|             <groupId>org.apache.lucene</groupId> | ||||
|             <artifactId>lucene-analysis-common</artifactId> | ||||
|             <version>${lucene.version}</version> | ||||
|             <artifactId>lucene-analyzers-common</artifactId> | ||||
|             <version>5.1.0</version> | ||||
|         </dependency> | ||||
|     </dependencies> | ||||
|  | ||||
| @@ -65,11 +52,11 @@ | ||||
|         <repository> | ||||
|             <id>maven2-repository.dev.java.net</id> | ||||
|             <name>Java.net Repository for Maven</name> | ||||
|             <url>https://download.java.net/maven/2/</url> | ||||
|             <url>http://download.java.net/maven/2/</url> | ||||
|         </repository> | ||||
|         <repository> | ||||
|             <id>bintray</id> | ||||
|             <url>https://dl.bintray.com/akuznetsov/russianmorphology</url> | ||||
|             <url>http://dl.bintray.com/akuznetsov/russianmorphology</url> | ||||
|             <releases> | ||||
|                 <enabled>true</enabled> | ||||
|             </releases> | ||||
| @@ -79,24 +66,12 @@ | ||||
|         </repository> | ||||
|     </repositories> | ||||
|  | ||||
|     <pluginRepositories> | ||||
|         <pluginRepository> | ||||
|             <id>mc-release</id> | ||||
|             <name>maven-license-plugin repository of releases</name> | ||||
|             <url>https://mc-repo.googlecode.com/svn/maven2/releases</url> | ||||
|             <snapshots> | ||||
|                 <enabled>false</enabled> | ||||
|             </snapshots> | ||||
|             <releases> | ||||
|                 <enabled>true</enabled> | ||||
|             </releases> | ||||
|         </pluginRepository> | ||||
|     </pluginRepositories> | ||||
|  | ||||
|     <build> | ||||
|         <plugins> | ||||
|             <plugin> | ||||
|                 <artifactId>maven-release-plugin</artifactId> | ||||
|                 <version>2.5.3</version> | ||||
|                 <version>2.5.2</version> | ||||
|                 <configuration> | ||||
|                     <useReleaseProfile>false</useReleaseProfile> | ||||
|                     <releaseProfiles>release</releaseProfiles> | ||||
| @@ -107,37 +82,42 @@ | ||||
|             <plugin> | ||||
|                 <groupId>org.apache.maven.plugins</groupId> | ||||
|                 <artifactId>maven-compiler-plugin</artifactId> | ||||
|                 <version>3.8.1</version> | ||||
|                 <configuration> | ||||
|                     <source>11</source> | ||||
|                     <target>11</target> | ||||
|                     <source>1.7</source> | ||||
|                     <target>1.7</target> | ||||
|                 </configuration> | ||||
|             </plugin> | ||||
|             <plugin>                <!--                 usage: http://code.google.com/p/maven-license-plugin/wiki/HowTo                --> | ||||
|                 <artifactId>maven-license-plugin</artifactId> | ||||
|                 <groupId>com.google.code.maven-license-plugin</groupId> | ||||
|                 <version>1.4.0</version> | ||||
|                 <configuration> | ||||
|                     <basedir>${project.parent.basedir}</basedir> | ||||
|                     <header>etc/header.txt</header> | ||||
|                     <excludes> | ||||
|                         <exclude>**/*.txt</exclude> | ||||
|                         <exclude>**/*.info</exclude> | ||||
|                         <exclude>**/pom.xml</exclude> | ||||
|                     </excludes> | ||||
|                     <includes> | ||||
|                         <include>**/src/**</include> | ||||
|                     </includes> | ||||
|                 </configuration> | ||||
|                 <executions> | ||||
|                     <execution> | ||||
|                         <phase>test</phase> | ||||
|                         <goals> | ||||
|                             <goal>check</goal> | ||||
|                         </goals> | ||||
|                     </execution> | ||||
|                 </executions> | ||||
|             </plugin> | ||||
|             <!--<plugin>                <!–                 usage: http://code.google.com/p/maven-license-plugin/wiki/HowTo                –>--> | ||||
|  | ||||
|                 <!--<groupId>com.mycila</groupId>--> | ||||
|                 <!--<artifactId>license-maven-plugin</artifactId>--> | ||||
|                 <!--<version>2.11</version>--> | ||||
|  | ||||
|                 <!--<configuration>--> | ||||
|                     <!--<properties>--> | ||||
|                         <!--<owner>Alexander Kuznetsov</owner>--> | ||||
|                         <!--<!–<email>mathieu.carbou@gmail.com</email>–>--> | ||||
|                     <!--</properties>--> | ||||
|                     <!--<basedir>${project.parent.basedir}</basedir>--> | ||||
|                     <!--<header>etc/header.txt</header>--> | ||||
|                     <!--<excludes>--> | ||||
|                         <!--<exclude>**/*.txt</exclude>--> | ||||
|                         <!--<exclude>**/*.info</exclude>--> | ||||
|                         <!--<exclude>**/pom.xml</exclude>--> | ||||
|                     <!--</excludes>--> | ||||
|                     <!--<includes>--> | ||||
|                         <!--<include>**/src/**</include>--> | ||||
|                     <!--</includes>--> | ||||
|                 <!--</configuration>--> | ||||
|                 <!--<executions>--> | ||||
|                     <!--<execution>--> | ||||
|                         <!--<phase>test</phase>--> | ||||
|                         <!--<goals>--> | ||||
|                             <!--<goal>check</goal>--> | ||||
|                         <!--</goals>--> | ||||
|                     <!--</execution>--> | ||||
|                 <!--</executions>--> | ||||
|             <!--</plugin>--> | ||||
|         </plugins> | ||||
|     </build> | ||||
|     <profiles> | ||||
| @@ -147,7 +127,6 @@ | ||||
|                 <plugins> | ||||
|                     <plugin> | ||||
|                         <artifactId>maven-source-plugin</artifactId> | ||||
|                         <version>3.2.1</version> | ||||
|                         <executions> | ||||
|                             <execution> | ||||
|                                 <id>attach-sources</id> | ||||
| @@ -159,7 +138,6 @@ | ||||
|                     </plugin> | ||||
|                     <plugin> | ||||
|                         <artifactId>maven-javadoc-plugin</artifactId> | ||||
|                         <version>3.3.1</version> | ||||
|                         <executions> | ||||
|                             <execution> | ||||
|                                 <id>attach-javadocs</id> | ||||
| @@ -178,6 +156,6 @@ | ||||
|         <module>dictionary-reader</module> | ||||
|         <module>russian</module> | ||||
|         <module>english</module> | ||||
|         <module>solr-morphology-analysis</module> | ||||
|         <module>context</module> | ||||
|     </modules> | ||||
| </project> | ||||
| @@ -3,12 +3,13 @@ | ||||
|     <parent> | ||||
|         <artifactId>morphology</artifactId> | ||||
|         <groupId>org.apache.lucene.morphology</groupId> | ||||
|         <version>1.5</version> | ||||
|         <version>1.2-SNAPSHOT</version> | ||||
|     </parent> | ||||
|     <modelVersion>4.0.0</modelVersion> | ||||
|     <groupId>org.apache.lucene.morphology</groupId> | ||||
|     <artifactId>russian</artifactId> | ||||
|     <name>russian</name> | ||||
|     <version>1.5</version> | ||||
|     <version>1.2-SNAPSHOT</version> | ||||
|     <url>http://maven.apache.org</url> | ||||
|     <dependencies> | ||||
|  | ||||
| @@ -16,13 +17,13 @@ | ||||
|         <dependency> | ||||
|             <groupId>org.apache.lucene.morphology</groupId> | ||||
|             <artifactId>morph</artifactId> | ||||
|             <version>1.5</version> | ||||
|             <version>1.2-SNAPSHOT</version> | ||||
|         </dependency> | ||||
|  | ||||
|         <dependency> | ||||
|             <groupId>junit</groupId> | ||||
|             <artifactId>junit</artifactId> | ||||
|             <version>${junit.version}</version> | ||||
|             <version>4.8.2</version> | ||||
|             <scope>test</scope> | ||||
|         </dependency> | ||||
|  | ||||
|   | ||||
| @@ -20,6 +20,7 @@ import org.apache.lucene.morphology.LetterDecoderEncoder; | ||||
| import org.apache.lucene.morphology.SuffixToLongException; | ||||
| import org.apache.lucene.morphology.WrongCharaterException; | ||||
|  | ||||
| import java.util.ArrayList; | ||||
| import java.util.LinkedList; | ||||
|  | ||||
| /** | ||||
| @@ -41,7 +42,7 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder { | ||||
|             throw new SuffixToLongException("Suffix length should not be greater then " + WORD_PART_LENGHT + " " + string); | ||||
|         int result = 0; | ||||
|         for (int i = 0; i < string.length(); i++) { | ||||
|             int c = string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET; | ||||
|             int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET; | ||||
|             if (c == 45 - RUSSIAN_SMALL_LETTER_OFFSET) { | ||||
|                 c = DASH_CODE; | ||||
|             } | ||||
| @@ -57,7 +58,7 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder { | ||||
|     } | ||||
|  | ||||
|     public int[] encodeToArray(String s) { | ||||
|         LinkedList<Integer> integers = new LinkedList<>(); | ||||
|         LinkedList<Integer> integers = new LinkedList<Integer>(); | ||||
|         while (s.length() > WORD_PART_LENGHT) { | ||||
|             integers.add(encode(s.substring(0, WORD_PART_LENGHT))); | ||||
|             s = s.substring(WORD_PART_LENGHT); | ||||
| @@ -73,16 +74,16 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder { | ||||
|     } | ||||
|  | ||||
|     public String decodeArray(int[] array) { | ||||
|         StringBuilder result = new StringBuilder(); | ||||
|         String result = ""; | ||||
|         for (int i : array) { | ||||
|             result.append(decode(i)); | ||||
|             result += decode(i); | ||||
|         } | ||||
|         return result.toString(); | ||||
|         return result; | ||||
|     } | ||||
|  | ||||
|  | ||||
|     public String decode(Integer suffixN) { | ||||
|         StringBuilder result = new StringBuilder(); | ||||
|         String result = ""; | ||||
|         while (suffixN > 33) { | ||||
|             int c = suffixN % 34 + RUSSIAN_SMALL_LETTER_OFFSET; | ||||
|             if (c == RUSSIAN_SMALL_LETTER_OFFSET) { | ||||
| @@ -90,20 +91,21 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder { | ||||
|                 continue; | ||||
|             } | ||||
|             if (c == DASH_CODE + RUSSIAN_SMALL_LETTER_OFFSET) c = DASH_CHAR; | ||||
|             result.insert(0, (char) c); | ||||
|             result = (char) c + result; | ||||
|             suffixN /= 34; | ||||
|         } | ||||
|         long c = suffixN + RUSSIAN_SMALL_LETTER_OFFSET; | ||||
|         if (c == DASH_CODE + RUSSIAN_SMALL_LETTER_OFFSET) c = DASH_CHAR; | ||||
|         result.insert(0, (char) c); | ||||
|         return result.toString(); | ||||
|         result = (char) c + result; | ||||
|         return result; | ||||
|     } | ||||
|  | ||||
|     public boolean checkCharacter(char c) { | ||||
|         int code = c; | ||||
|         int code = 0 + c; | ||||
|         if (code == 45) return true; | ||||
|         code -= RUSSIAN_SMALL_LETTER_OFFSET; | ||||
|         return code > 0 && code < 33; | ||||
|         if (code > 0 && code < 33) return true; | ||||
|         return false; | ||||
|     } | ||||
|  | ||||
|     public boolean checkString(String word) { | ||||
|   | ||||
| @@ -17,7 +17,6 @@ package org.apache.lucene.morphology.russian; | ||||
|  | ||||
| import org.apache.lucene.morphology.SuffixToLongException; | ||||
| import org.apache.lucene.morphology.WrongCharaterException; | ||||
| import org.hamcrest.MatcherAssert; | ||||
| import org.junit.Before; | ||||
| import org.junit.Test; | ||||
|  | ||||
| @@ -25,9 +24,9 @@ import java.io.BufferedReader; | ||||
| import java.io.IOException; | ||||
| import java.io.InputStream; | ||||
| import java.io.InputStreamReader; | ||||
| import java.nio.charset.StandardCharsets; | ||||
|  | ||||
| import static org.hamcrest.core.IsEqual.equalTo; | ||||
| import static org.junit.Assert.assertThat; | ||||
|  | ||||
| public class RussianLetterDecoderEncoderTest { | ||||
|     private RussianLetterDecoderEncoder decoderEncoder; | ||||
| @@ -41,12 +40,12 @@ public class RussianLetterDecoderEncoderTest { | ||||
|     @Test | ||||
|     public void testShouldPreserverStringComporision() throws IOException { | ||||
|         InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-monotonic.txt"); | ||||
|         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)); | ||||
|         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); | ||||
|         String s = bufferedReader.readLine(); | ||||
|         while (s != null) { | ||||
|             String[] qa = s.trim().split(" "); | ||||
|             if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT && qa[1].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) { | ||||
|                 MatcherAssert.assertThat(decoderEncoder.encode(qa[1]) > decoderEncoder.encode(qa[0]), equalTo(true)); | ||||
|                 assertThat(decoderEncoder.encode(qa[1]) > decoderEncoder.encode(qa[0]), equalTo(true)); | ||||
|             } | ||||
|             s = bufferedReader.readLine(); | ||||
|         } | ||||
| @@ -56,13 +55,13 @@ public class RussianLetterDecoderEncoderTest { | ||||
|     @Test | ||||
|     public void testShouldCorrectDecodeEncode() throws IOException { | ||||
|         InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data.txt"); | ||||
|         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)); | ||||
|         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); | ||||
|         String s = bufferedReader.readLine(); | ||||
|         while (s != null) { | ||||
|             String[] qa = s.trim().split(" "); | ||||
|             if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) { | ||||
|                 Integer encodedSuffix = decoderEncoder.encode(qa[0]); | ||||
|                 MatcherAssert.assertThat(decoderEncoder.decode(encodedSuffix), equalTo(qa[1])); | ||||
|                 assertThat(decoderEncoder.decode(encodedSuffix), equalTo(qa[1])); | ||||
|             } | ||||
|             s = bufferedReader.readLine(); | ||||
|         } | ||||
| @@ -71,12 +70,12 @@ public class RussianLetterDecoderEncoderTest { | ||||
|     @Test | ||||
|     public void testShouldCorrectDecodeEncodeStringToArray() throws IOException { | ||||
|         InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data-for-array.txt"); | ||||
|         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)); | ||||
|         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); | ||||
|         String s = bufferedReader.readLine(); | ||||
|         while (s != null) { | ||||
|             String[] qa = s.trim().split(" "); | ||||
|             int[] ecodedSuffix = decoderEncoder.encodeToArray(qa[0]); | ||||
|             MatcherAssert.assertThat(decoderEncoder.decodeArray(ecodedSuffix), equalTo(qa[1])); | ||||
|             assertThat(decoderEncoder.decodeArray(ecodedSuffix), equalTo(qa[1])); | ||||
|             s = bufferedReader.readLine(); | ||||
|         } | ||||
|     } | ||||
|   | ||||
| @@ -1,70 +0,0 @@ | ||||
| /** | ||||
|  * Copyright 2009 Alexander Kuznetsov | ||||
|  * | ||||
|  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
|  * you may not use this file except in compliance with the License. | ||||
|  * You may obtain a copy of the License at | ||||
|  * | ||||
|  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an "AS IS" BASIS, | ||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|  * See the License for the specific language governing permissions and | ||||
|  * limitations under the License. | ||||
|  */ | ||||
| package org.apache.lucene.analysis.morphology; | ||||
|  | ||||
| import org.apache.lucene.analysis.TokenFilterFactory; | ||||
| import org.apache.lucene.analysis.TokenStream; | ||||
|  | ||||
| import org.apache.lucene.morphology.LuceneMorphology; | ||||
| import org.apache.lucene.morphology.analyzer.MorphologyFilter; | ||||
| import org.apache.lucene.util.ResourceLoader; | ||||
| import org.apache.lucene.util.ResourceLoaderAware; | ||||
|  | ||||
| import java.util.Map; | ||||
|  | ||||
| /** | ||||
|  * Factory for {@link MorphologyFilter}, with configurable language | ||||
|  * <p> | ||||
|  * <b>Note:</b> Two languages are available now: English (default value) and Russian. | ||||
|  * <pre class="prettyprint"> | ||||
|  * <fieldType name="content" class="solr.TextField" positionIncrementGap="100"> | ||||
|  *   <analyzer> | ||||
|  *     <tokenizer class="solr.StandardTokenizerFactory"/> | ||||
|  *     <filter class="solr.LowerCaseFilterFactory"/> | ||||
|  *     <filter class="solr.MorphologyFilterFactory" language="English"/> | ||||
|  *   </analyzer> | ||||
|  * </fieldType></pre> | ||||
|  */ | ||||
| public class MorphologyFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { | ||||
|  | ||||
|     private static final String LANGUAGE_KEY = "language"; | ||||
|  | ||||
|     private String language; | ||||
|     private LuceneMorphology luceneMorphology; | ||||
|  | ||||
|     public MorphologyFilterFactory(Map<String, String> args) { | ||||
|         super(args); | ||||
|  | ||||
|         language = get(args, LANGUAGE_KEY, "English"); | ||||
|         if (!args.isEmpty()) { | ||||
|             throw new IllegalArgumentException("Unknown parameters: " + args); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     public TokenStream create(TokenStream input) { | ||||
|         return new MorphologyFilter(input, luceneMorphology); | ||||
|     } | ||||
|  | ||||
|     public void inform(ResourceLoader loader) { | ||||
|  | ||||
|         String className = "org.apache.lucene.morphology." + language.toLowerCase() + "." + language + "LuceneMorphology"; | ||||
|         luceneMorphology = loader.newInstance(className, LuceneMorphology.class); | ||||
|     } | ||||
|  | ||||
|     public LuceneMorphology getLuceneMorphology() { | ||||
|         return luceneMorphology; | ||||
|     } | ||||
| } | ||||
| @@ -1,75 +0,0 @@ | ||||
| /** | ||||
|  * Copyright 2009 Alexander Kuznetsov | ||||
|  * | ||||
|  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
|  * you may not use this file except in compliance with the License. | ||||
|  * You may obtain a copy of the License at | ||||
|  * | ||||
|  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an "AS IS" BASIS, | ||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|  * See the License for the specific language governing permissions and | ||||
|  * limitations under the License. | ||||
|  */ | ||||
| package org.apache.lucene.analysis.morphology; | ||||
|  | ||||
| import org.apache.lucene.morphology.LuceneMorphology; | ||||
| import org.apache.lucene.morphology.english.EnglishLuceneMorphology; | ||||
| import org.apache.lucene.morphology.russian.RussianLuceneMorphology; | ||||
| import org.apache.lucene.util.ClasspathResourceLoader; | ||||
| import org.apache.lucene.util.ResourceLoader; | ||||
| import org.junit.Assert; | ||||
| import org.junit.Before; | ||||
| import org.junit.Test; | ||||
|  | ||||
| import java.util.HashMap; | ||||
| import java.util.Map; | ||||
|  | ||||
| public class MorphologyFilterFactoryTest { | ||||
|  | ||||
|     private static final String LANGUAGE_KEY = "language"; | ||||
|     private ResourceLoader loader = new ClasspathResourceLoader(MorphologyFilterFactoryTest.class); | ||||
|     private Map<String, String> args; | ||||
|  | ||||
|     @Before | ||||
|     public void setUp() { | ||||
|         args = new HashMap<>(); | ||||
|     } | ||||
|  | ||||
|     @Test | ||||
|     public void if_RussianLanguageKey_then_CreateRussianMorphologyFilter() { | ||||
|  | ||||
|         args.put(LANGUAGE_KEY, "Russian"); | ||||
|         MorphologyFilterFactory morphologyFilterFactory = new MorphologyFilterFactory(args); | ||||
|         morphologyFilterFactory.inform(loader); | ||||
|  | ||||
|         LuceneMorphology luceneMorphology = morphologyFilterFactory.getLuceneMorphology(); | ||||
|  | ||||
|         Assert.assertTrue("Creation the MorphologyFilterFactory with a Russian language key", luceneMorphology instanceof RussianLuceneMorphology); | ||||
|     } | ||||
|  | ||||
|     @Test | ||||
|     public void if_EnglishLanguageKey_then_CreateEnglishMorphologyFilter() { | ||||
|  | ||||
|         args.put(LANGUAGE_KEY, "English"); | ||||
|         MorphologyFilterFactory morphologyFilterFactory = new MorphologyFilterFactory(args); | ||||
|         morphologyFilterFactory.inform(loader); | ||||
|  | ||||
|         LuceneMorphology luceneMorphology = morphologyFilterFactory.getLuceneMorphology(); | ||||
|  | ||||
|         Assert.assertTrue("Creation the MorphologyFilterFactory with a English language key", luceneMorphology instanceof EnglishLuceneMorphology); | ||||
|     } | ||||
|  | ||||
|     @Test | ||||
|     public void if_NoLanguageKey_then_CreateEnglishMorphologyFilter() { | ||||
|  | ||||
|         MorphologyFilterFactory morphologyFilterFactory = new MorphologyFilterFactory(args); | ||||
|         morphologyFilterFactory.inform(loader); | ||||
|  | ||||
|         LuceneMorphology luceneMorphology = morphologyFilterFactory.getLuceneMorphology(); | ||||
|  | ||||
|         Assert.assertTrue("Creation the MorphologyFilterFactory without any language keys", luceneMorphology instanceof EnglishLuceneMorphology); | ||||
|     } | ||||
| } | ||||
		Reference in New Issue
	
	Block a user