Compare commits
	
		
			2 Commits
		
	
	
		
			devel
			...
			ambiguousl
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|   | f095cbe7c0 | ||
|   | 3b2e48821a | 
| @@ -1,55 +0,0 @@ | |||||||
| name: Gitea Action Maven Build |  | ||||||
|  |  | ||||||
| on: |  | ||||||
|   push: |  | ||||||
|     branches: ['devel'] |  | ||||||
|  |  | ||||||
| jobs: |  | ||||||
|   build: |  | ||||||
|     runs-on: ubuntu-latest |  | ||||||
|     steps: |  | ||||||
|       - name: Checkout the repo |  | ||||||
|         uses: actions/checkout@v4 |  | ||||||
|  |  | ||||||
|       - name: Set up Maven |  | ||||||
|         uses: stCarolas/setup-maven@v5 |  | ||||||
|         with: |  | ||||||
|           maven-version: 3.9.9 |  | ||||||
|  |  | ||||||
|       - name: Set up JDK 21 |  | ||||||
|         uses: actions/setup-java@v4 |  | ||||||
|         with: |  | ||||||
|           java-version: 21 |  | ||||||
|           distribution: 'oracle' |  | ||||||
|           cache: 'maven' |  | ||||||
|  |  | ||||||
|       - name: Build, Test and Package with Maven |  | ||||||
|         run: mvn -B verify --file pom.xml -e |  | ||||||
|  |  | ||||||
|       - name: Upload artifact |  | ||||||
|         uses: actions/upload-artifact@v3 |  | ||||||
|         with: |  | ||||||
|           name: russianmorphology-artifact |  | ||||||
|           path: | |  | ||||||
|             dictionary-reader/target/dictionary-reader-1.5.jar |  | ||||||
|             english/target/english-1.5.jar |  | ||||||
|             russian/target/russian-1.5.jar |  | ||||||
|             morph/target/morph-1.5.jar |  | ||||||
|             solr-morphology-analysis/target/morphology-1.5.jar |  | ||||||
|  |  | ||||||
|       - name: Create Maven settings.xml |  | ||||||
|         run: | |  | ||||||
|           cat <<EOF > ~/.m2/settings.xml |  | ||||||
|           <settings> |  | ||||||
|             <servers> |  | ||||||
|               <server> |  | ||||||
|                 <id>gitea</id> |  | ||||||
|                 <username>${{ vars.OWNER }}</username> |  | ||||||
|                 <password>${{ secrets.ACCESS_TOKEN }}</password> |  | ||||||
|               </server> |  | ||||||
|             </servers> |  | ||||||
|           </settings> |  | ||||||
|           EOF |  | ||||||
|  |  | ||||||
|       - name: Deploy to Gitea Packages |  | ||||||
|         run: mvn deploy |  | ||||||
							
								
								
									
										35
									
								
								.github/workflows/main.yaml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										35
									
								
								.github/workflows/main.yaml
									
									
									
									
										vendored
									
									
								
							| @@ -1,35 +0,0 @@ | |||||||
| name: Java CI |  | ||||||
|  |  | ||||||
| on: [push, pull_request] |  | ||||||
|  |  | ||||||
| jobs: |  | ||||||
|   tests: |  | ||||||
|     runs-on: ubuntu-latest |  | ||||||
|  |  | ||||||
|     steps: |  | ||||||
|       - uses: actions/checkout@v2 |  | ||||||
|       - name: Set up JDK 11 |  | ||||||
|         uses: actions/setup-java@v2 |  | ||||||
|         with: |  | ||||||
|           java-version: '11' |  | ||||||
|           distribution: 'adopt' |  | ||||||
|       - name: Build with Maven |  | ||||||
|         run: mvn --batch-mode --update-snapshots verify |  | ||||||
|            |  | ||||||
|   pack-artifacts: |  | ||||||
|     runs-on: ubuntu-latest |  | ||||||
|     needs: tests |  | ||||||
|     if: github.ref == 'refs/heads/master' |  | ||||||
|     steps: |  | ||||||
|       - uses: actions/checkout@v2 |  | ||||||
|       - name: Set up JDK 11 |  | ||||||
|         uses: actions/setup-java@v2 |  | ||||||
|         with: |  | ||||||
|           java-version: '11' |  | ||||||
|           distribution: 'adopt' |  | ||||||
|       - name: Build with Maven |  | ||||||
|         run: mvn --batch-mode --update-snapshots verify |  | ||||||
|       - uses: actions/upload-artifact@v2 |  | ||||||
|         with: |  | ||||||
|           name: artifacts |  | ||||||
|           path: ${{ github.workspace }}/*/target/*.jar |  | ||||||
							
								
								
									
										202
									
								
								LICENSE
									
									
									
									
									
								
							
							
						
						
									
										202
									
								
								LICENSE
									
									
									
									
									
								
							| @@ -1,202 +0,0 @@ | |||||||
|  |  | ||||||
|                                  Apache License |  | ||||||
|                            Version 2.0, January 2004 |  | ||||||
|                         http://www.apache.org/licenses/ |  | ||||||
|  |  | ||||||
|    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION |  | ||||||
|  |  | ||||||
|    1. Definitions. |  | ||||||
|  |  | ||||||
|       "License" shall mean the terms and conditions for use, reproduction, |  | ||||||
|       and distribution as defined by Sections 1 through 9 of this document. |  | ||||||
|  |  | ||||||
|       "Licensor" shall mean the copyright owner or entity authorized by |  | ||||||
|       the copyright owner that is granting the License. |  | ||||||
|  |  | ||||||
|       "Legal Entity" shall mean the union of the acting entity and all |  | ||||||
|       other entities that control, are controlled by, or are under common |  | ||||||
|       control with that entity. For the purposes of this definition, |  | ||||||
|       "control" means (i) the power, direct or indirect, to cause the |  | ||||||
|       direction or management of such entity, whether by contract or |  | ||||||
|       otherwise, or (ii) ownership of fifty percent (50%) or more of the |  | ||||||
|       outstanding shares, or (iii) beneficial ownership of such entity. |  | ||||||
|  |  | ||||||
|       "You" (or "Your") shall mean an individual or Legal Entity |  | ||||||
|       exercising permissions granted by this License. |  | ||||||
|  |  | ||||||
|       "Source" form shall mean the preferred form for making modifications, |  | ||||||
|       including but not limited to software source code, documentation |  | ||||||
|       source, and configuration files. |  | ||||||
|  |  | ||||||
|       "Object" form shall mean any form resulting from mechanical |  | ||||||
|       transformation or translation of a Source form, including but |  | ||||||
|       not limited to compiled object code, generated documentation, |  | ||||||
|       and conversions to other media types. |  | ||||||
|  |  | ||||||
|       "Work" shall mean the work of authorship, whether in Source or |  | ||||||
|       Object form, made available under the License, as indicated by a |  | ||||||
|       copyright notice that is included in or attached to the work |  | ||||||
|       (an example is provided in the Appendix below). |  | ||||||
|  |  | ||||||
|       "Derivative Works" shall mean any work, whether in Source or Object |  | ||||||
|       form, that is based on (or derived from) the Work and for which the |  | ||||||
|       editorial revisions, annotations, elaborations, or other modifications |  | ||||||
|       represent, as a whole, an original work of authorship. For the purposes |  | ||||||
|       of this License, Derivative Works shall not include works that remain |  | ||||||
|       separable from, or merely link (or bind by name) to the interfaces of, |  | ||||||
|       the Work and Derivative Works thereof. |  | ||||||
|  |  | ||||||
|       "Contribution" shall mean any work of authorship, including |  | ||||||
|       the original version of the Work and any modifications or additions |  | ||||||
|       to that Work or Derivative Works thereof, that is intentionally |  | ||||||
|       submitted to Licensor for inclusion in the Work by the copyright owner |  | ||||||
|       or by an individual or Legal Entity authorized to submit on behalf of |  | ||||||
|       the copyright owner. For the purposes of this definition, "submitted" |  | ||||||
|       means any form of electronic, verbal, or written communication sent |  | ||||||
|       to the Licensor or its representatives, including but not limited to |  | ||||||
|       communication on electronic mailing lists, source code control systems, |  | ||||||
|       and issue tracking systems that are managed by, or on behalf of, the |  | ||||||
|       Licensor for the purpose of discussing and improving the Work, but |  | ||||||
|       excluding communication that is conspicuously marked or otherwise |  | ||||||
|       designated in writing by the copyright owner as "Not a Contribution." |  | ||||||
|  |  | ||||||
|       "Contributor" shall mean Licensor and any individual or Legal Entity |  | ||||||
|       on behalf of whom a Contribution has been received by Licensor and |  | ||||||
|       subsequently incorporated within the Work. |  | ||||||
|  |  | ||||||
|    2. Grant of Copyright License. Subject to the terms and conditions of |  | ||||||
|       this License, each Contributor hereby grants to You a perpetual, |  | ||||||
|       worldwide, non-exclusive, no-charge, royalty-free, irrevocable |  | ||||||
|       copyright license to reproduce, prepare Derivative Works of, |  | ||||||
|       publicly display, publicly perform, sublicense, and distribute the |  | ||||||
|       Work and such Derivative Works in Source or Object form. |  | ||||||
|  |  | ||||||
|    3. Grant of Patent License. Subject to the terms and conditions of |  | ||||||
|       this License, each Contributor hereby grants to You a perpetual, |  | ||||||
|       worldwide, non-exclusive, no-charge, royalty-free, irrevocable |  | ||||||
|       (except as stated in this section) patent license to make, have made, |  | ||||||
|       use, offer to sell, sell, import, and otherwise transfer the Work, |  | ||||||
|       where such license applies only to those patent claims licensable |  | ||||||
|       by such Contributor that are necessarily infringed by their |  | ||||||
|       Contribution(s) alone or by combination of their Contribution(s) |  | ||||||
|       with the Work to which such Contribution(s) was submitted. If You |  | ||||||
|       institute patent litigation against any entity (including a |  | ||||||
|       cross-claim or counterclaim in a lawsuit) alleging that the Work |  | ||||||
|       or a Contribution incorporated within the Work constitutes direct |  | ||||||
|       or contributory patent infringement, then any patent licenses |  | ||||||
|       granted to You under this License for that Work shall terminate |  | ||||||
|       as of the date such litigation is filed. |  | ||||||
|  |  | ||||||
|    4. Redistribution. You may reproduce and distribute copies of the |  | ||||||
|       Work or Derivative Works thereof in any medium, with or without |  | ||||||
|       modifications, and in Source or Object form, provided that You |  | ||||||
|       meet the following conditions: |  | ||||||
|  |  | ||||||
|       (a) You must give any other recipients of the Work or |  | ||||||
|           Derivative Works a copy of this License; and |  | ||||||
|  |  | ||||||
|       (b) You must cause any modified files to carry prominent notices |  | ||||||
|           stating that You changed the files; and |  | ||||||
|  |  | ||||||
|       (c) You must retain, in the Source form of any Derivative Works |  | ||||||
|           that You distribute, all copyright, patent, trademark, and |  | ||||||
|           attribution notices from the Source form of the Work, |  | ||||||
|           excluding those notices that do not pertain to any part of |  | ||||||
|           the Derivative Works; and |  | ||||||
|  |  | ||||||
|       (d) If the Work includes a "NOTICE" text file as part of its |  | ||||||
|           distribution, then any Derivative Works that You distribute must |  | ||||||
|           include a readable copy of the attribution notices contained |  | ||||||
|           within such NOTICE file, excluding those notices that do not |  | ||||||
|           pertain to any part of the Derivative Works, in at least one |  | ||||||
|           of the following places: within a NOTICE text file distributed |  | ||||||
|           as part of the Derivative Works; within the Source form or |  | ||||||
|           documentation, if provided along with the Derivative Works; or, |  | ||||||
|           within a display generated by the Derivative Works, if and |  | ||||||
|           wherever such third-party notices normally appear. The contents |  | ||||||
|           of the NOTICE file are for informational purposes only and |  | ||||||
|           do not modify the License. You may add Your own attribution |  | ||||||
|           notices within Derivative Works that You distribute, alongside |  | ||||||
|           or as an addendum to the NOTICE text from the Work, provided |  | ||||||
|           that such additional attribution notices cannot be construed |  | ||||||
|           as modifying the License. |  | ||||||
|  |  | ||||||
|       You may add Your own copyright statement to Your modifications and |  | ||||||
|       may provide additional or different license terms and conditions |  | ||||||
|       for use, reproduction, or distribution of Your modifications, or |  | ||||||
|       for any such Derivative Works as a whole, provided Your use, |  | ||||||
|       reproduction, and distribution of the Work otherwise complies with |  | ||||||
|       the conditions stated in this License. |  | ||||||
|  |  | ||||||
|    5. Submission of Contributions. Unless You explicitly state otherwise, |  | ||||||
|       any Contribution intentionally submitted for inclusion in the Work |  | ||||||
|       by You to the Licensor shall be under the terms and conditions of |  | ||||||
|       this License, without any additional terms or conditions. |  | ||||||
|       Notwithstanding the above, nothing herein shall supersede or modify |  | ||||||
|       the terms of any separate license agreement you may have executed |  | ||||||
|       with Licensor regarding such Contributions. |  | ||||||
|  |  | ||||||
|    6. Trademarks. This License does not grant permission to use the trade |  | ||||||
|       names, trademarks, service marks, or product names of the Licensor, |  | ||||||
|       except as required for reasonable and customary use in describing the |  | ||||||
|       origin of the Work and reproducing the content of the NOTICE file. |  | ||||||
|  |  | ||||||
|    7. Disclaimer of Warranty. Unless required by applicable law or |  | ||||||
|       agreed to in writing, Licensor provides the Work (and each |  | ||||||
|       Contributor provides its Contributions) on an "AS IS" BASIS, |  | ||||||
|       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or |  | ||||||
|       implied, including, without limitation, any warranties or conditions |  | ||||||
|       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A |  | ||||||
|       PARTICULAR PURPOSE. You are solely responsible for determining the |  | ||||||
|       appropriateness of using or redistributing the Work and assume any |  | ||||||
|       risks associated with Your exercise of permissions under this License. |  | ||||||
|  |  | ||||||
|    8. Limitation of Liability. In no event and under no legal theory, |  | ||||||
|       whether in tort (including negligence), contract, or otherwise, |  | ||||||
|       unless required by applicable law (such as deliberate and grossly |  | ||||||
|       negligent acts) or agreed to in writing, shall any Contributor be |  | ||||||
|       liable to You for damages, including any direct, indirect, special, |  | ||||||
|       incidental, or consequential damages of any character arising as a |  | ||||||
|       result of this License or out of the use or inability to use the |  | ||||||
|       Work (including but not limited to damages for loss of goodwill, |  | ||||||
|       work stoppage, computer failure or malfunction, or any and all |  | ||||||
|       other commercial damages or losses), even if such Contributor |  | ||||||
|       has been advised of the possibility of such damages. |  | ||||||
|  |  | ||||||
|    9. Accepting Warranty or Additional Liability. While redistributing |  | ||||||
|       the Work or Derivative Works thereof, You may choose to offer, |  | ||||||
|       and charge a fee for, acceptance of support, warranty, indemnity, |  | ||||||
|       or other liability obligations and/or rights consistent with this |  | ||||||
|       License. However, in accepting such obligations, You may act only |  | ||||||
|       on Your own behalf and on Your sole responsibility, not on behalf |  | ||||||
|       of any other Contributor, and only if You agree to indemnify, |  | ||||||
|       defend, and hold each Contributor harmless for any liability |  | ||||||
|       incurred by, or claims asserted against, such Contributor by reason |  | ||||||
|       of your accepting any such warranty or additional liability. |  | ||||||
|  |  | ||||||
|    END OF TERMS AND CONDITIONS |  | ||||||
|  |  | ||||||
|    APPENDIX: How to apply the Apache License to your work. |  | ||||||
|  |  | ||||||
|       To apply the Apache License to your work, attach the following |  | ||||||
|       boilerplate notice, with the fields enclosed by brackets "[]" |  | ||||||
|       replaced with your own identifying information. (Don't include |  | ||||||
|       the brackets!)  The text should be enclosed in the appropriate |  | ||||||
|       comment syntax for the file format. We also recommend that a |  | ||||||
|       file or class name and description of purpose be included on the |  | ||||||
|       same "printed page" as the copyright notice for easier |  | ||||||
|       identification within third-party archives. |  | ||||||
|  |  | ||||||
|    Copyright [yyyy] [name of copyright owner] |  | ||||||
|  |  | ||||||
|    Licensed under the Apache License, Version 2.0 (the "License"); |  | ||||||
|    you may not use this file except in compliance with the License. |  | ||||||
|    You may obtain a copy of the License at |  | ||||||
|  |  | ||||||
|        http://www.apache.org/licenses/LICENSE-2.0 |  | ||||||
|  |  | ||||||
|    Unless required by applicable law or agreed to in writing, software |  | ||||||
|    distributed under the License is distributed on an "AS IS" BASIS, |  | ||||||
|    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |  | ||||||
|    See the License for the specific language governing permissions and |  | ||||||
|    limitations under the License. |  | ||||||
							
								
								
									
										64
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										64
									
								
								README.md
									
									
									
									
									
								
							| @@ -1,17 +1,54 @@ | |||||||
| # Russian Morphology for Apache Lucene | # Russian Morphology for lucene | ||||||
|  |  | ||||||
| Russian and English morphology for Java and [Apache Lucene](http://lucene.apache.org) 9.3 framework based on open source dictionary from site [АОТ](http://aot.ru). It uses dictionary base morphology with some heuristics for unknown words. It supports a homonym for example for a Russian word "вина" it gives two variants "вино" and "вина". | Russian and English morphology for java and lucene 3.0 framework based on open source dictionary from site [АОТ](http://aot.ru). It use dictionary base morphology with some heuristics for unknown words. It support homonym for example for Russian word "вина" it gives two variants "вино" and "вина".  | ||||||
|  |  | ||||||
|  |  | ||||||
| ### How to use | ### How to use | ||||||
|  |  | ||||||
| Build the project by running `mvn clean package`; this produces the latest version of the artifacts (1.5). Add them to your classpath. You can choose which language package to use: Russian or English. | First download  | ||||||
|  | [morph-1.1.jar](https://bintray.com/artifact/download/akuznetsov/russianmorphology/org/apache/lucene/morphology/morph/1.1/morph-1.1.jar)   | ||||||
|  | and add it to your class path. Then download the [Russian](https://bintray.com/artifact/download/akuznetsov/russianmorphology/org/apache/lucene/morphology/russian/1.1/russian-1.1.jar) or  | ||||||
|  | [English](https://bintray.com/artifact/download/akuznetsov/russianmorphology/org/apache/lucene/morphology/english/1.1/english-1.1.jar) package.  | ||||||
|  |  | ||||||
|  | If you use maven you can add dependency  | ||||||
|  |  | ||||||
|  |         <dependency> | ||||||
|  |             <groupId>org.apache.lucene.morphology</groupId> | ||||||
|  |             <artifactId>russian</artifactId> | ||||||
|  |             <version>1.1</version> | ||||||
|  |         </dependency> | ||||||
|  |  | ||||||
|  |  | ||||||
|  |         <dependency> | ||||||
|  |             <groupId>org.apache.lucene.morphology</groupId> | ||||||
|  |             <artifactId>english</artifactId> | ||||||
|  |             <version>1.1</version> | ||||||
|  |         </dependency> | ||||||
|  |  | ||||||
|  | Don't forget to add a link to the repository: | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     <repositories> | ||||||
|  |     ............... | ||||||
|  |       <repository> | ||||||
|  |         <snapshots> | ||||||
|  |           <enabled>false</enabled> | ||||||
|  |         </snapshots> | ||||||
|  |         <id>bintray-akuznetsov-russianmorphology</id> | ||||||
|  |         <name>bintray</name> | ||||||
|  |         <url>http://dl.bintray.com/akuznetsov/russianmorphology</url> | ||||||
|  |       </repository> | ||||||
|  |     </repositories> | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  | Now you can create a Lucene Analyzer  | ||||||
|  |  | ||||||
| Now you can create a Lucene Analyzer: |  | ||||||
|  |  | ||||||
|       RussianAnalayzer russian = new RussianAnalayzer(); |       RussianAnalayzer russian = new RussianAnalayzer(); | ||||||
|       EnglishAnalayzer english = new EnglishAnalayzer(); |       EnglishAnalayzer english = new EnglishAnalayzer(); | ||||||
|  |  | ||||||
|  |  | ||||||
| You can write your own analyzer using a filter that converts a word into its base forms.  | You can write your own analyzer using a filter that converts a word into its base forms.  | ||||||
|  |  | ||||||
|       LuceneMorphology luceneMorph = new EnglishLuceneMorphology(); |       LuceneMorphology luceneMorph = new EnglishLuceneMorphology(); | ||||||
| @@ -25,28 +62,9 @@ Also if you need get a list of base forms of word, you can use following example | |||||||
|      LuceneMorphology luceneMorph = new EnglishLuceneMorphology(); |      LuceneMorphology luceneMorph = new EnglishLuceneMorphology(); | ||||||
|      List<String> wordBaseForms = luceneMorph.getMorphInfo(word); |      List<String> wordBaseForms = luceneMorph.getMorphInfo(word); | ||||||
|  |  | ||||||
| ### Solr |  | ||||||
|  |  | ||||||
| You can use the LuceneMorphology as morphology filter in a Solr _schema.xml_ using a **MorphologyFilterFactory:** |  | ||||||
|  |  | ||||||
| ```xml |  | ||||||
| <fieldType name="content" class="solr.TextField" positionIncrementGap="100"> |  | ||||||
|       <analyzer> |  | ||||||
|         <tokenizer class="solr.StandardTokenizerFactory"/> |  | ||||||
| 		<filter class="org.apache.lucene.analysis.morphology.MorphologyFilterFactory" language="Russian"/> |  | ||||||
| 		<filter class="org.apache.lucene.analysis.morphology.MorphologyFilterFactory" language="English"/> |  | ||||||
|       </analyzer> |  | ||||||
| </fieldType> |  | ||||||
| ``` |  | ||||||
|  |  | ||||||
| Just add _morphology-1.5.jar_ in your Solr lib-directories |  | ||||||
|  |  | ||||||
| ### Restrictions | ### Restrictions | ||||||
|    |    | ||||||
|   * It works only with UTF-8. |   * It works only with UTF-8. | ||||||
|   * It assumes that the letters е and ё are the same. |   * It assumes that the letters е and ё are the same. | ||||||
|   * Word forms with prefixes like "наибольший" are treated as separate words.  |   * Word forms with prefixes like "наибольший" are treated as separate words.  | ||||||
|  |  | ||||||
| ### License |  | ||||||
|  |  | ||||||
| Apache License, Version 2.0 |  | ||||||
|   | |||||||
| @@ -1,40 +1,36 @@ | |||||||
| <?xml version="1.0" encoding="UTF-8"?> | <?xml version="1.0"?> | ||||||
| <project xmlns="http://maven.apache.org/POM/4.0.0" | <project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" | ||||||
|          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |          xmlns="http://maven.apache.org/POM/4.0.0" | ||||||
|          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> | ||||||
|     <parent> |  | ||||||
|         <artifactId>morphology</artifactId> |  | ||||||
|         <groupId>org.apache.lucene.morphology</groupId> |  | ||||||
|         <version>1.5</version> |  | ||||||
|     </parent> |  | ||||||
|     <modelVersion>4.0.0</modelVersion> |     <modelVersion>4.0.0</modelVersion> | ||||||
| 
 |     <parent> | ||||||
|     <groupId>org.apache.lucene.analysis</groupId> |         <groupId>org.apache.lucene.morphology</groupId> | ||||||
|         <artifactId>morphology</artifactId> |         <artifactId>morphology</artifactId> | ||||||
|     <name>solr-morphology-analysis</name> |         <version>1.2-SNAPSHOT</version> | ||||||
|     <version>${morphology.version}</version> |     </parent> | ||||||
|  |     <groupId>org.apache.lucene.morphology</groupId> | ||||||
|  |     <artifactId>context</artifactId> | ||||||
|  |     <version>1.0-SNAPSHOT</version> | ||||||
|  |     <name>context</name> | ||||||
|     <url>http://maven.apache.org</url> |     <url>http://maven.apache.org</url> | ||||||
| 
 |  | ||||||
|     <dependencies> |     <dependencies> | ||||||
| 
 |         <dependency> | ||||||
|  |             <groupId>junit</groupId> | ||||||
|  |             <artifactId>junit</artifactId> | ||||||
|  |             <version>4.8.2</version> | ||||||
|  |             <scope>test</scope> | ||||||
|  |         </dependency> | ||||||
|         <dependency> |         <dependency> | ||||||
|             <groupId>org.apache.lucene.morphology</groupId> |             <groupId>org.apache.lucene.morphology</groupId> | ||||||
|             <artifactId>russian</artifactId> |             <artifactId>russian</artifactId> | ||||||
|             <version>${morphology.version}</version> |             <version>1.2-SNAPSHOT</version> | ||||||
|  |             <scope>test</scope> | ||||||
|         </dependency> |         </dependency> | ||||||
|         <dependency> |         <dependency> | ||||||
|             <groupId>org.apache.lucene.morphology</groupId> |             <groupId>org.apache.lucene.morphology</groupId> | ||||||
|             <artifactId>english</artifactId> |             <artifactId>english</artifactId> | ||||||
|             <version>${morphology.version}</version> |             <version>1.2-SNAPSHOT</version> | ||||||
|         </dependency> |  | ||||||
| 
 |  | ||||||
|         <dependency> |  | ||||||
|             <groupId>junit</groupId> |  | ||||||
|             <artifactId>junit</artifactId> |  | ||||||
|             <version>${junit.version}</version> |  | ||||||
|             <scope>test</scope> |             <scope>test</scope> | ||||||
|         </dependency> |         </dependency> | ||||||
| 
 |  | ||||||
|     </dependencies> |     </dependencies> | ||||||
| 
 |  | ||||||
| </project> | </project> | ||||||
| @@ -0,0 +1,52 @@ | |||||||
|  | /** | ||||||
|  |  * Copyright 2015 Alexander Kuznetsov | ||||||
|  |  * | ||||||
|  |  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  |  * you may not use this file except in compliance with the License. | ||||||
|  |  * You may obtain a copy of the License at | ||||||
|  |  * | ||||||
|  |  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  |  * | ||||||
|  |  * Unless required by applicable law or agreed to in writing, software | ||||||
|  |  * distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  |  * See the License for the specific language governing permissions and | ||||||
|  |  * limitations under the License. | ||||||
|  |  */ | ||||||
|  | package org.apache.lucene.morphology.context; | ||||||
|  |  | ||||||
|  | import org.apache.lucene.analysis.Analyzer; | ||||||
|  | import org.apache.lucene.analysis.TokenStream; | ||||||
|  | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||||||
|  | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | ||||||
|  |  | ||||||
|  | import java.io.ByteArrayInputStream; | ||||||
|  | import java.io.IOException; | ||||||
|  | import java.io.InputStreamReader; | ||||||
|  | import java.util.ArrayList; | ||||||
|  | import java.util.LinkedList; | ||||||
|  | import java.util.List; | ||||||
|  |  | ||||||
|  | public class CalculateContextItem { | ||||||
|  |  | ||||||
|  |     public List<ContextItem> createContextItems(String text) throws IOException { | ||||||
|  |         Analyzer statAnalyzer = new StatAnalyzer(); | ||||||
|  |         InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год? - и что я жду тебя, где вино".getBytes()), "UTF-8"); | ||||||
|  |  | ||||||
|  |  | ||||||
|  | //        new RussianMorphology(); | ||||||
|  |  | ||||||
|  |         TokenStream tokenStream = statAnalyzer.tokenStream(null, reader); | ||||||
|  |         tokenStream.reset(); | ||||||
|  |  | ||||||
|  |         List<List<String>> listedLink =  new LinkedList<>(); | ||||||
|  |         while (tokenStream.incrementToken()) { | ||||||
|  |             CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class); | ||||||
|  |             PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class); | ||||||
|  |  | ||||||
|  |  | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         return null; | ||||||
|  |     } | ||||||
|  | } | ||||||
| @@ -0,0 +1,80 @@ | |||||||
|  | /** | ||||||
|  |  * Copyright 2015 Alexander Kuznetsov | ||||||
|  |  * | ||||||
|  |  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  |  * you may not use this file except in compliance with the License. | ||||||
|  |  * You may obtain a copy of the License at | ||||||
|  |  * | ||||||
|  |  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  |  * | ||||||
|  |  * Unless required by applicable law or agreed to in writing, software | ||||||
|  |  * distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  |  * See the License for the specific language governing permissions and | ||||||
|  |  * limitations under the License. | ||||||
|  |  */ | ||||||
|  | package org.apache.lucene.morphology.context; | ||||||
|  |  | ||||||
|  | import java.util.Arrays; | ||||||
|  |  | ||||||
|  | public class ContextItem implements Comparable<ContextItem> { | ||||||
|  |     String[][] morphInfo; | ||||||
|  |  | ||||||
|  |     public ContextItem(String[][] morphInfo) { | ||||||
|  |         this.morphInfo = morphInfo; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     public String[][] getMorphInfo() { | ||||||
|  |         return morphInfo; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     public void setMorphInfo(String[][] morphInfo) { | ||||||
|  |         this.morphInfo = morphInfo; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     public int hashCode() { | ||||||
|  |         int h = 0; | ||||||
|  |         for (String[] m : morphInfo) { | ||||||
|  |             for (String s : m) { | ||||||
|  |                 h = 31 * h + s.hashCode(); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |         return h; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     @Override | ||||||
|  |     public boolean equals(Object o) { | ||||||
|  |         if (this == o) return true; | ||||||
|  |         if (o == null || getClass() != o.getClass()) return false; | ||||||
|  |  | ||||||
|  |         ContextItem that = (ContextItem) o; | ||||||
|  |  | ||||||
|  |         if (that.morphInfo.length != this.morphInfo.length) { | ||||||
|  |             return false; | ||||||
|  |         } | ||||||
|  |         for (int i = 0; i < morphInfo.length; i++) { | ||||||
|  |             if (!Arrays.equals(morphInfo[i], that.morphInfo[i])) { | ||||||
|  |                 return false; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         return true; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     @Override | ||||||
|  |     public int compareTo(ContextItem o) { | ||||||
|  |         int i = o.morphInfo.length - morphInfo.length; | ||||||
|  |         if (i != 0) return i; | ||||||
|  |         for (int j = 0; j < morphInfo.length; j++) { | ||||||
|  |             i = o.morphInfo[j].length - morphInfo[j].length; | ||||||
|  |             if (i != 0) return i; | ||||||
|  |             for (int k = 0; k < morphInfo[j].length; k++) { | ||||||
|  |                 i = morphInfo[j][k].compareTo(o.morphInfo[j][k]); | ||||||
|  |                 if (i != 0) return i; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |         return 0; | ||||||
|  |     } | ||||||
|  | } | ||||||
| @@ -0,0 +1,37 @@ | |||||||
|  | /** | ||||||
|  |  * Copyright 2015 Alexander Kuznetsov | ||||||
|  |  * | ||||||
|  |  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  |  * you may not use this file except in compliance with the License. | ||||||
|  |  * You may obtain a copy of the License at | ||||||
|  |  * | ||||||
|  |  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  |  * | ||||||
|  |  * Unless required by applicable law or agreed to in writing, software | ||||||
|  |  * distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  |  * See the License for the specific language governing permissions and | ||||||
|  |  * limitations under the License. | ||||||
|  |  */ | ||||||
|  | package org.apache.lucene.morphology.context; | ||||||
|  |  | ||||||
/**
 * Mutable holder that pairs an array of morphological info strings with a
 * probability value.
 *
 * <p>Both fields start at their Java defaults ({@code null} and {@code 0.0})
 * and are stored and returned as given, without copying or validation.
 */
public class ContextStats {
    String[] morphInfo;
    double prob;

    /**
     * @return the probability value last set, or {@code 0.0} if never set
     */
    public double getProb() {
        return this.prob;
    }

    /**
     * @param prob the probability value to store
     */
    public void setProb(double prob) {
        this.prob = prob;
    }

    /**
     * @return the stored array itself (not a copy), or {@code null} if never set
     */
    public String[] getMorphInfo() {
        return this.morphInfo;
    }

    /**
     * @param morphInfo the array to store; kept by reference
     */
    public void setMorphInfo(String[] morphInfo) {
        this.morphInfo = morphInfo;
    }
}
| @@ -0,0 +1,21 @@ | |||||||
|  | /** | ||||||
|  |  * Copyright 2015 Alexander Kuznetsov | ||||||
|  |  * | ||||||
|  |  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  |  * you may not use this file except in compliance with the License. | ||||||
|  |  * You may obtain a copy of the License at | ||||||
|  |  * | ||||||
|  |  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  |  * | ||||||
|  |  * Unless required by applicable law or agreed to in writing, software | ||||||
|  |  * distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  |  * See the License for the specific language governing permissions and | ||||||
|  |  * limitations under the License. | ||||||
|  |  */ | ||||||
|  | package org.apache.lucene.morphology.context; | ||||||
|  |  | ||||||
/**
 * Empty placeholder class; it currently declares no state and no behaviour.
 *
 * <p>NOTE(review): the name looks like a misspelling of "ProbCalculator". It is
 * kept unchanged here because renaming a public class would break references.
 */
public class ProbClalucator {
}
| @@ -0,0 +1,116 @@ | |||||||
|  | /** | ||||||
|  |  * Copyright 2015 Alexander Kuznetsov | ||||||
|  |  * | ||||||
|  |  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  |  * you may not use this file except in compliance with the License. | ||||||
|  |  * You may obtain a copy of the License at | ||||||
|  |  * | ||||||
|  |  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  |  * | ||||||
|  |  * Unless required by applicable law or agreed to in writing, software | ||||||
|  |  * distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  |  * See the License for the specific language governing permissions and | ||||||
|  |  * limitations under the License. | ||||||
|  |  */ | ||||||
|  | package org.apache.lucene.morphology.context; | ||||||
|  |  | ||||||
|  | import org.apache.lucene.analysis.Tokenizer; | ||||||
|  | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||||||
|  | import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl; | ||||||
|  | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | ||||||
|  | import org.apache.lucene.util.AttributeFactory; | ||||||
|  |  | ||||||
|  | import java.io.BufferedReader; | ||||||
|  | import java.io.IOException; | ||||||
|  | import java.util.Arrays; | ||||||
|  | import java.util.HashSet; | ||||||
|  | import java.util.LinkedList; | ||||||
|  | import java.util.Set; | ||||||
|  |  | ||||||
|  | public class SimpleTokenizer extends Tokenizer { | ||||||
|  |  | ||||||
|  |     private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); | ||||||
|  |     private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class); | ||||||
|  |     LinkedList<String> terms; | ||||||
|  |  | ||||||
|  |     public final static Set<Character> SEPARATION_LETTERS = new HashSet<>(Arrays.asList(' ', '(', ')', ',', '|', '\t', | ||||||
|  |             '\n', '"', ':', '!', '?', ',', ';', '•')); | ||||||
|  |  | ||||||
|  |     public final static Set<Character> MEANING_CHARS = new HashSet<>(Arrays.asList('(', ')', ',', '|', | ||||||
|  |             '"', ':', '!', '?', ',', ';', '•', '.')); | ||||||
|  |  | ||||||
|  |     public SimpleTokenizer() { | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     public SimpleTokenizer(AttributeFactory factory) { | ||||||
|  |         super(factory); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     @Override | ||||||
|  |     final public boolean incrementToken() throws IOException { | ||||||
|  |         if (terms == null) { | ||||||
|  |             createTeams(); | ||||||
|  |         } | ||||||
|  |         if (terms.size() > 0) { | ||||||
|  |             String str = terms.poll(); | ||||||
|  |             termAtt.setEmpty(); | ||||||
|  |             termAtt.append(str); | ||||||
|  |             posAtt.setPositionIncrement(1); | ||||||
|  |             return true; | ||||||
|  |         } | ||||||
|  |         return false; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     private void createTeams() throws IOException { | ||||||
|  |         terms = new LinkedList<>(); | ||||||
|  |  | ||||||
|  |         BufferedReader br = new BufferedReader(input); | ||||||
|  |         StringBuilder sb = new StringBuilder(); | ||||||
|  |         String s = ""; | ||||||
|  |         while ((s = br.readLine()) != null) { | ||||||
|  |             sb.append(s).append(" "); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         s = sb.toString(); | ||||||
|  |         CharTermAttributeImpl currentTerm = new CharTermAttributeImpl(); | ||||||
|  |         for (int i = 0; i < s.length(); i++) { | ||||||
|  |             if (checkIsCharSepartor(s, i)) { | ||||||
|  |                 if (checkIsCharHasMeaning(s, i)) { | ||||||
|  |                     terms.add(s.substring(i, i + 1)); | ||||||
|  |                 } | ||||||
|  |                 String term = currentTerm.toString(); | ||||||
|  |                 currentTerm.clear(); | ||||||
|  |                 if (term.length() > 0) { | ||||||
|  |                     terms.add(term); | ||||||
|  |                 } | ||||||
|  |             } else { | ||||||
|  |                 currentTerm.append(s.charAt(i)); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     private boolean checkIsCharHasMeaning(String s, int i) { | ||||||
|  |         return MEANING_CHARS.contains(s.charAt(i)); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     private boolean checkIsCharSepartor(String s, int i) { | ||||||
|  |         char c = s.charAt(i); | ||||||
|  |         if (SEPARATION_LETTERS.contains(c)) { | ||||||
|  |             return true; | ||||||
|  |         } | ||||||
|  |         if ('.' == c | ||||||
|  |                 && s.length() > i + 1 | ||||||
|  |                 && SEPARATION_LETTERS.contains(s.charAt(i + 1))) { | ||||||
|  |             return true; | ||||||
|  |         } | ||||||
|  |         return false; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     @Override | ||||||
|  |     public void reset() throws IOException { | ||||||
|  |         this.terms = null; | ||||||
|  |         super.reset(); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  | } | ||||||
| @@ -0,0 +1,34 @@ | |||||||
|  | package org.apache.lucene.morphology.context; | ||||||
|  |  | ||||||
|  | import org.apache.lucene.analysis.Analyzer; | ||||||
|  | import org.apache.lucene.analysis.TokenFilter; | ||||||
|  | import org.apache.lucene.analysis.core.LowerCaseFilter; | ||||||
|  | import org.apache.lucene.analysis.standard.StandardFilter; | ||||||
|  |  | ||||||
|  | import java.io.IOException; | ||||||
|  | import java.io.Reader; | ||||||
|  |  | ||||||
|  | /** | ||||||
|  |  * Created by akuznetsov on 6/24/15. | ||||||
|  |  */ | ||||||
|  | public class StatAnalyzer extends Analyzer { | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     @Override | ||||||
|  |     protected TokenStreamComponents createComponents(String s) { | ||||||
|  |  | ||||||
|  |         SimpleTokenizer src = new SimpleTokenizer(); | ||||||
|  |         TokenFilter filter = new StandardFilter(src); | ||||||
|  |         filter = new LowerCaseFilter(filter); | ||||||
|  |  | ||||||
|  |         return new TokenStreamComponents(src, filter) { | ||||||
|  |             @Override | ||||||
|  |             protected void setReader(final Reader reader) throws IOException { | ||||||
|  |                 super.setReader(reader); | ||||||
|  |             } | ||||||
|  |         }; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | } | ||||||
| @@ -0,0 +1,32 @@ | |||||||
|  | package org.apache.lucene.morphology.context; | ||||||
|  |  | ||||||
|  |  | ||||||
|  | import org.apache.lucene.analysis.Analyzer; | ||||||
|  | import org.apache.lucene.analysis.TokenStream; | ||||||
|  | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||||||
|  | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | ||||||
|  | import org.junit.Test; | ||||||
|  |  | ||||||
|  | import java.io.ByteArrayInputStream; | ||||||
|  | import java.io.IOException; | ||||||
|  | import java.io.InputStreamReader; | ||||||
|  |  | ||||||
|  | public class SimpleTokenizerTest { | ||||||
|  |  | ||||||
|  |     @Test | ||||||
|  |     public void testSimpleTokenizer() throws IOException { | ||||||
|  |         Analyzer statAnalyzer = new StatAnalyzer(); | ||||||
|  |         InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год? - и что я жду тебя, где вино".getBytes()), "UTF-8"); | ||||||
|  |  | ||||||
|  |         TokenStream tokenStream = statAnalyzer.tokenStream(null, reader); | ||||||
|  |         tokenStream.reset(); | ||||||
|  |  | ||||||
|  |         boolean wordSeen = false; | ||||||
|  |         while (tokenStream.incrementToken()) { | ||||||
|  |             CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class); | ||||||
|  |             PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class); | ||||||
|  |             System.out.println(charTerm.toString()); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  | } | ||||||
| @@ -3,26 +3,27 @@ | |||||||
|     <parent> |     <parent> | ||||||
|         <artifactId>morphology</artifactId> |         <artifactId>morphology</artifactId> | ||||||
|         <groupId>org.apache.lucene.morphology</groupId> |         <groupId>org.apache.lucene.morphology</groupId> | ||||||
|         <version>1.5</version> |         <version>1.2-SNAPSHOT</version> | ||||||
|     </parent> |     </parent> | ||||||
|     <modelVersion>4.0.0</modelVersion> |     <modelVersion>4.0.0</modelVersion> | ||||||
|  |     <groupId>org.apache.lucene.morphology</groupId> | ||||||
|     <artifactId>dictionary-reader</artifactId> |     <artifactId>dictionary-reader</artifactId> | ||||||
|     <name>dictionary-reader</name> |     <name>dictionary-reader</name> | ||||||
|     <version>1.5</version> |     <version>1.2-SNAPSHOT</version> | ||||||
|     <url>http://maven.apache.org</url> |     <url>http://maven.apache.org</url> | ||||||
|  |  | ||||||
|     <dependencies> |     <dependencies> | ||||||
|         <dependency> |         <dependency> | ||||||
|             <groupId>org.apache.lucene.morphology</groupId> |             <groupId>org.apache.lucene.morphology</groupId> | ||||||
|             <artifactId>russian</artifactId> |             <artifactId>russian</artifactId> | ||||||
|             <version>1.5</version> |             <version>1.2-SNAPSHOT</version> | ||||||
|         </dependency> |         </dependency> | ||||||
|  |  | ||||||
|  |  | ||||||
|         <dependency> |         <dependency> | ||||||
|             <groupId>org.apache.lucene.morphology</groupId> |             <groupId>org.apache.lucene.morphology</groupId> | ||||||
|             <artifactId>english</artifactId> |             <artifactId>english</artifactId> | ||||||
|             <version>1.5</version> |             <version>1.2-SNAPSHOT</version> | ||||||
|         </dependency> |         </dependency> | ||||||
|     </dependencies> |     </dependencies> | ||||||
|  |  | ||||||
|   | |||||||
| @@ -22,19 +22,20 @@ import java.io.FileInputStream; | |||||||
| import java.io.IOException; | import java.io.IOException; | ||||||
| import java.io.InputStreamReader; | import java.io.InputStreamReader; | ||||||
| import java.util.ArrayList; | import java.util.ArrayList; | ||||||
|  | import java.util.HashSet; | ||||||
| import java.util.List; | import java.util.List; | ||||||
| import java.util.Set; | import java.util.Set; | ||||||
|  |  | ||||||
|  |  | ||||||
| /** | /** | ||||||
|  * This class contain logic how read |  * This class contain logic how read | ||||||
|  * dictionary and produce word with it all forms. |  * dictonary and produce word with it all forms. | ||||||
|  */ |  */ | ||||||
| public class DictionaryReader { | public class DictionaryReader { | ||||||
|     private String fileName; |     private String fileName; | ||||||
|     private String fileEncoding = "windows-1251"; |     private String fileEncoding = "windows-1251"; | ||||||
|     private List<List<FlexiaModel>> wordsFlexias = new ArrayList<>(); |     private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>(); | ||||||
|     private Set<String> ignoredForm; |     private Set<String> ignoredForm = new HashSet<String>(); | ||||||
|  |  | ||||||
|     public DictionaryReader(String fileName, Set<String> ignoredForm) { |     public DictionaryReader(String fileName, Set<String> ignoredForm) { | ||||||
|         this.fileName = fileName; |         this.fileName = fileName; | ||||||
| @@ -54,7 +55,7 @@ public class DictionaryReader { | |||||||
|  |  | ||||||
|     private void readWords(BufferedReader reader, WordProcessor wordProcessor) throws IOException { |     private void readWords(BufferedReader reader, WordProcessor wordProcessor) throws IOException { | ||||||
|         String s = reader.readLine(); |         String s = reader.readLine(); | ||||||
|         int count = Integer.parseInt(s); |         int count = Integer.valueOf(s); | ||||||
|         int actual = 0; |         int actual = 0; | ||||||
|         for (int i = 0; i < count; i++) { |         for (int i = 0; i < count; i++) { | ||||||
|             s = reader.readLine(); |             s = reader.readLine(); | ||||||
| @@ -78,7 +79,7 @@ public class DictionaryReader { | |||||||
|         String wordBase = wd[0].toLowerCase(); |         String wordBase = wd[0].toLowerCase(); | ||||||
|         if (wordBase.startsWith("-")) return null; |         if (wordBase.startsWith("-")) return null; | ||||||
|         wordBase = "#".equals(wordBase) ? "" : wordBase; |         wordBase = "#".equals(wordBase) ? "" : wordBase; | ||||||
|         List<FlexiaModel> models = wordsFlexias.get(Integer.parseInt(wd[1])); |         List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1])); | ||||||
|         FlexiaModel flexiaModel = models.get(0); |         FlexiaModel flexiaModel = models.get(0); | ||||||
|         if (models.size() == 0 || ignoredForm.contains(flexiaModel.getCode())) { |         if (models.size() == 0 || ignoredForm.contains(flexiaModel.getCode())) { | ||||||
|             return null; |             return null; | ||||||
| @@ -95,7 +96,7 @@ public class DictionaryReader { | |||||||
|  |  | ||||||
|     private void skipBlock(BufferedReader reader) throws IOException { |     private void skipBlock(BufferedReader reader) throws IOException { | ||||||
|         String s = reader.readLine(); |         String s = reader.readLine(); | ||||||
|         int count = Integer.parseInt(s); |         int count = Integer.valueOf(s); | ||||||
|         for (int i = 0; i < count; i++) { |         for (int i = 0; i < count; i++) { | ||||||
|             reader.readLine(); |             reader.readLine(); | ||||||
|         } |         } | ||||||
| @@ -104,7 +105,7 @@ public class DictionaryReader { | |||||||
|  |  | ||||||
|     private void readPrefix(BufferedReader reader) throws IOException { |     private void readPrefix(BufferedReader reader) throws IOException { | ||||||
|         String s = reader.readLine(); |         String s = reader.readLine(); | ||||||
|         int count = Integer.parseInt(s); |         int count = Integer.valueOf(s); | ||||||
|         for (int i = 0; i < count; i++) { |         for (int i = 0; i < count; i++) { | ||||||
|             reader.readLine(); |             reader.readLine(); | ||||||
|         } |         } | ||||||
| @@ -112,10 +113,10 @@ public class DictionaryReader { | |||||||
|  |  | ||||||
|     private void readFlexias(BufferedReader reader) throws IOException { |     private void readFlexias(BufferedReader reader) throws IOException { | ||||||
|         String s = reader.readLine(); |         String s = reader.readLine(); | ||||||
|         int count = Integer.parseInt(s); |         int count = Integer.valueOf(s); | ||||||
|         for (int i = 0; i < count; i++) { |         for (int i = 0; i < count; i++) { | ||||||
|             s = reader.readLine(); |             s = reader.readLine(); | ||||||
|             ArrayList<FlexiaModel> flexiaModelArrayList = new ArrayList<>(); |             ArrayList<FlexiaModel> flexiaModelArrayList = new ArrayList<FlexiaModel>(); | ||||||
|             wordsFlexias.add(flexiaModelArrayList); |             wordsFlexias.add(flexiaModelArrayList); | ||||||
|             for (String line : s.split("%")) { |             for (String line : s.split("%")) { | ||||||
|                 addFlexia(flexiaModelArrayList, line); |                 addFlexia(flexiaModelArrayList, line); | ||||||
|   | |||||||
| @@ -16,8 +16,6 @@ | |||||||
|  |  | ||||||
| package org.apache.lucene.morphology.dictionary; | package org.apache.lucene.morphology.dictionary; | ||||||
|  |  | ||||||
| import java.util.Objects; |  | ||||||
|  |  | ||||||
| /** | /** | ||||||
|  * Represent information of how word form created form it imutible part. |  * Represent information of how word form created form it imutible part. | ||||||
|  */ |  */ | ||||||
| @@ -76,9 +74,11 @@ public class FlexiaModel { | |||||||
|  |  | ||||||
|         FlexiaModel that = (FlexiaModel) o; |         FlexiaModel that = (FlexiaModel) o; | ||||||
|  |  | ||||||
|         if (!Objects.equals(code, that.code)) return false; |         if (code != null ? !code.equals(that.code) : that.code != null) return false; | ||||||
|         if (!Objects.equals(prefix, that.prefix)) return false; |         if (prefix != null ? !prefix.equals(that.prefix) : that.prefix != null) return false; | ||||||
|         return Objects.equals(suffix, that.suffix); |         if (suffix != null ? !suffix.equals(that.suffix) : that.suffix != null) return false; | ||||||
|  |  | ||||||
|  |         return true; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     @Override |     @Override | ||||||
|   | |||||||
| @@ -29,8 +29,8 @@ import java.util.Map; | |||||||
| public class GrammarReader { | public class GrammarReader { | ||||||
|     private String fileName; |     private String fileName; | ||||||
|     private String fileEncoding = "windows-1251"; |     private String fileEncoding = "windows-1251"; | ||||||
|     private List<String> grammarInfo = new ArrayList<>(); |     private List<String> grammarInfo = new ArrayList<String>(); | ||||||
|     private Map<String, Integer> inverseIndex = new HashMap<>(); |     private Map<String, Integer> inverseIndex = new HashMap<String, Integer>(); | ||||||
|  |  | ||||||
|     public GrammarReader(String fileName) throws IOException { |     public GrammarReader(String fileName) throws IOException { | ||||||
|         this.fileName = fileName; |         this.fileName = fileName; | ||||||
| @@ -50,7 +50,7 @@ public class GrammarReader { | |||||||
|             line = line.trim(); |             line = line.trim(); | ||||||
|             if (!line.startsWith("//") && line.length() > 0) { |             if (!line.startsWith("//") && line.length() > 0) { | ||||||
|                 String[] strings = line.split(" ", 2); |                 String[] strings = line.split(" ", 2); | ||||||
|                 int i = grammarInfo.size(); |                 Integer i = grammarInfo.size(); | ||||||
|                 inverseIndex.put(strings[0], i); |                 inverseIndex.put(strings[0], i); | ||||||
|                 grammarInfo.add(i, strings[1]); |                 grammarInfo.add(i, strings[1]); | ||||||
|             } |             } | ||||||
| @@ -63,7 +63,7 @@ public class GrammarReader { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     public String[] getGrammarInfoAsArray() { |     public String[] getGrammarInfoAsArray() { | ||||||
|         return grammarInfo.toArray(new String[0]); |         return grammarInfo.toArray(new String[grammarInfo.size()]); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     public Map<String, Integer> getGrammarInverseIndex() { |     public Map<String, Integer> getGrammarInverseIndex() { | ||||||
|   | |||||||
| @@ -15,7 +15,7 @@ | |||||||
|  */ |  */ | ||||||
| package org.apache.lucene.morphology.dictionary; | package org.apache.lucene.morphology.dictionary; | ||||||
|  |  | ||||||
| import java.util.Collections; | import java.util.Arrays; | ||||||
| import java.util.LinkedList; | import java.util.LinkedList; | ||||||
| import java.util.List; | import java.util.List; | ||||||
|  |  | ||||||
| @@ -29,7 +29,7 @@ public class RemoveFlexiaWithPrefixes extends WordFilter { | |||||||
|     @Override |     @Override | ||||||
|     public List<WordCard> transform(WordCard wordCard) { |     public List<WordCard> transform(WordCard wordCard) { | ||||||
|  |  | ||||||
|         List<FlexiaModel> flexiaModelsToRemove = new LinkedList<>(); |         List<FlexiaModel> flexiaModelsToRemove = new LinkedList<FlexiaModel>(); | ||||||
|         for (FlexiaModel fm : wordCard.getWordsForms()) { |         for (FlexiaModel fm : wordCard.getWordsForms()) { | ||||||
|             if (fm.getPrefix().length() > 0) { |             if (fm.getPrefix().length() > 0) { | ||||||
|                 flexiaModelsToRemove.add(fm); |                 flexiaModelsToRemove.add(fm); | ||||||
| @@ -39,6 +39,6 @@ public class RemoveFlexiaWithPrefixes extends WordFilter { | |||||||
|             wordCard.removeFlexia(fm); |             wordCard.removeFlexia(fm); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         return new LinkedList<>(Collections.singletonList(wordCard)); |         return new LinkedList<WordCard>(Arrays.asList(wordCard)); | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -32,13 +32,13 @@ public class RussianAdvSplitterFilter extends WordFilter { | |||||||
|  |  | ||||||
|     @Override |     @Override | ||||||
|     public List<WordCard> transform(WordCard wordCard) { |     public List<WordCard> transform(WordCard wordCard) { | ||||||
|         LinkedList<WordCard> result = new LinkedList<>(); |         LinkedList<WordCard> result = new LinkedList<WordCard>(); | ||||||
|         result.add(wordCard); |         result.add(wordCard); | ||||||
|  |  | ||||||
|         String baseWord = ""; |         String baseWord = ""; | ||||||
|         String canonicalForm = ""; |         String canonicalForm = ""; | ||||||
|         String canonicalSuffix = ""; |         String canonicalSuffix = ""; | ||||||
|         List<FlexiaModel> flexiaModels = new LinkedList<>(); |         List<FlexiaModel> flexiaModels = new LinkedList<FlexiaModel>(); | ||||||
|         for (FlexiaModel flexiaModel : wordCard.getWordsForms()) { |         for (FlexiaModel flexiaModel : wordCard.getWordsForms()) { | ||||||
|             if (flexiaModel.getPrefix().length() > 0) { |             if (flexiaModel.getPrefix().length() > 0) { | ||||||
|                 flexiaModels.add(new FlexiaModel(flexiaModel.getCode(), flexiaModel.getSuffix(), "")); |                 flexiaModels.add(new FlexiaModel(flexiaModel.getCode(), flexiaModel.getSuffix(), "")); | ||||||
|   | |||||||
| @@ -27,9 +27,9 @@ import java.util.*; | |||||||
|  |  | ||||||
| //todo made refactoring this class | //todo made refactoring this class | ||||||
| public class StatisticsCollector implements WordProcessor { | public class StatisticsCollector implements WordProcessor { | ||||||
|     private TreeMap<String, Set<Heuristic>> inverseIndex = new TreeMap<>(); |     private TreeMap<String, Set<Heuristic>> inverseIndex = new TreeMap<String, Set<Heuristic>>(); | ||||||
|     private Map<Set<Heuristic>, Integer> ruleInverseIndex = new HashMap<>(); |     private Map<Set<Heuristic>, Integer> ruleInverseIndex = new HashMap<Set<Heuristic>, Integer>(); | ||||||
|     private List<Set<Heuristic>> rules = new ArrayList<>(); |     private List<Set<Heuristic>> rules = new ArrayList<Set<Heuristic>>(); | ||||||
|     private GrammarReader grammarReader; |     private GrammarReader grammarReader; | ||||||
|     private LetterDecoderEncoder decoderEncoder; |     private LetterDecoderEncoder decoderEncoder; | ||||||
|  |  | ||||||
| @@ -39,14 +39,18 @@ public class StatisticsCollector implements WordProcessor { | |||||||
|         this.decoderEncoder = decoderEncoder; |         this.decoderEncoder = decoderEncoder; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     public void process(WordCard wordCard) { |     public void process(WordCard wordCard) throws IOException { | ||||||
|         cleanWordCard(wordCard); |         cleanWordCard(wordCard); | ||||||
|         String normalStringMorph = wordCard.getWordsForms().get(0).getCode(); |         String normalStringMorph = wordCard.getWordsForms().get(0).getCode(); | ||||||
|  |  | ||||||
|         for (FlexiaModel fm : wordCard.getWordsForms()) { |         for (FlexiaModel fm : wordCard.getWordsForms()) { | ||||||
|             Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph); |             Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph); | ||||||
|             String form = revertWord(fm.create(wordCard.getBase())); |             String form = revertWord(fm.create(wordCard.getBase())); | ||||||
|             Set<Heuristic> suffixHeuristics = inverseIndex.computeIfAbsent(form, k -> new HashSet<>()); |             Set<Heuristic> suffixHeuristics = inverseIndex.get(form); | ||||||
|  |             if (suffixHeuristics == null) { | ||||||
|  |                 suffixHeuristics = new HashSet<Heuristic>(); | ||||||
|  |                 inverseIndex.put(form, suffixHeuristics); | ||||||
|  |             } | ||||||
|             suffixHeuristics.add(heuristic); |             suffixHeuristics.add(heuristic); | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| @@ -65,7 +69,7 @@ public class StatisticsCollector implements WordProcessor { | |||||||
|  |  | ||||||
|     public void saveHeuristic(String fileName) throws IOException { |     public void saveHeuristic(String fileName) throws IOException { | ||||||
|  |  | ||||||
|         Map<Integer, Integer> dist = new TreeMap<>(); |         Map<Integer, Integer> dist = new TreeMap<Integer, Integer>(); | ||||||
|         Set<Heuristic> prevSet = null; |         Set<Heuristic> prevSet = null; | ||||||
|         int count = 0; |         int count = 0; | ||||||
|         for (String key : inverseIndex.keySet()) { |         for (String key : inverseIndex.keySet()) { | ||||||
| @@ -116,11 +120,11 @@ public class StatisticsCollector implements WordProcessor { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     private String revertWord(String s) { |     private String revertWord(String s) { | ||||||
|         StringBuilder result = new StringBuilder(); |         String result = ""; | ||||||
|         for (int i = 1; i <= s.length(); i++) { |         for (int i = 1; i <= s.length(); i++) { | ||||||
|             result.append(s.charAt(s.length() - i)); |             result += s.charAt(s.length() - i); | ||||||
|         } |         } | ||||||
|         return result.toString(); |         return result; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -128,15 +132,15 @@ public class StatisticsCollector implements WordProcessor { | |||||||
|         String form = fm.create(wordBase); |         String form = fm.create(wordBase); | ||||||
|         String normalForm = wordBase + canonicalSuffix; |         String normalForm = wordBase + canonicalSuffix; | ||||||
|         Integer length = getCommonLength(form, normalForm); |         Integer length = getCommonLength(form, normalForm); | ||||||
|         int actualSuffixLengh = form.length() - length; |         Integer actualSuffixLengh = form.length() - length; | ||||||
|         String actualNormalSuffix = normalForm.substring(length); |         String actualNormalSuffix = normalForm.substring(length); | ||||||
|         Integer integer = grammarReader.getGrammarInverseIndex().get(fm.getCode()); |         Integer integer = grammarReader.getGrammarInverseIndex().get(fm.getCode()); | ||||||
|         Integer nf = grammarReader.getGrammarInverseIndex().get(normalSuffixForm); |         Integer nf = grammarReader.getGrammarInverseIndex().get(normalSuffixForm); | ||||||
|         return new Heuristic((byte) actualSuffixLengh, actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue()); |         return new Heuristic((byte) actualSuffixLengh.intValue(), actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue()); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     public static Integer getCommonLength(String s1, String s2) { |     public static Integer getCommonLength(String s1, String s2) { | ||||||
|         int length = Math.min(s1.length(), s2.length()); |         Integer length = Math.min(s1.length(), s2.length()); | ||||||
|         for (int i = 0; i < length; i++) { |         for (int i = 0; i < length; i++) { | ||||||
|             if (s1.charAt(i) != s2.charAt(i)) return i; |             if (s1.charAt(i) != s2.charAt(i)) return i; | ||||||
|         } |         } | ||||||
|   | |||||||
| @@ -26,7 +26,7 @@ public class WordCard { | |||||||
|     private String canonicalForm; |     private String canonicalForm; | ||||||
|     private String base; |     private String base; | ||||||
|     private String canonicalSuffix; |     private String canonicalSuffix; | ||||||
|     private List<FlexiaModel> wordsForms = new ArrayList<>(); |     private List<FlexiaModel> wordsForms = new ArrayList<FlexiaModel>(); | ||||||
|  |  | ||||||
|     public WordCard(String canonicalForm, String base, String canonicalSuffix) { |     public WordCard(String canonicalForm, String base, String canonicalSuffix) { | ||||||
|         this.canonicalForm = canonicalForm; |         this.canonicalForm = canonicalForm; | ||||||
|   | |||||||
| @@ -17,6 +17,7 @@ package org.apache.lucene.morphology.dictionary; | |||||||
|  |  | ||||||
| import org.apache.lucene.morphology.LetterDecoderEncoder; | import org.apache.lucene.morphology.LetterDecoderEncoder; | ||||||
|  |  | ||||||
|  | import java.util.Arrays; | ||||||
| import java.util.Collections; | import java.util.Collections; | ||||||
| import java.util.LinkedList; | import java.util.LinkedList; | ||||||
| import java.util.List; | import java.util.List; | ||||||
| @@ -37,7 +38,7 @@ public class WordCleaner extends WordFilter { | |||||||
|         if (word.contains("-")) return Collections.emptyList(); |         if (word.contains("-")) return Collections.emptyList(); | ||||||
|         if (!decoderEncoder.checkString(word)) return Collections.emptyList(); |         if (!decoderEncoder.checkString(word)) return Collections.emptyList(); | ||||||
|  |  | ||||||
|         List<FlexiaModel> flexiaModelsToRemove = new LinkedList<>(); |         List<FlexiaModel> flexiaModelsToRemove = new LinkedList<FlexiaModel>(); | ||||||
|         for (FlexiaModel fm : wordCard.getWordsForms()) { |         for (FlexiaModel fm : wordCard.getWordsForms()) { | ||||||
|             if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) { |             if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) { | ||||||
|                 flexiaModelsToRemove.add(fm); |                 flexiaModelsToRemove.add(fm); | ||||||
| @@ -47,6 +48,6 @@ public class WordCleaner extends WordFilter { | |||||||
|             wordCard.removeFlexia(fm); |             wordCard.removeFlexia(fm); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         return new LinkedList<>(Collections.singletonList(wordCard)); |         return new LinkedList<WordCard>(Arrays.asList(wordCard)); | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -23,5 +23,5 @@ import java.io.IOException; | |||||||
|  */ |  */ | ||||||
| public interface WordProcessor { | public interface WordProcessor { | ||||||
|  |  | ||||||
|     void process(WordCard wordCard) throws IOException; |     public void process(WordCard wordCard) throws IOException; | ||||||
| } | } | ||||||
|   | |||||||
| @@ -17,7 +17,7 @@ package org.apache.lucene.morphology.dictionary; | |||||||
|  |  | ||||||
| import org.apache.lucene.morphology.LetterDecoderEncoder; | import org.apache.lucene.morphology.LetterDecoderEncoder; | ||||||
|  |  | ||||||
| import java.util.Collections; | import java.util.Arrays; | ||||||
| import java.util.LinkedList; | import java.util.LinkedList; | ||||||
| import java.util.List; | import java.util.List; | ||||||
|  |  | ||||||
| @@ -42,7 +42,7 @@ public class WordStringCleaner extends WordFilter { | |||||||
|             //made correct code |             //made correct code | ||||||
|             m.setCode(m.getCode().substring(0, 2)); |             m.setCode(m.getCode().substring(0, 2)); | ||||||
|         } |         } | ||||||
|         return new LinkedList<>(Collections.singletonList(wordCard)); |         return new LinkedList<WordCard>(Arrays.asList(wordCard)); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -29,7 +29,7 @@ public class EnglishHeuristicBuilder { | |||||||
|         GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/egramtab.tab"); |         GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/egramtab.tab"); | ||||||
|         EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder(); |         EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder(); | ||||||
|  |  | ||||||
|         DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<>()); |         DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>()); | ||||||
|  |  | ||||||
|         StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder); |         StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder); | ||||||
|         WordCleaner wordCleaner = new WordCleaner(decoderEncoder, statisticsCollector); |         WordCleaner wordCleaner = new WordCleaner(decoderEncoder, statisticsCollector); | ||||||
|   | |||||||
| @@ -28,7 +28,7 @@ public class RussianHeuristicBuilder { | |||||||
|         GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/rgramtab.tab"); |         GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/rgramtab.tab"); | ||||||
|         RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder(); |         RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder(); | ||||||
|  |  | ||||||
|         DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<>()); |         DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>()); | ||||||
|  |  | ||||||
|         StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder); |         StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder); | ||||||
|         WordCleaner wordCleaner = new WordCleaner(decoderEncoder, statisticsCollector); |         WordCleaner wordCleaner = new WordCleaner(decoderEncoder, statisticsCollector); | ||||||
|   | |||||||
| @@ -23,7 +23,6 @@ import org.apache.lucene.morphology.english.EnglishMorphology; | |||||||
| import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder; | import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder; | ||||||
| import org.apache.lucene.morphology.russian.RussianLuceneMorphology; | import org.apache.lucene.morphology.russian.RussianLuceneMorphology; | ||||||
| import org.apache.lucene.morphology.russian.RussianMorphology; | import org.apache.lucene.morphology.russian.RussianMorphology; | ||||||
| import org.hamcrest.MatcherAssert; |  | ||||||
| import org.junit.Before; | import org.junit.Before; | ||||||
| import org.junit.Test; | import org.junit.Test; | ||||||
|  |  | ||||||
| @@ -34,6 +33,7 @@ import java.util.Map; | |||||||
| import java.util.concurrent.atomic.AtomicLong; | import java.util.concurrent.atomic.AtomicLong; | ||||||
|  |  | ||||||
| import static org.hamcrest.Matchers.hasItem; | import static org.hamcrest.Matchers.hasItem; | ||||||
|  | import static org.junit.Assert.assertThat; | ||||||
|  |  | ||||||
|  |  | ||||||
| public class TestAllWords { | public class TestAllWords { | ||||||
| @@ -73,20 +73,22 @@ public class TestAllWords { | |||||||
|         final List<String> morphInfo = grammarInfo.getGrammarInfo(); |         final List<String> morphInfo = grammarInfo.getGrammarInfo(); | ||||||
|         final Map<String, Integer> inversIndex = grammarInfo.getGrammarInverseIndex(); |         final Map<String, Integer> inversIndex = grammarInfo.getGrammarInverseIndex(); | ||||||
|  |  | ||||||
|         DictionaryReader dictionaryReader = new DictionaryReader(pathToDict, new HashSet<>()); |         DictionaryReader dictionaryReader = new DictionaryReader(pathToDict, new HashSet<String>()); | ||||||
|  |  | ||||||
|         final AtomicLong wordCount = new AtomicLong(0); |         final AtomicLong wordCount = new AtomicLong(0); | ||||||
|         long startTime = System.currentTimeMillis(); |         Long startTime = System.currentTimeMillis(); | ||||||
|  |  | ||||||
|         WordProcessor wordProcessor = wordCard -> { |         WordProcessor wordProcessor = new WordProcessor() { | ||||||
|  |             public void process(WordCard wordCard) throws IOException { | ||||||
|                 String word = wordCard.getBase() + wordCard.getCanonicalSuffix(); |                 String word = wordCard.getBase() + wordCard.getCanonicalSuffix(); | ||||||
|                 for (FlexiaModel fm : wordCard.getWordsForms()) { |                 for (FlexiaModel fm : wordCard.getWordsForms()) { | ||||||
|                     String wordForm = wordCard.getBase() + fm.getSuffix(); |                     String wordForm = wordCard.getBase() + fm.getSuffix(); | ||||||
|                     String morph = morphInfo.get(inversIndex.get(fm.getCode())); |                     String morph = morphInfo.get(inversIndex.get(fm.getCode())); | ||||||
|                 MatcherAssert.assertThat(morphology.getMorphInfo(wordForm), hasItem(word + "|" + morph)); |                     assertThat(morphology.getMorphInfo(wordForm), hasItem(word + "|" + morph)); | ||||||
|                 MatcherAssert.assertThat(morphology.getNormalForms(wordForm), hasItem(word)); |                     assertThat(morphology.getNormalForms(wordForm), hasItem(word)); | ||||||
|                     wordCount.set(2L + wordCount.get()); |                     wordCount.set(2L + wordCount.get()); | ||||||
|                 } |                 } | ||||||
|  |             } | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         WordCleaner wordCleaner = new WordCleaner(decoderEncoder, wordProcessor); |         WordCleaner wordCleaner = new WordCleaner(decoderEncoder, wordProcessor); | ||||||
| @@ -121,16 +123,18 @@ public class TestAllWords { | |||||||
|  |  | ||||||
|     private void testAllWordForLucene(final LuceneMorphology morphology, LetterDecoderEncoder decoderEncoder, String pathToDic) throws IOException { |     private void testAllWordForLucene(final LuceneMorphology morphology, LetterDecoderEncoder decoderEncoder, String pathToDic) throws IOException { | ||||||
|         final AtomicLong wordCount = new AtomicLong(0); |         final AtomicLong wordCount = new AtomicLong(0); | ||||||
|         long startTime = System.currentTimeMillis(); |         Long startTime = System.currentTimeMillis(); | ||||||
|  |  | ||||||
|         DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet<>()); |         DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet<String>()); | ||||||
|         WordProcessor wordProcessor = wordCard -> { |         WordProcessor wordProcessor = new WordProcessor() { | ||||||
|  |             public void process(WordCard wordCard) throws IOException { | ||||||
|                 String word = wordCard.getBase() + wordCard.getCanonicalSuffix(); |                 String word = wordCard.getBase() + wordCard.getCanonicalSuffix(); | ||||||
|                 for (FlexiaModel fm : wordCard.getWordsForms()) { |                 for (FlexiaModel fm : wordCard.getWordsForms()) { | ||||||
|                     String wordForm = wordCard.getBase() + fm.getSuffix(); |                     String wordForm = wordCard.getBase() + fm.getSuffix(); | ||||||
|                 MatcherAssert.assertThat(morphology.getNormalForms(wordForm), hasItem(word)); |                     assertThat(morphology.getNormalForms(wordForm), hasItem(word)); | ||||||
|                     wordCount.set(1L + wordCount.get()); |                     wordCount.set(1L + wordCount.get()); | ||||||
|                 } |                 } | ||||||
|  |             } | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         WordCleaner wordCleaner = new WordCleaner(decoderEncoder, wordProcessor); |         WordCleaner wordCleaner = new WordCleaner(decoderEncoder, wordProcessor); | ||||||
|   | |||||||
| @@ -16,12 +16,6 @@ | |||||||
| package org.apache.lucene.morphology; | package org.apache.lucene.morphology; | ||||||
| 
 | 
 | ||||||
| import org.apache.lucene.analysis.Analyzer; | import org.apache.lucene.analysis.Analyzer; | ||||||
| import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; |  | ||||||
| import org.apache.lucene.analysis.CharArraySet; |  | ||||||
| import org.apache.lucene.analysis.LowerCaseFilter; |  | ||||||
| import org.apache.lucene.analysis.TokenFilter; |  | ||||||
| import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; |  | ||||||
| import org.apache.lucene.analysis.standard.StandardTokenizer; |  | ||||||
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||||||
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | ||||||
| import org.apache.lucene.morphology.analyzer.MorphologyAnalyzer; | import org.apache.lucene.morphology.analyzer.MorphologyAnalyzer; | ||||||
| @@ -31,17 +25,16 @@ import org.apache.lucene.analysis.TokenStream; | |||||||
| import org.apache.lucene.morphology.english.EnglishLuceneMorphology; | import org.apache.lucene.morphology.english.EnglishLuceneMorphology; | ||||||
| import org.apache.lucene.morphology.russian.RussianAnalyzer; | import org.apache.lucene.morphology.russian.RussianAnalyzer; | ||||||
| import org.apache.lucene.morphology.russian.RussianLuceneMorphology; | import org.apache.lucene.morphology.russian.RussianLuceneMorphology; | ||||||
| import org.hamcrest.MatcherAssert; |  | ||||||
| import org.junit.Test; | import org.junit.Test; | ||||||
| 
 | 
 | ||||||
| import java.io.*; | import java.io.*; | ||||||
| import java.nio.charset.StandardCharsets; |  | ||||||
| import java.util.*; | import java.util.*; | ||||||
| 
 | 
 | ||||||
| import static org.hamcrest.Matchers.equalTo; | import static org.hamcrest.Matchers.equalTo; | ||||||
|  | import static org.junit.Assert.assertThat; | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| public class TestAnalyzers extends BaseTokenStreamTestCase { | public class AnalyzersTest { | ||||||
| 
 | 
 | ||||||
|     @Test |     @Test | ||||||
|     public void shouldGiveCorrectWordsForEnglish() throws IOException { |     public void shouldGiveCorrectWordsForEnglish() throws IOException { | ||||||
| @@ -67,24 +60,24 @@ public class TestAnalyzers extends BaseTokenStreamTestCase { | |||||||
|         LuceneMorphology englishLuceneMorphology = new EnglishLuceneMorphology(); |         LuceneMorphology englishLuceneMorphology = new EnglishLuceneMorphology(); | ||||||
| 
 | 
 | ||||||
|         MorphologyAnalyzer russianAnalyzer = new MorphologyAnalyzer(russianLuceneMorphology); |         MorphologyAnalyzer russianAnalyzer = new MorphologyAnalyzer(russianLuceneMorphology); | ||||||
|         InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("тест пм тест".getBytes()), StandardCharsets.UTF_8); |         InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("тест пм тест".getBytes()), "UTF-8"); | ||||||
|         TokenStream stream = russianAnalyzer.tokenStream(null, reader); |         TokenStream stream = russianAnalyzer.tokenStream(null, reader); | ||||||
|         MorphologyFilter englishFilter = new MorphologyFilter(stream, englishLuceneMorphology); |         MorphologyFilter englishFilter = new MorphologyFilter(stream, englishLuceneMorphology); | ||||||
| 
 | 
 | ||||||
|         englishFilter.reset(); |         englishFilter.reset(); | ||||||
|         while (englishFilter.incrementToken()) { |         while (englishFilter.incrementToken()) { | ||||||
|             System.out.println(englishFilter); |             System.out.println(englishFilter.toString()); | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     @Test |     @Test | ||||||
|     public void shouldProvideCorrectIndentForWordWithMelitaForm() throws IOException { |     public void shouldProvideCorrectIndentForWordWithMelitaForm() throws IOException { | ||||||
|         Analyzer morphlogyAnalyzer = new RussianAnalyzer(); |         Analyzer morphlogyAnalyzer = new RussianAnalyzer(); | ||||||
|         InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год".getBytes()), StandardCharsets.UTF_8); |         InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год".getBytes()), "UTF-8"); | ||||||
| 
 | 
 | ||||||
|         TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader); |         TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader); | ||||||
|         tokenStream.reset(); |         tokenStream.reset(); | ||||||
|         Set<String> foromsOfWine = new HashSet<>(); |         Set<String> foromsOfWine = new HashSet<String>(); | ||||||
|         foromsOfWine.add("вина"); |         foromsOfWine.add("вина"); | ||||||
|         foromsOfWine.add("винo"); |         foromsOfWine.add("винo"); | ||||||
|         boolean wordSeen = false; |         boolean wordSeen = false; | ||||||
| @@ -92,7 +85,7 @@ public class TestAnalyzers extends BaseTokenStreamTestCase { | |||||||
|             CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class); |             CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class); | ||||||
|             PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class); |             PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class); | ||||||
|             if(foromsOfWine.contains(charTerm.toString()) && wordSeen){ |             if(foromsOfWine.contains(charTerm.toString()) && wordSeen){ | ||||||
|                 MatcherAssert.assertThat(position.getPositionIncrement(),equalTo(0)); |                 assertThat(position.getPositionIncrement(),equalTo(0)); | ||||||
|             } |             } | ||||||
|             if(foromsOfWine.contains(charTerm.toString())){ |             if(foromsOfWine.contains(charTerm.toString())){ | ||||||
|                 wordSeen = true; |                 wordSeen = true; | ||||||
| @@ -102,18 +95,18 @@ public class TestAnalyzers extends BaseTokenStreamTestCase { | |||||||
| 
 | 
 | ||||||
|     private void testAnalayzer(Analyzer morphlogyAnalyzer, String answerPath, String testPath) throws IOException { |     private void testAnalayzer(Analyzer morphlogyAnalyzer, String answerPath, String testPath) throws IOException { | ||||||
|         InputStream stream = this.getClass().getResourceAsStream(answerPath); |         InputStream stream = this.getClass().getResourceAsStream(answerPath); | ||||||
|         BufferedReader breader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)); |         BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); | ||||||
|         String[] strings = breader.readLine().replaceAll(" +", " ").trim().split(" "); |         String[] strings = breader.readLine().replaceAll(" +", " ").trim().split(" "); | ||||||
|         HashSet<String> answer = new HashSet<>(Arrays.asList(strings)); |         HashSet<String> answer = new HashSet<String>(Arrays.asList(strings)); | ||||||
|         stream.close(); |         stream.close(); | ||||||
| 
 | 
 | ||||||
|         stream = this.getClass().getResourceAsStream(testPath); |         stream = this.getClass().getResourceAsStream(testPath); | ||||||
| 
 | 
 | ||||||
|         InputStreamReader reader = new InputStreamReader(stream, StandardCharsets.UTF_8); |         InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); | ||||||
| 
 | 
 | ||||||
|         TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader); |         TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader); | ||||||
|         tokenStream.reset(); |         tokenStream.reset(); | ||||||
|         HashSet<String> result = new HashSet<>(); |         HashSet<String> result = new HashSet<String>(); | ||||||
|         while (tokenStream.incrementToken()) { |         while (tokenStream.incrementToken()) { | ||||||
|             CharTermAttribute attribute1 = tokenStream.getAttribute(CharTermAttribute.class); |             CharTermAttribute attribute1 = tokenStream.getAttribute(CharTermAttribute.class); | ||||||
|             result.add(attribute1.toString()); |             result.add(attribute1.toString()); | ||||||
| @@ -121,45 +114,6 @@ public class TestAnalyzers extends BaseTokenStreamTestCase { | |||||||
| 
 | 
 | ||||||
|         stream.close(); |         stream.close(); | ||||||
| 
 | 
 | ||||||
|         MatcherAssert.assertThat(result, equalTo(answer)); |         assertThat(result, equalTo(answer)); | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     @Test |  | ||||||
|     public void testPositionIncrement() throws IOException { |  | ||||||
|         EnglishAnalyzer englishAnalyzer = new EnglishAnalyzer(); |  | ||||||
|         assertTokenStreamContents( |  | ||||||
|                 englishAnalyzer.tokenStream("test", "There are tests!"), |  | ||||||
|                 new String[]{"there", "are", "be", "test"}, |  | ||||||
|                 new int[]{0, 6, 6, 10}, |  | ||||||
|                 new int[]{5, 9, 9, 15}, |  | ||||||
|                 new String[]{"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>"}, |  | ||||||
|                 new int[]{1, 1, 0, 1} |  | ||||||
|         ); |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     @Test |  | ||||||
|     public void testKeywordHandling() throws IOException { |  | ||||||
|         Analyzer analyzer = new EnglishKeywordTestAnalyzer(); |  | ||||||
|         assertTokenStreamContents( |  | ||||||
|                 analyzer.tokenStream("test", "Tests shouldn't be stemmed, but tests should!"), |  | ||||||
|                 new String[]{"tests", "shouldn't", "be", "stem", "but", "test", "shall"} |  | ||||||
|         ); |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     private static class EnglishKeywordTestAnalyzer extends Analyzer { |  | ||||||
|         @Override |  | ||||||
|         protected TokenStreamComponents createComponents(String s) { |  | ||||||
|             StandardTokenizer src = new StandardTokenizer(); |  | ||||||
|             CharArraySet dontStem = new CharArraySet(1, false); |  | ||||||
|             dontStem.add("Tests"); |  | ||||||
|             TokenFilter filter = new SetKeywordMarkerFilter(src, dontStem); |  | ||||||
|             filter = new LowerCaseFilter(filter); |  | ||||||
|             try { |  | ||||||
|                 filter = new MorphologyFilter(filter, new EnglishLuceneMorphology()); |  | ||||||
|             } catch (IOException ex) { |  | ||||||
|                 throw new RuntimeException("cannot create EnglishLuceneMorphology", ex); |  | ||||||
|             } |  | ||||||
|             return new TokenStreamComponents(src, filter); |  | ||||||
|         } |  | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -17,23 +17,22 @@ package org.apache.lucene.morphology; | |||||||
| 
 | 
 | ||||||
| import org.apache.lucene.morphology.russian.RussianLuceneMorphology; | import org.apache.lucene.morphology.russian.RussianLuceneMorphology; | ||||||
| import org.apache.lucene.morphology.english.EnglishLuceneMorphology; | import org.apache.lucene.morphology.english.EnglishLuceneMorphology; | ||||||
| import org.hamcrest.MatcherAssert; |  | ||||||
| import org.junit.Test; | import org.junit.Test; | ||||||
| 
 | 
 | ||||||
| import java.io.BufferedReader; | import java.io.BufferedReader; | ||||||
| import java.io.IOException; | import java.io.IOException; | ||||||
| import java.io.InputStream; | import java.io.InputStream; | ||||||
| import java.io.InputStreamReader; | import java.io.InputStreamReader; | ||||||
| import java.nio.charset.StandardCharsets; |  | ||||||
| import java.util.Arrays; | import java.util.Arrays; | ||||||
| import java.util.HashSet; | import java.util.HashSet; | ||||||
| import java.util.List; | import java.util.List; | ||||||
| import java.util.Set; | import java.util.Set; | ||||||
| 
 | 
 | ||||||
| import static org.hamcrest.CoreMatchers.equalTo; | import static org.hamcrest.CoreMatchers.equalTo; | ||||||
|  | import static org.junit.Assert.assertThat; | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| public class TestLuceneMorph { | public class LuceneMorphTest { | ||||||
| 
 | 
 | ||||||
|     @Test |     @Test | ||||||
|     public void englishMorphologyShouldGetCorrectNormalForm() throws IOException { |     public void englishMorphologyShouldGetCorrectNormalForm() throws IOException { | ||||||
| @@ -53,13 +52,14 @@ public class TestLuceneMorph { | |||||||
| 
 | 
 | ||||||
|     private void testMorphology(LuceneMorphology luceneMorph, String pathToTestData) throws IOException { |     private void testMorphology(LuceneMorphology luceneMorph, String pathToTestData) throws IOException { | ||||||
|         InputStream stream = this.getClass().getResourceAsStream(pathToTestData); |         InputStream stream = this.getClass().getResourceAsStream(pathToTestData); | ||||||
|         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)); |         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); | ||||||
|         String s = bufferedReader.readLine(); |         String s = bufferedReader.readLine(); | ||||||
|         while (s != null) { |         while (s != null) { | ||||||
|             String[] qa = s.trim().split(" "); |             String[] qa = s.trim().split(" "); | ||||||
|             Set<String> result = new HashSet<>(Arrays.asList(qa).subList(1, qa.length)); |             Set<String> result = new HashSet<String>(); | ||||||
|             Set<String> stringList = new HashSet<>(luceneMorph.getNormalForms(qa[0])); |             result.addAll(Arrays.asList(qa).subList(1, qa.length)); | ||||||
|             MatcherAssert.assertThat(stringList, equalTo(result)); |             Set<String> stringList = new HashSet<String>(luceneMorph.getNormalForms(qa[0])); | ||||||
|  |             assertThat(stringList, equalTo(result)); | ||||||
|             s = bufferedReader.readLine(); |             s = bufferedReader.readLine(); | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| @@ -3,20 +3,27 @@ | |||||||
|     <parent> |     <parent> | ||||||
|         <artifactId>morphology</artifactId> |         <artifactId>morphology</artifactId> | ||||||
|         <groupId>org.apache.lucene.morphology</groupId> |         <groupId>org.apache.lucene.morphology</groupId> | ||||||
|         <version>1.5</version> |         <version>1.2-SNAPSHOT</version> | ||||||
|     </parent> |     </parent> | ||||||
|     <modelVersion>4.0.0</modelVersion> |     <modelVersion>4.0.0</modelVersion> | ||||||
|  |     <groupId>org.apache.lucene.morphology</groupId> | ||||||
|     <artifactId>english</artifactId> |     <artifactId>english</artifactId> | ||||||
|     <name>english</name> |     <name>english</name> | ||||||
|     <version>1.5</version> |     <version>1.2-SNAPSHOT</version> | ||||||
|     <url>http://maven.apache.org</url> |     <url>http://maven.apache.org</url> | ||||||
|     <dependencies> |     <dependencies> | ||||||
|  |  | ||||||
|         <dependency> |         <dependency> | ||||||
|             <groupId>org.apache.lucene.morphology</groupId> |             <groupId>org.apache.lucene.morphology</groupId> | ||||||
|             <artifactId>morph</artifactId> |             <artifactId>morph</artifactId> | ||||||
|             <version>1.5</version> |             <version>1.2-SNAPSHOT</version> | ||||||
|         </dependency> |         </dependency> | ||||||
|  |  | ||||||
|  |         <dependency> | ||||||
|  |             <groupId>junit</groupId> | ||||||
|  |             <artifactId>junit</artifactId> | ||||||
|  |             <version>4.8.2</version> | ||||||
|  |             <scope>test</scope> | ||||||
|  |         </dependency> | ||||||
|     </dependencies> |     </dependencies> | ||||||
| </project> | </project> | ||||||
| @@ -32,7 +32,7 @@ public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder { | |||||||
|         if (string.length() > 6) throw new SuffixToLongException("Suffix length should not be greater then " + 12); |         if (string.length() > 6) throw new SuffixToLongException("Suffix length should not be greater then " + 12); | ||||||
|         int result = 0; |         int result = 0; | ||||||
|         for (int i = 0; i < string.length(); i++) { |         for (int i = 0; i < string.length(); i++) { | ||||||
|             int c = string.charAt(i) - ENGLISH_SMALL_LETTER_OFFSET; |             int c = 0 + string.charAt(i) - ENGLISH_SMALL_LETTER_OFFSET; | ||||||
|             if (c == 45 - ENGLISH_SMALL_LETTER_OFFSET) { |             if (c == 45 - ENGLISH_SMALL_LETTER_OFFSET) { | ||||||
|                 c = DASH_CODE; |                 c = DASH_CODE; | ||||||
|             } |             } | ||||||
| @@ -48,7 +48,7 @@ public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder { | |||||||
|  |  | ||||||
|     public int[] encodeToArray(String s) { |     public int[] encodeToArray(String s) { | ||||||
|  |  | ||||||
|         ArrayList<Integer> integers = new ArrayList<>(); |         ArrayList<Integer> integers = new ArrayList<Integer>(); | ||||||
|         while (s.length() > 6) { |         while (s.length() > 6) { | ||||||
|             integers.add(encode(s.substring(0, 6))); |             integers.add(encode(s.substring(0, 6))); | ||||||
|             s = s.substring(6); |             s = s.substring(6); | ||||||
| @@ -64,16 +64,16 @@ public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     public String decodeArray(int[] array) { |     public String decodeArray(int[] array) { | ||||||
|         StringBuilder result = new StringBuilder(); |         String result = ""; | ||||||
|         for (int i : array) { |         for (int i : array) { | ||||||
|             result.append(decode(i)); |             result += decode(i); | ||||||
|         } |         } | ||||||
|         return result.toString(); |         return result; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|     public String decode(Integer suffixN) { |     public String decode(Integer suffixN) { | ||||||
|         StringBuilder result = new StringBuilder(); |         String result = ""; | ||||||
|         while (suffixN > 27) { |         while (suffixN > 27) { | ||||||
|             int c = suffixN % 28 + ENGLISH_SMALL_LETTER_OFFSET; |             int c = suffixN % 28 + ENGLISH_SMALL_LETTER_OFFSET; | ||||||
|             if (c == ENGLISH_SMALL_LETTER_OFFSET) { |             if (c == ENGLISH_SMALL_LETTER_OFFSET) { | ||||||
| @@ -81,20 +81,21 @@ public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder { | |||||||
|                 continue; |                 continue; | ||||||
|             } |             } | ||||||
|             if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR; |             if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR; | ||||||
|             result.insert(0, (char) c); |             result = (char) c + result; | ||||||
|             suffixN /= 28; |             suffixN /= 28; | ||||||
|         } |         } | ||||||
|         long c = suffixN + ENGLISH_SMALL_LETTER_OFFSET; |         long c = suffixN + ENGLISH_SMALL_LETTER_OFFSET; | ||||||
|         if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR; |         if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR; | ||||||
|         result.insert(0, (char) c); |         result = (char) c + result; | ||||||
|         return result.toString(); |         return result; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     public boolean checkCharacter(char c) { |     public boolean checkCharacter(char c) { | ||||||
|         int code = c; |         int code = 0 + c; | ||||||
|         if (code == 45) return true; |         if (code == 45) return true; | ||||||
|         code -= ENGLISH_SMALL_LETTER_OFFSET; |         code -= ENGLISH_SMALL_LETTER_OFFSET; | ||||||
|         return code > 0 && code < 27; |         if (code > 0 && code < 27) return true; | ||||||
|  |         return false; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -16,8 +16,7 @@ | |||||||
| package org.apache.lucene.morphology.english; | package org.apache.lucene.morphology.english; | ||||||
|  |  | ||||||
| import static org.hamcrest.core.IsEqual.equalTo; | import static org.hamcrest.core.IsEqual.equalTo; | ||||||
|  | import static org.junit.Assert.assertThat; | ||||||
| import org.hamcrest.MatcherAssert; |  | ||||||
| import org.junit.Before; | import org.junit.Before; | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -31,11 +30,11 @@ public class EnglishLetterDecoderEncoderTest { | |||||||
|  |  | ||||||
|     @org.junit.Test |     @org.junit.Test | ||||||
|     public void testDecodeEncodeToArray() { |     public void testDecodeEncodeToArray() { | ||||||
|         MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("abcdefghijklmnopqrstuvwxyz")), equalTo("abcdefghijklmnopqrstuvwxyz")); |         assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("abcdefghijklmnopqrstuvwxyz")), equalTo("abcdefghijklmnopqrstuvwxyz")); | ||||||
|         MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("xyz")), equalTo("xyz")); |         assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("xyz")), equalTo("xyz")); | ||||||
|         MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrty")), equalTo("ytrrty")); |         assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrty")), equalTo("ytrrty")); | ||||||
|         MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrtyz")), equalTo("ytrrtyz")); |         assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrtyz")), equalTo("ytrrtyz")); | ||||||
|         MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrtyzqwqwe")), equalTo("ytrrtyzqwqwe")); |         assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrtyzqwqwe")), equalTo("ytrrtyzqwqwe")); | ||||||
|  |  | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -16,9 +16,9 @@ | |||||||
| package org.apache.lucene.morphology.english.stemmer; | package org.apache.lucene.morphology.english.stemmer; | ||||||
|  |  | ||||||
| import org.apache.lucene.morphology.english.EnglishLuceneMorphology; | import org.apache.lucene.morphology.english.EnglishLuceneMorphology; | ||||||
| import org.hamcrest.MatcherAssert; |  | ||||||
| import org.junit.Test; | import org.junit.Test; | ||||||
| import static org.hamcrest.core.IsEqual.equalTo; | import static org.hamcrest.core.IsEqual.equalTo; | ||||||
|  | import static org.junit.Assert.assertThat; | ||||||
|  |  | ||||||
|  |  | ||||||
| public class EnglishStemmerTest { | public class EnglishStemmerTest { | ||||||
| @@ -26,24 +26,24 @@ public class EnglishStemmerTest { | |||||||
|     public void testGetStemmedWord() throws Exception { |     public void testGetStemmedWord() throws Exception { | ||||||
|         EnglishLuceneMorphology englishLuceneMorphology = new EnglishLuceneMorphology(); |         EnglishLuceneMorphology englishLuceneMorphology = new EnglishLuceneMorphology(); | ||||||
|         EnglishStemmer englishStemmer = new EnglishStemmer(englishLuceneMorphology); |         EnglishStemmer englishStemmer = new EnglishStemmer(englishLuceneMorphology); | ||||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("running"),equalTo("run")); |         assertThat(englishStemmer.getStemmedWord("running"),equalTo("run")); | ||||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("run"),equalTo("run")); |         assertThat(englishStemmer.getStemmedWord("run"),equalTo("run")); | ||||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("killed"),equalTo("kill")); |         assertThat(englishStemmer.getStemmedWord("killed"),equalTo("kill")); | ||||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("kill"),equalTo("kill")); |         assertThat(englishStemmer.getStemmedWord("kill"),equalTo("kill")); | ||||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("networking"),equalTo("network")); |         assertThat(englishStemmer.getStemmedWord("networking"),equalTo("network")); | ||||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("network"),equalTo("network")); |         assertThat(englishStemmer.getStemmedWord("network"),equalTo("network")); | ||||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("statistics"),equalTo("statistic")); |         assertThat(englishStemmer.getStemmedWord("statistics"),equalTo("statistic")); | ||||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("statistic"),equalTo("statistic")); |         assertThat(englishStemmer.getStemmedWord("statistic"),equalTo("statistic")); | ||||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("stats"),equalTo("stat")); |         assertThat(englishStemmer.getStemmedWord("stats"),equalTo("stat")); | ||||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("stat"),equalTo("stat")); |         assertThat(englishStemmer.getStemmedWord("stat"),equalTo("stat")); | ||||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("countries"),equalTo("country")); |         assertThat(englishStemmer.getStemmedWord("countries"),equalTo("country")); | ||||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("country"),equalTo("country")); |         assertThat(englishStemmer.getStemmedWord("country"),equalTo("country")); | ||||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("delete"),equalTo("delete")); |         assertThat(englishStemmer.getStemmedWord("delete"),equalTo("delete")); | ||||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("ended"),equalTo("end")); |         assertThat(englishStemmer.getStemmedWord("ended"),equalTo("end")); | ||||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("end"),equalTo("end")); |         assertThat(englishStemmer.getStemmedWord("end"),equalTo("end")); | ||||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("ends"),equalTo("end")); |         assertThat(englishStemmer.getStemmedWord("ends"),equalTo("end")); | ||||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("given"),equalTo("give")); |         assertThat(englishStemmer.getStemmedWord("given"),equalTo("give")); | ||||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("give"),equalTo("give")); |         assertThat(englishStemmer.getStemmedWord("give"),equalTo("give")); | ||||||
|         MatcherAssert.assertThat(englishStemmer.getStemmedWord("log4j"),equalTo("log4j")); |         assertThat(englishStemmer.getStemmedWord("log4j"),equalTo("log4j")); | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -1,4 +1,4 @@ | |||||||
| Copyright 2009 Alexander Kuznetsov  | Copyright ${project.inceptionYear} ${owner} | ||||||
|  |  | ||||||
| Licensed under the Apache License, Version 2.0 (the "License"); | Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
| you may not use this file except in compliance with the License. | you may not use this file except in compliance with the License. | ||||||
|   | |||||||
| @@ -3,12 +3,13 @@ | |||||||
|     <parent> |     <parent> | ||||||
|         <artifactId>morphology</artifactId> |         <artifactId>morphology</artifactId> | ||||||
|         <groupId>org.apache.lucene.morphology</groupId> |         <groupId>org.apache.lucene.morphology</groupId> | ||||||
|         <version>1.5</version> |         <version>1.2-SNAPSHOT</version> | ||||||
|     </parent> |     </parent> | ||||||
|     <modelVersion>4.0.0</modelVersion> |     <modelVersion>4.0.0</modelVersion> | ||||||
|  |     <groupId>org.apache.lucene.morphology</groupId> | ||||||
|     <artifactId>morph</artifactId> |     <artifactId>morph</artifactId> | ||||||
|     <name>morph</name> |     <name>morph</name> | ||||||
|     <version>1.5</version> |     <version>1.2-SNAPSHOT</version> | ||||||
|     <url>http://maven.apache.org</url> |     <url>http://maven.apache.org</url> | ||||||
|  |  | ||||||
| </project> | </project> | ||||||
|   | |||||||
| @@ -21,7 +21,7 @@ import java.util.ArrayList; | |||||||
|  |  | ||||||
| public abstract class BaseLetterDecoderEncoder implements LetterDecoderEncoder { | public abstract class BaseLetterDecoderEncoder implements LetterDecoderEncoder { | ||||||
|     public int[] encodeToArray(String s) { |     public int[] encodeToArray(String s) { | ||||||
|         ArrayList<Integer> integers = new ArrayList<>(); |         ArrayList<Integer> integers = new ArrayList<Integer>(); | ||||||
|         while (s.length() > 6) { |         while (s.length() > 6) { | ||||||
|             integers.add(encode(s.substring(0, 6))); |             integers.add(encode(s.substring(0, 6))); | ||||||
|             s = s.substring(6); |             s = s.substring(6); | ||||||
| @@ -37,11 +37,11 @@ public abstract class BaseLetterDecoderEncoder implements LetterDecoderEncoder { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     public String decodeArray(int[] array) { |     public String decodeArray(int[] array) { | ||||||
|         StringBuilder result = new StringBuilder(); |         String result = ""; | ||||||
|         for (int i : array) { |         for (int i : array) { | ||||||
|             result.append(decode(i)); |             result += decode(i); | ||||||
|         } |         } | ||||||
|         return result.toString(); |         return result; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     public boolean checkString(String word) { |     public boolean checkString(String word) { | ||||||
|   | |||||||
| @@ -16,7 +16,6 @@ | |||||||
| package org.apache.lucene.morphology; | package org.apache.lucene.morphology; | ||||||
|  |  | ||||||
| import java.io.Serializable; | import java.io.Serializable; | ||||||
| import java.util.Objects; |  | ||||||
|  |  | ||||||
|  |  | ||||||
| public class Heuristic implements Serializable { | public class Heuristic implements Serializable { | ||||||
| @@ -27,10 +26,10 @@ public class Heuristic implements Serializable { | |||||||
|  |  | ||||||
|     public Heuristic(String s) { |     public Heuristic(String s) { | ||||||
|         String[] strings = s.split("\\|"); |         String[] strings = s.split("\\|"); | ||||||
|         actualSuffixLength = Byte.parseByte(strings[0]); |         actualSuffixLength = Byte.valueOf(strings[0]); | ||||||
|         actualNormalSuffix = strings[1]; |         actualNormalSuffix = strings[1]; | ||||||
|         formMorphInfo = Short.parseShort(strings[2]); |         formMorphInfo = Short.valueOf(strings[2]); | ||||||
|         normalFormMorphInfo = Short.parseShort(strings[3]); |         normalFormMorphInfo = Short.valueOf(strings[3]); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     public Heuristic(byte actualSuffixLength, String actualNormalSuffix, short formMorphInfo, short normalFormMorphInfo) { |     public Heuristic(byte actualSuffixLength, String actualNormalSuffix, short formMorphInfo, short normalFormMorphInfo) { | ||||||
| @@ -71,12 +70,15 @@ public class Heuristic implements Serializable { | |||||||
|         if (actualSuffixLength != heuristic.actualSuffixLength) return false; |         if (actualSuffixLength != heuristic.actualSuffixLength) return false; | ||||||
|         if (formMorphInfo != heuristic.formMorphInfo) return false; |         if (formMorphInfo != heuristic.formMorphInfo) return false; | ||||||
|         if (normalFormMorphInfo != heuristic.normalFormMorphInfo) return false; |         if (normalFormMorphInfo != heuristic.normalFormMorphInfo) return false; | ||||||
|         return Objects.equals(actualNormalSuffix, heuristic.actualNormalSuffix); |         if (actualNormalSuffix != null ? !actualNormalSuffix.equals(heuristic.actualNormalSuffix) : heuristic.actualNormalSuffix != null) | ||||||
|  |             return false; | ||||||
|  |  | ||||||
|  |         return true; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     @Override |     @Override | ||||||
|     public int hashCode() { |     public int hashCode() { | ||||||
|         int result = actualSuffixLength; |         int result = (int) actualSuffixLength; | ||||||
|         result = 31 * result + (actualNormalSuffix != null ? actualNormalSuffix.hashCode() : 0); |         result = 31 * result + (actualNormalSuffix != null ? actualNormalSuffix.hashCode() : 0); | ||||||
|         result = 31 * result + (int) formMorphInfo; |         result = 31 * result + (int) formMorphInfo; | ||||||
|         result = 31 * result + (int) normalFormMorphInfo; |         result = 31 * result + (int) normalFormMorphInfo; | ||||||
|   | |||||||
| @@ -17,17 +17,17 @@ package org.apache.lucene.morphology; | |||||||
|  |  | ||||||
|  |  | ||||||
| public interface LetterDecoderEncoder { | public interface LetterDecoderEncoder { | ||||||
|     Integer encode(String string); |     public Integer encode(String string); | ||||||
|  |  | ||||||
|     int[] encodeToArray(String s); |     public int[] encodeToArray(String s); | ||||||
|  |  | ||||||
|     String decodeArray(int[] array); |     public String decodeArray(int[] array); | ||||||
|  |  | ||||||
|     String decode(Integer suffixN); |     public String decode(Integer suffixN); | ||||||
|  |  | ||||||
|     boolean checkCharacter(char c); |     public boolean checkCharacter(char c); | ||||||
|  |  | ||||||
|     boolean checkString(String word); |     public boolean checkString(String word); | ||||||
|  |  | ||||||
|     String cleanString(String s); |     public String cleanString(String s); | ||||||
| } | } | ||||||
|   | |||||||
| @@ -34,13 +34,13 @@ public class LuceneMorphology extends MorphologyImpl { | |||||||
|  |  | ||||||
|     protected void readRules(BufferedReader bufferedReader) throws IOException { |     protected void readRules(BufferedReader bufferedReader) throws IOException { | ||||||
|         String s; |         String s; | ||||||
|         int amount; |         Integer amount; | ||||||
|         s = bufferedReader.readLine(); |         s = bufferedReader.readLine(); | ||||||
|         amount = Integer.parseInt(s); |         amount = Integer.valueOf(s); | ||||||
|         rules = new Heuristic[amount][]; |         rules = new Heuristic[amount][]; | ||||||
|         for (int i = 0; i < amount; i++) { |         for (int i = 0; i < amount; i++) { | ||||||
|             String s1 = bufferedReader.readLine(); |             String s1 = bufferedReader.readLine(); | ||||||
|             int ruleLenght = Integer.parseInt(s1); |             Integer ruleLenght = Integer.valueOf(s1); | ||||||
|             Heuristic[] heuristics = new Heuristic[ruleLenght]; |             Heuristic[] heuristics = new Heuristic[ruleLenght]; | ||||||
|             for (int j = 0; j < ruleLenght; j++) { |             for (int j = 0; j < ruleLenght; j++) { | ||||||
|                 heuristics[j] = new Heuristic(bufferedReader.readLine()); |                 heuristics[j] = new Heuristic(bufferedReader.readLine()); | ||||||
| @@ -51,7 +51,7 @@ public class LuceneMorphology extends MorphologyImpl { | |||||||
|  |  | ||||||
|  |  | ||||||
|     private Heuristic[] modeifyHeuristic(Heuristic[] heuristics) { |     private Heuristic[] modeifyHeuristic(Heuristic[] heuristics) { | ||||||
|         ArrayList<Heuristic> result = new ArrayList<>(); |         ArrayList<Heuristic> result = new ArrayList<Heuristic>(); | ||||||
|         for (Heuristic heuristic : heuristics) { |         for (Heuristic heuristic : heuristics) { | ||||||
|             boolean isAdded = true; |             boolean isAdded = true; | ||||||
|             for (Heuristic ch : result) { |             for (Heuristic ch : result) { | ||||||
| @@ -61,7 +61,7 @@ public class LuceneMorphology extends MorphologyImpl { | |||||||
|                 result.add(heuristic); |                 result.add(heuristic); | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|         return result.toArray(new Heuristic[0]); |         return result.toArray(new Heuristic[result.size()]); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     public boolean checkString(String s) { |     public boolean checkString(String s) { | ||||||
|   | |||||||
| @@ -17,7 +17,6 @@ package org.apache.lucene.morphology; | |||||||
|  |  | ||||||
|  |  | ||||||
| import java.io.*; | import java.io.*; | ||||||
| import java.nio.charset.StandardCharsets; |  | ||||||
| import java.util.ArrayList; | import java.util.ArrayList; | ||||||
| import java.util.List; | import java.util.List; | ||||||
|  |  | ||||||
| @@ -48,7 +47,7 @@ public class MorphologyImpl implements Morphology { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     public List<String> getNormalForms(String s) { |     public List<String> getNormalForms(String s) { | ||||||
|         ArrayList<String> result = new ArrayList<>(); |         ArrayList<String> result = new ArrayList<String>(); | ||||||
|         int[] ints = decoderEncoder.encodeToArray(revertWord(s)); |         int[] ints = decoderEncoder.encodeToArray(revertWord(s)); | ||||||
|         int ruleId = findRuleId(ints); |         int ruleId = findRuleId(ints); | ||||||
|         boolean notSeenEmptyString = true; |         boolean notSeenEmptyString = true; | ||||||
| @@ -65,7 +64,7 @@ public class MorphologyImpl implements Morphology { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     public List<String> getMorphInfo(String s) { |     public List<String> getMorphInfo(String s) { | ||||||
|         ArrayList<String> result = new ArrayList<>(); |         ArrayList<String> result = new ArrayList<String>(); | ||||||
|         int[] ints = decoderEncoder.encodeToArray(revertWord(s)); |         int[] ints = decoderEncoder.encodeToArray(revertWord(s)); | ||||||
|         int ruleId = findRuleId(ints); |         int ruleId = findRuleId(ints); | ||||||
|         for (Heuristic h : rules[rulesId[ruleId]]) { |         for (Heuristic h : rules[rulesId[ruleId]]) { | ||||||
| @@ -101,14 +100,14 @@ public class MorphologyImpl implements Morphology { | |||||||
|     private int compareToInts(int[] i1, int[] i2) { |     private int compareToInts(int[] i1, int[] i2) { | ||||||
|         int minLength = Math.min(i1.length, i2.length); |         int minLength = Math.min(i1.length, i2.length); | ||||||
|         for (int i = 0; i < minLength; i++) { |         for (int i = 0; i < minLength; i++) { | ||||||
|             int i3 = Integer.compare(i1[i], i2[i]); |             int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1); | ||||||
|             if (i3 != 0) return i3; |             if (i3 != 0) return i3; | ||||||
|         } |         } | ||||||
|         return i1.length - i2.length; |         return i1.length - i2.length; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     public void writeToFile(String fileName) throws IOException { |     public void writeToFile(String fileName) throws IOException { | ||||||
|         OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8); |         OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8"); | ||||||
|         writer.write(separators.length + "\n"); |         writer.write(separators.length + "\n"); | ||||||
|         for (int[] i : separators) { |         for (int[] i : separators) { | ||||||
|             writer.write(i.length + "\n"); |             writer.write(i.length + "\n"); | ||||||
| @@ -139,7 +138,7 @@ public class MorphologyImpl implements Morphology { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     private void readFromInputStream(InputStream inputStream) throws IOException { |     private void readFromInputStream(InputStream inputStream) throws IOException { | ||||||
|         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8)); |         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8")); | ||||||
|         String s = bufferedReader.readLine(); |         String s = bufferedReader.readLine(); | ||||||
|         Integer amount = Integer.valueOf(s); |         Integer amount = Integer.valueOf(s); | ||||||
|  |  | ||||||
| @@ -154,9 +153,9 @@ public class MorphologyImpl implements Morphology { | |||||||
|  |  | ||||||
|     private void readGrammaInfo(BufferedReader bufferedReader) throws IOException { |     private void readGrammaInfo(BufferedReader bufferedReader) throws IOException { | ||||||
|         String s; |         String s; | ||||||
|         int amount; |         Integer amount; | ||||||
|         s = bufferedReader.readLine(); |         s = bufferedReader.readLine(); | ||||||
|         amount = Integer.parseInt(s); |         amount = Integer.valueOf(s); | ||||||
|         grammarInfo = new String[amount]; |         grammarInfo = new String[amount]; | ||||||
|         for (int i = 0; i < amount; i++) { |         for (int i = 0; i < amount; i++) { | ||||||
|             grammarInfo[i] = bufferedReader.readLine(); |             grammarInfo[i] = bufferedReader.readLine(); | ||||||
| @@ -165,13 +164,13 @@ public class MorphologyImpl implements Morphology { | |||||||
|  |  | ||||||
|     protected void readRules(BufferedReader bufferedReader) throws IOException { |     protected void readRules(BufferedReader bufferedReader) throws IOException { | ||||||
|         String s; |         String s; | ||||||
|         int amount; |         Integer amount; | ||||||
|         s = bufferedReader.readLine(); |         s = bufferedReader.readLine(); | ||||||
|         amount = Integer.parseInt(s); |         amount = Integer.valueOf(s); | ||||||
|         rules = new Heuristic[amount][]; |         rules = new Heuristic[amount][]; | ||||||
|         for (int i = 0; i < amount; i++) { |         for (int i = 0; i < amount; i++) { | ||||||
|             String s1 = bufferedReader.readLine(); |             String s1 = bufferedReader.readLine(); | ||||||
|             int ruleLength = Integer.parseInt(s1); |             Integer ruleLength = Integer.valueOf(s1); | ||||||
|             rules[i] = new Heuristic[ruleLength]; |             rules[i] = new Heuristic[ruleLength]; | ||||||
|             for (int j = 0; j < ruleLength; j++) { |             for (int j = 0; j < ruleLength; j++) { | ||||||
|                 rules[i][j] = new Heuristic(bufferedReader.readLine()); |                 rules[i][j] = new Heuristic(bufferedReader.readLine()); | ||||||
| @@ -183,7 +182,7 @@ public class MorphologyImpl implements Morphology { | |||||||
|         rulesId = new short[amount]; |         rulesId = new short[amount]; | ||||||
|         for (int i = 0; i < amount; i++) { |         for (int i = 0; i < amount; i++) { | ||||||
|             String s1 = bufferedReader.readLine(); |             String s1 = bufferedReader.readLine(); | ||||||
|             rulesId[i] = Short.parseShort(s1); |             rulesId[i] = Short.valueOf(s1); | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -191,10 +190,10 @@ public class MorphologyImpl implements Morphology { | |||||||
|         separators = new int[amount][]; |         separators = new int[amount][]; | ||||||
|         for (int i = 0; i < amount; i++) { |         for (int i = 0; i < amount; i++) { | ||||||
|             String s1 = bufferedReader.readLine(); |             String s1 = bufferedReader.readLine(); | ||||||
|             int wordLenght = Integer.parseInt(s1); |             Integer wordLenght = Integer.valueOf(s1); | ||||||
|             separators[i] = new int[wordLenght]; |             separators[i] = new int[wordLenght]; | ||||||
|             for (int j = 0; j < wordLenght; j++) { |             for (int j = 0; j < wordLenght; j++) { | ||||||
|                 separators[i][j] = Integer.parseInt(bufferedReader.readLine()); |                 separators[i][j] = Integer.valueOf(bufferedReader.readLine()); | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -17,10 +17,11 @@ | |||||||
| package org.apache.lucene.morphology.analyzer; | package org.apache.lucene.morphology.analyzer; | ||||||
|  |  | ||||||
| import org.apache.lucene.analysis.Analyzer; | import org.apache.lucene.analysis.Analyzer; | ||||||
| import org.apache.lucene.analysis.LowerCaseFilter; |  | ||||||
| import org.apache.lucene.analysis.TokenFilter; | import org.apache.lucene.analysis.TokenFilter; | ||||||
|  | import org.apache.lucene.analysis.core.LowerCaseFilter; | ||||||
| import org.apache.lucene.analysis.payloads.PayloadEncoder; | import org.apache.lucene.analysis.payloads.PayloadEncoder; | ||||||
| import org.apache.lucene.analysis.payloads.PayloadHelper; | import org.apache.lucene.analysis.payloads.PayloadHelper; | ||||||
|  | import org.apache.lucene.analysis.standard.StandardFilter; | ||||||
| import org.apache.lucene.analysis.standard.StandardTokenizer; | import org.apache.lucene.analysis.standard.StandardTokenizer; | ||||||
| import org.apache.lucene.morphology.LetterDecoderEncoder; | import org.apache.lucene.morphology.LetterDecoderEncoder; | ||||||
| import org.apache.lucene.morphology.LuceneMorphology; | import org.apache.lucene.morphology.LuceneMorphology; | ||||||
| @@ -28,7 +29,7 @@ import org.apache.lucene.util.BytesRef; | |||||||
|  |  | ||||||
| import java.io.IOException; | import java.io.IOException; | ||||||
| import java.io.InputStream; | import java.io.InputStream; | ||||||
|  | import java.io.Reader; | ||||||
|  |  | ||||||
| public class MorphologyAnalyzer extends Analyzer { | public class MorphologyAnalyzer extends Analyzer { | ||||||
|     private LuceneMorphology luceneMorph; |     private LuceneMorphology luceneMorph; | ||||||
| @@ -50,29 +51,17 @@ public class MorphologyAnalyzer extends Analyzer { | |||||||
|     protected TokenStreamComponents createComponents(String s) { |     protected TokenStreamComponents createComponents(String s) { | ||||||
|  |  | ||||||
|         StandardTokenizer src = new StandardTokenizer(); |         StandardTokenizer src = new StandardTokenizer(); | ||||||
|         final PayloadEncoder encoder = new PayloadEncoder() { |         TokenFilter filter = new StandardFilter(src); | ||||||
|             @Override |         filter = new LowerCaseFilter(filter); | ||||||
|             public BytesRef encode(char[] buffer) { |  | ||||||
|                 final Float payload = Float.valueOf(new String(buffer)); |  | ||||||
|                 System.out.println(payload); |  | ||||||
|                 final byte[] bytes = PayloadHelper.encodeFloat(payload); |  | ||||||
|                 return new BytesRef(bytes, 0, bytes.length); |  | ||||||
|             } |  | ||||||
|  |  | ||||||
|             @Override |  | ||||||
|             public BytesRef encode(char[] buffer, int offset, int length) { |  | ||||||
|  |  | ||||||
|                 final Float payload = Float.valueOf(new String(buffer, offset, length)); |  | ||||||
|                 System.out.println(payload); |  | ||||||
|                 final byte[] bytes = PayloadHelper.encodeFloat(payload); |  | ||||||
|  |  | ||||||
|                 return new BytesRef(bytes, 0, bytes.length); |  | ||||||
|             } |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         TokenFilter filter = new LowerCaseFilter(src); |  | ||||||
|         filter = new MorphologyFilter(filter, luceneMorph); |         filter = new MorphologyFilter(filter, luceneMorph); | ||||||
|  |  | ||||||
|         return new TokenStreamComponents(src::setReader, filter); |         return new TokenStreamComponents(src, filter) { | ||||||
|  |             @Override | ||||||
|  |             protected void setReader(final Reader reader) throws IOException { | ||||||
|  |                 super.setReader(reader); | ||||||
|             } |             } | ||||||
|  |         }; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |  | ||||||
| } | } | ||||||
|   | |||||||
| @@ -19,22 +19,18 @@ package org.apache.lucene.morphology.analyzer; | |||||||
| import org.apache.lucene.analysis.TokenFilter; | import org.apache.lucene.analysis.TokenFilter; | ||||||
| import org.apache.lucene.analysis.TokenStream; | import org.apache.lucene.analysis.TokenStream; | ||||||
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||||||
| import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; |  | ||||||
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | ||||||
| import org.apache.lucene.morphology.LuceneMorphology; | import org.apache.lucene.morphology.LuceneMorphology; | ||||||
|  |  | ||||||
| import java.io.IOException; | import java.io.IOException; | ||||||
| import java.util.Iterator; | import java.util.Iterator; | ||||||
| import java.util.List; |  | ||||||
|  |  | ||||||
|  |  | ||||||
| public class MorphologyFilter extends TokenFilter { | public class MorphologyFilter extends TokenFilter { | ||||||
|     private LuceneMorphology luceneMorph; |     private LuceneMorphology luceneMorph; | ||||||
|     private Iterator<String> iterator; |     private Iterator<String> iterator; | ||||||
|     private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |     private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); | ||||||
|     private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); |  | ||||||
|     private final PositionIncrementAttribute position = addAttribute(PositionIncrementAttribute.class); |     private final PositionIncrementAttribute position = addAttribute(PositionIncrementAttribute.class); | ||||||
|     private State state = null; |  | ||||||
|  |  | ||||||
|     public MorphologyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) { |     public MorphologyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) { | ||||||
|         super(tokenStream); |         super(tokenStream); | ||||||
| @@ -43,45 +39,27 @@ public class MorphologyFilter extends TokenFilter { | |||||||
|  |  | ||||||
|  |  | ||||||
|     final public boolean incrementToken() throws IOException { |     final public boolean incrementToken() throws IOException { | ||||||
|         if (iterator != null) { |         boolean oldToken = true; | ||||||
|             if (iterator.hasNext()) { |         while (iterator == null || !iterator.hasNext()) { | ||||||
|                 restoreState(state); |  | ||||||
|                 position.setPositionIncrement(0); |  | ||||||
|                 termAtt.setEmpty().append(iterator.next()); |  | ||||||
|                 return true; |  | ||||||
|             } else { |  | ||||||
|                 state = null; |  | ||||||
|                 iterator = null; |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|         while (true) { |  | ||||||
|             boolean b = input.incrementToken(); |             boolean b = input.incrementToken(); | ||||||
|             if (!b) { |             if (!b) { | ||||||
|                 return false; |                 return false; | ||||||
|             } |             } | ||||||
|             if (!keywordAttr.isKeyword() && termAtt.length() > 0) { |  | ||||||
|             String s = new String(termAtt.buffer(), 0, termAtt.length()); |             String s = new String(termAtt.buffer(), 0, termAtt.length()); | ||||||
|             if (luceneMorph.checkString(s)) { |             if (luceneMorph.checkString(s)) { | ||||||
|                     List<String> forms = luceneMorph.getNormalForms(s); |                 oldToken = false; | ||||||
|                     if (forms.isEmpty()) { |                 iterator = luceneMorph.getNormalForms(s).iterator(); | ||||||
|                         continue; |  | ||||||
|                     } else if (forms.size() == 1) { |  | ||||||
|                         termAtt.setEmpty().append(forms.get(0)); |  | ||||||
|             } else { |             } else { | ||||||
|                         state = captureState(); |  | ||||||
|                         iterator = forms.iterator(); |  | ||||||
|                         termAtt.setEmpty().append(iterator.next()); |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|             } |  | ||||||
|                 return true; |                 return true; | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|  |         String s = iterator.next(); | ||||||
|     @Override |         termAtt.setEmpty(); | ||||||
|     public void reset() throws IOException { |         termAtt.append(s); | ||||||
|         super.reset(); |         if (oldToken) { | ||||||
|         state = null; |             position.setPositionIncrement(0); | ||||||
|         iterator = null; |  | ||||||
|         } |         } | ||||||
|  |         return true; | ||||||
|  |     } | ||||||
|  |  | ||||||
| } | } | ||||||
|   | |||||||
							
								
								
									
										137
									
								
								pom.xml
									
									
									
									
									
								
							
							
						
						
									
										137
									
								
								pom.xml
									
									
									
									
									
								
							| @@ -1,10 +1,11 @@ | |||||||
| <?xml version="1.0" encoding="UTF-8"?> | <?xml version="1.0" encoding="UTF-8"?> | ||||||
| <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||||||
|  |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> | ||||||
|     <modelVersion>4.0.0</modelVersion> |     <modelVersion>4.0.0</modelVersion> | ||||||
|     <groupId>org.apache.lucene.morphology</groupId> |     <groupId>org.apache.lucene.morphology</groupId> | ||||||
|     <artifactId>morphology</artifactId> |     <artifactId>morphology</artifactId> | ||||||
|     <packaging>pom</packaging> |     <packaging>pom</packaging> | ||||||
|     <version>1.5</version> |     <version>1.2-SNAPSHOT</version> | ||||||
|     <name>morphology</name> |     <name>morphology</name> | ||||||
|     <url>http://maven.apache.org</url> |     <url>http://maven.apache.org</url> | ||||||
|  |  | ||||||
| @@ -15,45 +16,35 @@ | |||||||
|         <tag>HEAD</tag> |         <tag>HEAD</tag> | ||||||
|     </scm> |     </scm> | ||||||
|  |  | ||||||
|     <properties> |     <distributionManagement> | ||||||
|         <lucene.version>9.3.0</lucene.version> |         <repository> | ||||||
|         <morphology.version>1.5</morphology.version> |             <id>bintray</id> | ||||||
|         <junit.version>4.13</junit.version> |             <url>https://api.bintray.com/maven/akuznetsov/russianmorphology/morphology</url> | ||||||
|         <maven.compiler.source>21</maven.compiler.source> |         </repository> | ||||||
|         <maven.compiler.target>21</maven.compiler.target> |     </distributionManagement> | ||||||
|         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> |  | ||||||
|     </properties> |  | ||||||
|  |  | ||||||
|     <licenses> |  | ||||||
|         <license> |  | ||||||
|             <name>Apache License, Version 2.0</name> |  | ||||||
|             <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url> |  | ||||||
|             <distribution>repo</distribution> |  | ||||||
|         </license> |  | ||||||
|     </licenses> |  | ||||||
|  |  | ||||||
|     <dependencies> |     <dependencies> | ||||||
|         <dependency> |         <dependency> | ||||||
|             <groupId>org.apache.lucene</groupId> |             <groupId>junit</groupId> | ||||||
|             <artifactId>lucene-test-framework</artifactId> |             <artifactId>junit</artifactId> | ||||||
|             <version>${lucene.version}</version> |             <version>4.8.2</version> | ||||||
|             <scope>test</scope> |             <scope>test</scope> | ||||||
|         </dependency> |         </dependency> | ||||||
|         <dependency> |         <dependency> | ||||||
|             <groupId>org.hamcrest</groupId> |             <groupId>org.hamcrest</groupId> | ||||||
|             <artifactId>hamcrest-all</artifactId> |             <artifactId>hamcrest-all</artifactId> | ||||||
|             <version>1.3</version> |             <version>1.1</version> | ||||||
|             <scope>test</scope> |             <scope>test</scope> | ||||||
|         </dependency> |         </dependency> | ||||||
|         <dependency> |         <dependency> | ||||||
|             <groupId>org.apache.lucene</groupId> |             <groupId>org.apache.lucene</groupId> | ||||||
|             <artifactId>lucene-core</artifactId> |             <artifactId>lucene-core</artifactId> | ||||||
|             <version>${lucene.version}</version> |             <version>5.1.0</version> | ||||||
|         </dependency> |         </dependency> | ||||||
|         <dependency> |         <dependency> | ||||||
|             <groupId>org.apache.lucene</groupId> |             <groupId>org.apache.lucene</groupId> | ||||||
|             <artifactId>lucene-analysis-common</artifactId> |             <artifactId>lucene-analyzers-common</artifactId> | ||||||
|             <version>${lucene.version}</version> |             <version>5.1.0</version> | ||||||
|         </dependency> |         </dependency> | ||||||
|     </dependencies> |     </dependencies> | ||||||
|  |  | ||||||
| @@ -61,11 +52,11 @@ | |||||||
|         <repository> |         <repository> | ||||||
|             <id>maven2-repository.dev.java.net</id> |             <id>maven2-repository.dev.java.net</id> | ||||||
|             <name>Java.net Repository for Maven</name> |             <name>Java.net Repository for Maven</name> | ||||||
|             <url>https://download.java.net/maven/2/</url> |             <url>http://download.java.net/maven/2/</url> | ||||||
|         </repository> |         </repository> | ||||||
|         <repository> |         <repository> | ||||||
|             <id>bintray</id> |             <id>bintray</id> | ||||||
|             <url>https://dl.bintray.com/akuznetsov/russianmorphology</url> |             <url>http://dl.bintray.com/akuznetsov/russianmorphology</url> | ||||||
|             <releases> |             <releases> | ||||||
|                 <enabled>true</enabled> |                 <enabled>true</enabled> | ||||||
|             </releases> |             </releases> | ||||||
| @@ -75,24 +66,12 @@ | |||||||
|         </repository> |         </repository> | ||||||
|     </repositories> |     </repositories> | ||||||
|  |  | ||||||
|     <pluginRepositories> |  | ||||||
|         <pluginRepository> |  | ||||||
|             <id>mc-release</id> |  | ||||||
|             <name>maven-license-plugin repository of releases</name> |  | ||||||
|             <url>https://mc-repo.googlecode.com/svn/maven2/releases</url> |  | ||||||
|             <snapshots> |  | ||||||
|                 <enabled>false</enabled> |  | ||||||
|             </snapshots> |  | ||||||
|             <releases> |  | ||||||
|                 <enabled>true</enabled> |  | ||||||
|             </releases> |  | ||||||
|         </pluginRepository> |  | ||||||
|     </pluginRepositories> |  | ||||||
|     <build> |     <build> | ||||||
|         <plugins> |         <plugins> | ||||||
|             <plugin> |             <plugin> | ||||||
|                 <artifactId>maven-release-plugin</artifactId> |                 <artifactId>maven-release-plugin</artifactId> | ||||||
|                 <version>2.5.3</version> |                 <version>2.5.2</version> | ||||||
|                 <configuration> |                 <configuration> | ||||||
|                     <useReleaseProfile>false</useReleaseProfile> |                     <useReleaseProfile>false</useReleaseProfile> | ||||||
|                     <releaseProfiles>release</releaseProfiles> |                     <releaseProfiles>release</releaseProfiles> | ||||||
| @@ -103,37 +82,42 @@ | |||||||
|             <plugin> |             <plugin> | ||||||
|                 <groupId>org.apache.maven.plugins</groupId> |                 <groupId>org.apache.maven.plugins</groupId> | ||||||
|                 <artifactId>maven-compiler-plugin</artifactId> |                 <artifactId>maven-compiler-plugin</artifactId> | ||||||
|                 <version>3.8.1</version> |  | ||||||
|                 <configuration> |                 <configuration> | ||||||
|                     <source>11</source> |                     <source>1.7</source> | ||||||
|                     <target>11</target> |                     <target>1.7</target> | ||||||
|                 </configuration> |                 </configuration> | ||||||
|             </plugin> |             </plugin> | ||||||
|             <plugin>                <!--                 usage: http://code.google.com/p/maven-license-plugin/wiki/HowTo                --> |             <!--<plugin>                <!–                 usage: http://code.google.com/p/maven-license-plugin/wiki/HowTo                –>--> | ||||||
|                 <artifactId>maven-license-plugin</artifactId> |  | ||||||
|                 <groupId>com.google.code.maven-license-plugin</groupId> |                 <!--<groupId>com.mycila</groupId>--> | ||||||
|                 <version>1.4.0</version> |                 <!--<artifactId>license-maven-plugin</artifactId>--> | ||||||
|                 <configuration> |                 <!--<version>2.11</version>--> | ||||||
|                     <basedir>${project.parent.basedir}</basedir> |  | ||||||
|                     <header>etc/header.txt</header> |                 <!--<configuration>--> | ||||||
|                     <excludes> |                     <!--<properties>--> | ||||||
|                         <exclude>**/*.txt</exclude> |                         <!--<owner>Alexander Kuznetsov</owner>--> | ||||||
|                         <exclude>**/*.info</exclude> |                         <!--<!–<email>mathieu.carbou@gmail.com</email>–>--> | ||||||
|                         <exclude>**/pom.xml</exclude> |                     <!--</properties>--> | ||||||
|                     </excludes> |                     <!--<basedir>${project.parent.basedir}</basedir>--> | ||||||
|                     <includes> |                     <!--<header>etc/header.txt</header>--> | ||||||
|                         <include>**/src/**</include> |                     <!--<excludes>--> | ||||||
|                     </includes> |                         <!--<exclude>**/*.txt</exclude>--> | ||||||
|                 </configuration> |                         <!--<exclude>**/*.info</exclude>--> | ||||||
|                 <executions> |                         <!--<exclude>**/pom.xml</exclude>--> | ||||||
|                     <execution> |                     <!--</excludes>--> | ||||||
|                         <phase>test</phase> |                     <!--<includes>--> | ||||||
|                         <goals> |                         <!--<include>**/src/**</include>--> | ||||||
|                             <goal>check</goal> |                     <!--</includes>--> | ||||||
|                         </goals> |                 <!--</configuration>--> | ||||||
|                     </execution> |                 <!--<executions>--> | ||||||
|                 </executions> |                     <!--<execution>--> | ||||||
|             </plugin> |                         <!--<phase>test</phase>--> | ||||||
|  |                         <!--<goals>--> | ||||||
|  |                             <!--<goal>check</goal>--> | ||||||
|  |                         <!--</goals>--> | ||||||
|  |                     <!--</execution>--> | ||||||
|  |                 <!--</executions>--> | ||||||
|  |             <!--</plugin>--> | ||||||
|         </plugins> |         </plugins> | ||||||
|     </build> |     </build> | ||||||
|     <profiles> |     <profiles> | ||||||
| @@ -143,7 +127,6 @@ | |||||||
|                 <plugins> |                 <plugins> | ||||||
|                     <plugin> |                     <plugin> | ||||||
|                         <artifactId>maven-source-plugin</artifactId> |                         <artifactId>maven-source-plugin</artifactId> | ||||||
|                         <version>3.2.1</version> |  | ||||||
|                         <executions> |                         <executions> | ||||||
|                             <execution> |                             <execution> | ||||||
|                                 <id>attach-sources</id> |                                 <id>attach-sources</id> | ||||||
| @@ -155,7 +138,6 @@ | |||||||
|                     </plugin> |                     </plugin> | ||||||
|                     <plugin> |                     <plugin> | ||||||
|                         <artifactId>maven-javadoc-plugin</artifactId> |                         <artifactId>maven-javadoc-plugin</artifactId> | ||||||
|                         <version>3.3.1</version> |  | ||||||
|                         <executions> |                         <executions> | ||||||
|                             <execution> |                             <execution> | ||||||
|                                 <id>attach-javadocs</id> |                                 <id>attach-javadocs</id> | ||||||
| @@ -174,17 +156,6 @@ | |||||||
|         <module>dictionary-reader</module> |         <module>dictionary-reader</module> | ||||||
|         <module>russian</module> |         <module>russian</module> | ||||||
|         <module>english</module> |         <module>english</module> | ||||||
|         <module>solr-morphology-analysis</module> |         <module>context</module> | ||||||
|     </modules> |     </modules> | ||||||
|  |  | ||||||
|     <distributionManagement> |  | ||||||
|         <repository> |  | ||||||
|             <id>gitea</id> |  | ||||||
|             <url>https://git.kuksa.dev/api/packages/edkuksa/maven</url> |  | ||||||
|         </repository> |  | ||||||
|         <snapshotRepository> |  | ||||||
|             <id>gitea</id> |  | ||||||
|             <url>https://git.kuksa.dev/api/packages/edkuksa/maven</url> |  | ||||||
|         </snapshotRepository> |  | ||||||
|     </distributionManagement> |  | ||||||
| </project> | </project> | ||||||
| @@ -3,12 +3,13 @@ | |||||||
|     <parent> |     <parent> | ||||||
|         <artifactId>morphology</artifactId> |         <artifactId>morphology</artifactId> | ||||||
|         <groupId>org.apache.lucene.morphology</groupId> |         <groupId>org.apache.lucene.morphology</groupId> | ||||||
|         <version>1.5</version> |         <version>1.2-SNAPSHOT</version> | ||||||
|     </parent> |     </parent> | ||||||
|     <modelVersion>4.0.0</modelVersion> |     <modelVersion>4.0.0</modelVersion> | ||||||
|  |     <groupId>org.apache.lucene.morphology</groupId> | ||||||
|     <artifactId>russian</artifactId> |     <artifactId>russian</artifactId> | ||||||
|     <name>russian</name> |     <name>russian</name> | ||||||
|     <version>1.5</version> |     <version>1.2-SNAPSHOT</version> | ||||||
|     <url>http://maven.apache.org</url> |     <url>http://maven.apache.org</url> | ||||||
|     <dependencies> |     <dependencies> | ||||||
|  |  | ||||||
| @@ -16,13 +17,13 @@ | |||||||
|         <dependency> |         <dependency> | ||||||
|             <groupId>org.apache.lucene.morphology</groupId> |             <groupId>org.apache.lucene.morphology</groupId> | ||||||
|             <artifactId>morph</artifactId> |             <artifactId>morph</artifactId> | ||||||
|             <version>1.5</version> |             <version>1.2-SNAPSHOT</version> | ||||||
|         </dependency> |         </dependency> | ||||||
|  |  | ||||||
|         <dependency> |         <dependency> | ||||||
|             <groupId>junit</groupId> |             <groupId>junit</groupId> | ||||||
|             <artifactId>junit</artifactId> |             <artifactId>junit</artifactId> | ||||||
|             <version>${junit.version}</version> |             <version>4.8.2</version> | ||||||
|             <scope>test</scope> |             <scope>test</scope> | ||||||
|         </dependency> |         </dependency> | ||||||
|  |  | ||||||
|   | |||||||
| @@ -20,6 +20,7 @@ import org.apache.lucene.morphology.LetterDecoderEncoder; | |||||||
| import org.apache.lucene.morphology.SuffixToLongException; | import org.apache.lucene.morphology.SuffixToLongException; | ||||||
| import org.apache.lucene.morphology.WrongCharaterException; | import org.apache.lucene.morphology.WrongCharaterException; | ||||||
|  |  | ||||||
|  | import java.util.ArrayList; | ||||||
| import java.util.LinkedList; | import java.util.LinkedList; | ||||||
|  |  | ||||||
| /** | /** | ||||||
| @@ -41,7 +42,7 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder { | |||||||
|             throw new SuffixToLongException("Suffix length should not be greater then " + WORD_PART_LENGHT + " " + string); |             throw new SuffixToLongException("Suffix length should not be greater then " + WORD_PART_LENGHT + " " + string); | ||||||
|         int result = 0; |         int result = 0; | ||||||
|         for (int i = 0; i < string.length(); i++) { |         for (int i = 0; i < string.length(); i++) { | ||||||
|             int c = string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET; |             int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET; | ||||||
|             if (c == 45 - RUSSIAN_SMALL_LETTER_OFFSET) { |             if (c == 45 - RUSSIAN_SMALL_LETTER_OFFSET) { | ||||||
|                 c = DASH_CODE; |                 c = DASH_CODE; | ||||||
|             } |             } | ||||||
| @@ -57,7 +58,7 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     public int[] encodeToArray(String s) { |     public int[] encodeToArray(String s) { | ||||||
|         LinkedList<Integer> integers = new LinkedList<>(); |         LinkedList<Integer> integers = new LinkedList<Integer>(); | ||||||
|         while (s.length() > WORD_PART_LENGHT) { |         while (s.length() > WORD_PART_LENGHT) { | ||||||
|             integers.add(encode(s.substring(0, WORD_PART_LENGHT))); |             integers.add(encode(s.substring(0, WORD_PART_LENGHT))); | ||||||
|             s = s.substring(WORD_PART_LENGHT); |             s = s.substring(WORD_PART_LENGHT); | ||||||
| @@ -73,16 +74,16 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     public String decodeArray(int[] array) { |     public String decodeArray(int[] array) { | ||||||
|         StringBuilder result = new StringBuilder(); |         String result = ""; | ||||||
|         for (int i : array) { |         for (int i : array) { | ||||||
|             result.append(decode(i)); |             result += decode(i); | ||||||
|         } |         } | ||||||
|         return result.toString(); |         return result; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|     public String decode(Integer suffixN) { |     public String decode(Integer suffixN) { | ||||||
|         StringBuilder result = new StringBuilder(); |         String result = ""; | ||||||
|         while (suffixN > 33) { |         while (suffixN > 33) { | ||||||
|             int c = suffixN % 34 + RUSSIAN_SMALL_LETTER_OFFSET; |             int c = suffixN % 34 + RUSSIAN_SMALL_LETTER_OFFSET; | ||||||
|             if (c == RUSSIAN_SMALL_LETTER_OFFSET) { |             if (c == RUSSIAN_SMALL_LETTER_OFFSET) { | ||||||
| @@ -90,20 +91,21 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder { | |||||||
|                 continue; |                 continue; | ||||||
|             } |             } | ||||||
|             if (c == DASH_CODE + RUSSIAN_SMALL_LETTER_OFFSET) c = DASH_CHAR; |             if (c == DASH_CODE + RUSSIAN_SMALL_LETTER_OFFSET) c = DASH_CHAR; | ||||||
|             result.insert(0, (char) c); |             result = (char) c + result; | ||||||
|             suffixN /= 34; |             suffixN /= 34; | ||||||
|         } |         } | ||||||
|         long c = suffixN + RUSSIAN_SMALL_LETTER_OFFSET; |         long c = suffixN + RUSSIAN_SMALL_LETTER_OFFSET; | ||||||
|         if (c == DASH_CODE + RUSSIAN_SMALL_LETTER_OFFSET) c = DASH_CHAR; |         if (c == DASH_CODE + RUSSIAN_SMALL_LETTER_OFFSET) c = DASH_CHAR; | ||||||
|         result.insert(0, (char) c); |         result = (char) c + result; | ||||||
|         return result.toString(); |         return result; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     public boolean checkCharacter(char c) { |     public boolean checkCharacter(char c) { | ||||||
|         int code = c; |         int code = 0 + c; | ||||||
|         if (code == 45) return true; |         if (code == 45) return true; | ||||||
|         code -= RUSSIAN_SMALL_LETTER_OFFSET; |         code -= RUSSIAN_SMALL_LETTER_OFFSET; | ||||||
|         return code > 0 && code < 33; |         if (code > 0 && code < 33) return true; | ||||||
|  |         return false; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     public boolean checkString(String word) { |     public boolean checkString(String word) { | ||||||
|   | |||||||
| @@ -17,7 +17,6 @@ package org.apache.lucene.morphology.russian; | |||||||
|  |  | ||||||
| import org.apache.lucene.morphology.SuffixToLongException; | import org.apache.lucene.morphology.SuffixToLongException; | ||||||
| import org.apache.lucene.morphology.WrongCharaterException; | import org.apache.lucene.morphology.WrongCharaterException; | ||||||
| import org.hamcrest.MatcherAssert; |  | ||||||
| import org.junit.Before; | import org.junit.Before; | ||||||
| import org.junit.Test; | import org.junit.Test; | ||||||
|  |  | ||||||
| @@ -25,9 +24,9 @@ import java.io.BufferedReader; | |||||||
| import java.io.IOException; | import java.io.IOException; | ||||||
| import java.io.InputStream; | import java.io.InputStream; | ||||||
| import java.io.InputStreamReader; | import java.io.InputStreamReader; | ||||||
| import java.nio.charset.StandardCharsets; |  | ||||||
|  |  | ||||||
| import static org.hamcrest.core.IsEqual.equalTo; | import static org.hamcrest.core.IsEqual.equalTo; | ||||||
|  | import static org.junit.Assert.assertThat; | ||||||
|  |  | ||||||
| public class RussianLetterDecoderEncoderTest { | public class RussianLetterDecoderEncoderTest { | ||||||
|     private RussianLetterDecoderEncoder decoderEncoder; |     private RussianLetterDecoderEncoder decoderEncoder; | ||||||
| @@ -41,12 +40,12 @@ public class RussianLetterDecoderEncoderTest { | |||||||
|     @Test |     @Test | ||||||
|     public void testShouldPreserverStringComporision() throws IOException { |     public void testShouldPreserverStringComporision() throws IOException { | ||||||
|         InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-monotonic.txt"); |         InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-monotonic.txt"); | ||||||
|         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)); |         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); | ||||||
|         String s = bufferedReader.readLine(); |         String s = bufferedReader.readLine(); | ||||||
|         while (s != null) { |         while (s != null) { | ||||||
|             String[] qa = s.trim().split(" "); |             String[] qa = s.trim().split(" "); | ||||||
|             if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT && qa[1].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) { |             if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT && qa[1].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) { | ||||||
|                 MatcherAssert.assertThat(decoderEncoder.encode(qa[1]) > decoderEncoder.encode(qa[0]), equalTo(true)); |                 assertThat(decoderEncoder.encode(qa[1]) > decoderEncoder.encode(qa[0]), equalTo(true)); | ||||||
|             } |             } | ||||||
|             s = bufferedReader.readLine(); |             s = bufferedReader.readLine(); | ||||||
|         } |         } | ||||||
| @@ -56,13 +55,13 @@ public class RussianLetterDecoderEncoderTest { | |||||||
|     @Test |     @Test | ||||||
|     public void testShouldCorrectDecodeEncode() throws IOException { |     public void testShouldCorrectDecodeEncode() throws IOException { | ||||||
|         InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data.txt"); |         InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data.txt"); | ||||||
|         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)); |         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); | ||||||
|         String s = bufferedReader.readLine(); |         String s = bufferedReader.readLine(); | ||||||
|         while (s != null) { |         while (s != null) { | ||||||
|             String[] qa = s.trim().split(" "); |             String[] qa = s.trim().split(" "); | ||||||
|             if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) { |             if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) { | ||||||
|                 Integer encodedSuffix = decoderEncoder.encode(qa[0]); |                 Integer encodedSuffix = decoderEncoder.encode(qa[0]); | ||||||
|                 MatcherAssert.assertThat(decoderEncoder.decode(encodedSuffix), equalTo(qa[1])); |                 assertThat(decoderEncoder.decode(encodedSuffix), equalTo(qa[1])); | ||||||
|             } |             } | ||||||
|             s = bufferedReader.readLine(); |             s = bufferedReader.readLine(); | ||||||
|         } |         } | ||||||
| @@ -71,12 +70,12 @@ public class RussianLetterDecoderEncoderTest { | |||||||
|     @Test |     @Test | ||||||
|     public void testShouldCorrectDecodeEncodeStringToArray() throws IOException { |     public void testShouldCorrectDecodeEncodeStringToArray() throws IOException { | ||||||
|         InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data-for-array.txt"); |         InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data-for-array.txt"); | ||||||
|         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)); |         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); | ||||||
|         String s = bufferedReader.readLine(); |         String s = bufferedReader.readLine(); | ||||||
|         while (s != null) { |         while (s != null) { | ||||||
|             String[] qa = s.trim().split(" "); |             String[] qa = s.trim().split(" "); | ||||||
|             int[] ecodedSuffix = decoderEncoder.encodeToArray(qa[0]); |             int[] ecodedSuffix = decoderEncoder.encodeToArray(qa[0]); | ||||||
|             MatcherAssert.assertThat(decoderEncoder.decodeArray(ecodedSuffix), equalTo(qa[1])); |             assertThat(decoderEncoder.decodeArray(ecodedSuffix), equalTo(qa[1])); | ||||||
|             s = bufferedReader.readLine(); |             s = bufferedReader.readLine(); | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -1,70 +0,0 @@ | |||||||
| /** |  | ||||||
|  * Copyright 2009 Alexander Kuznetsov |  | ||||||
|  * |  | ||||||
|  * Licensed under the Apache License, Version 2.0 (the "License"); |  | ||||||
|  * you may not use this file except in compliance with the License. |  | ||||||
|  * You may obtain a copy of the License at |  | ||||||
|  * |  | ||||||
|  *     http://www.apache.org/licenses/LICENSE-2.0 |  | ||||||
|  * |  | ||||||
|  * Unless required by applicable law or agreed to in writing, software |  | ||||||
|  * distributed under the License is distributed on an "AS IS" BASIS, |  | ||||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |  | ||||||
|  * See the License for the specific language governing permissions and |  | ||||||
|  * limitations under the License. |  | ||||||
|  */ |  | ||||||
| package org.apache.lucene.analysis.morphology; |  | ||||||
|  |  | ||||||
| import org.apache.lucene.analysis.TokenFilterFactory; |  | ||||||
| import org.apache.lucene.analysis.TokenStream; |  | ||||||
|  |  | ||||||
| import org.apache.lucene.morphology.LuceneMorphology; |  | ||||||
| import org.apache.lucene.morphology.analyzer.MorphologyFilter; |  | ||||||
| import org.apache.lucene.util.ResourceLoader; |  | ||||||
| import org.apache.lucene.util.ResourceLoaderAware; |  | ||||||
|  |  | ||||||
| import java.util.Map; |  | ||||||
|  |  | ||||||
| /** |  | ||||||
|  * Factory for {@link MorphologyFilter}, with configurable language |  | ||||||
|  * <p> |  | ||||||
|  * <b>Note:</b> Two languages are available now: English (default value) and Russian. |  | ||||||
|  * <pre class="prettyprint"> |  | ||||||
|  * <fieldType name="content" class="solr.TextField" positionIncrementGap="100"> |  | ||||||
|  *   <analyzer> |  | ||||||
|  *     <tokenizer class="solr.StandardTokenizerFactory"/> |  | ||||||
|  *     <filter class="solr.LowerCaseFilterFactory"/> |  | ||||||
|  *     <filter class="solr.MorphologyFilterFactory" language="English"/> |  | ||||||
|  *   </analyzer> |  | ||||||
|  * </fieldType></pre> |  | ||||||
|  */ |  | ||||||
| public class MorphologyFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { |  | ||||||
|  |  | ||||||
|     /** Name of the factory argument that selects the morphology language. */ |  | ||||||
|     private static final String LANGUAGE_KEY = "language"; |  | ||||||
|  |  | ||||||
|     /** Language name exactly as configured (e.g. "English", "Russian"); fixed at construction. */ |  | ||||||
|     private final String language; |  | ||||||
|  |  | ||||||
|     /** Morphology for the configured language; remains {@code null} until {@link #inform(ResourceLoader)} runs. */ |  | ||||||
|     private LuceneMorphology luceneMorphology; |  | ||||||
|  |  | ||||||
|     /** |  | ||||||
|      * Creates a factory from the analyzer configuration arguments. |  | ||||||
|      * |  | ||||||
|      * @param args factory arguments; {@code language} is the only one read here and |  | ||||||
|      *             {@code "English"} is passed as its fallback value |  | ||||||
|      * @throws IllegalArgumentException if {@code args} is not empty after the known arguments were read |  | ||||||
|      */ |  | ||||||
|     public MorphologyFilterFactory(Map<String, String> args) { |  | ||||||
|         super(args); |  | ||||||
|  |  | ||||||
|         language = get(args, LANGUAGE_KEY, "English"); |  | ||||||
|         // Whatever is still left in args at this point was not recognized by this factory. |  | ||||||
|         if (!args.isEmpty()) { |  | ||||||
|             throw new IllegalArgumentException("Unknown parameters: " + args); |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     /** |  | ||||||
|      * Wraps {@code input} in a {@link MorphologyFilter} backed by the morphology that |  | ||||||
|      * {@link #inform(ResourceLoader)} loaded. |  | ||||||
|      * |  | ||||||
|      * @param input the token stream to filter |  | ||||||
|      * @return a new filter over {@code input} |  | ||||||
|      */ |  | ||||||
|     @Override |  | ||||||
|     public TokenStream create(TokenStream input) { |  | ||||||
|         return new MorphologyFilter(input, luceneMorphology); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     /** |  | ||||||
|      * Instantiates the morphology class for the configured language through {@code loader}. |  | ||||||
|      * <p> |  | ||||||
|      * The class name follows the pattern |  | ||||||
|      * {@code org.apache.lucene.morphology.<lower-cased language>.<language>LuceneMorphology}. |  | ||||||
|      * |  | ||||||
|      * @param loader resource loader used to create the morphology instance |  | ||||||
|      */ |  | ||||||
|     @Override |  | ||||||
|     public void inform(ResourceLoader loader) { |  | ||||||
|         // The lower-cased name becomes a Java package segment, so it must not depend on the |  | ||||||
|         // JVM default locale (a Turkish locale, for instance, lower-cases 'I' to a dotless i). |  | ||||||
|         String packageSegment = language.toLowerCase(java.util.Locale.ROOT); |  | ||||||
|         String className = "org.apache.lucene.morphology." + packageSegment + "." + language + "LuceneMorphology"; |  | ||||||
|         luceneMorphology = loader.newInstance(className, LuceneMorphology.class); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     /** |  | ||||||
|      * Returns the morphology loaded by {@link #inform(ResourceLoader)}. |  | ||||||
|      * |  | ||||||
|      * @return the loaded morphology, or {@code null} if {@code inform} has not been called yet |  | ||||||
|      */ |  | ||||||
|     public LuceneMorphology getLuceneMorphology() { |  | ||||||
|         return luceneMorphology; |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| @@ -1,75 +0,0 @@ | |||||||
| /** |  | ||||||
|  * Copyright 2009 Alexander Kuznetsov |  | ||||||
|  * |  | ||||||
|  * Licensed under the Apache License, Version 2.0 (the "License"); |  | ||||||
|  * you may not use this file except in compliance with the License. |  | ||||||
|  * You may obtain a copy of the License at |  | ||||||
|  * |  | ||||||
|  *     http://www.apache.org/licenses/LICENSE-2.0 |  | ||||||
|  * |  | ||||||
|  * Unless required by applicable law or agreed to in writing, software |  | ||||||
|  * distributed under the License is distributed on an "AS IS" BASIS, |  | ||||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |  | ||||||
|  * See the License for the specific language governing permissions and |  | ||||||
|  * limitations under the License. |  | ||||||
|  */ |  | ||||||
| package org.apache.lucene.analysis.morphology; |  | ||||||
|  |  | ||||||
| import org.apache.lucene.morphology.LuceneMorphology; |  | ||||||
| import org.apache.lucene.morphology.english.EnglishLuceneMorphology; |  | ||||||
| import org.apache.lucene.morphology.russian.RussianLuceneMorphology; |  | ||||||
| import org.apache.lucene.util.ClasspathResourceLoader; |  | ||||||
| import org.apache.lucene.util.ResourceLoader; |  | ||||||
| import org.junit.Assert; |  | ||||||
| import org.junit.Before; |  | ||||||
| import org.junit.Test; |  | ||||||
|  |  | ||||||
| import java.util.HashMap; |  | ||||||
| import java.util.Map; |  | ||||||
|  |  | ||||||
| public class MorphologyFilterFactoryTest { |  | ||||||
|  |  | ||||||
|     private static final String LANGUAGE_KEY = "language"; |  | ||||||
|     private ResourceLoader loader = new ClasspathResourceLoader(MorphologyFilterFactoryTest.class); |  | ||||||
|     private Map<String, String> args; |  | ||||||
|  |  | ||||||
|     @Before |  | ||||||
|     public void setUp() { |  | ||||||
|         args = new HashMap<>(); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     @Test |  | ||||||
|     public void if_RussianLanguageKey_then_CreateRussianMorphologyFilter() { |  | ||||||
|  |  | ||||||
|         args.put(LANGUAGE_KEY, "Russian"); |  | ||||||
|         MorphologyFilterFactory morphologyFilterFactory = new MorphologyFilterFactory(args); |  | ||||||
|         morphologyFilterFactory.inform(loader); |  | ||||||
|  |  | ||||||
|         LuceneMorphology luceneMorphology = morphologyFilterFactory.getLuceneMorphology(); |  | ||||||
|  |  | ||||||
|         Assert.assertTrue("Creation the MorphologyFilterFactory with a Russian language key", luceneMorphology instanceof RussianLuceneMorphology); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     @Test |  | ||||||
|     public void if_EnglishLanguageKey_then_CreateEnglishMorphologyFilter() { |  | ||||||
|  |  | ||||||
|         args.put(LANGUAGE_KEY, "English"); |  | ||||||
|         MorphologyFilterFactory morphologyFilterFactory = new MorphologyFilterFactory(args); |  | ||||||
|         morphologyFilterFactory.inform(loader); |  | ||||||
|  |  | ||||||
|         LuceneMorphology luceneMorphology = morphologyFilterFactory.getLuceneMorphology(); |  | ||||||
|  |  | ||||||
|         Assert.assertTrue("Creation the MorphologyFilterFactory with a English language key", luceneMorphology instanceof EnglishLuceneMorphology); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     @Test |  | ||||||
|     public void if_NoLanguageKey_then_CreateEnglishMorphologyFilter() { |  | ||||||
|  |  | ||||||
|         MorphologyFilterFactory morphologyFilterFactory = new MorphologyFilterFactory(args); |  | ||||||
|         morphologyFilterFactory.inform(loader); |  | ||||||
|  |  | ||||||
|         LuceneMorphology luceneMorphology = morphologyFilterFactory.getLuceneMorphology(); |  | ||||||
|  |  | ||||||
|         Assert.assertTrue("Creation the MorphologyFilterFactory without any language keys", luceneMorphology instanceof EnglishLuceneMorphology); |  | ||||||
|     } |  | ||||||
| } |  | ||||||
		Reference in New Issue
	
	Block a user