Compare commits
2 Commits
master
...
ambiguousl
Author | SHA1 | Date | |
---|---|---|---|
![]() |
f095cbe7c0 | ||
![]() |
3b2e48821a |
35
.github/workflows/main.yaml
vendored
35
.github/workflows/main.yaml
vendored
@ -1,35 +0,0 @@
|
||||
name: Java CI
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
jobs:
|
||||
tests:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Set up JDK 11
|
||||
uses: actions/setup-java@v2
|
||||
with:
|
||||
java-version: '11'
|
||||
distribution: 'adopt'
|
||||
- name: Build with Maven
|
||||
run: mvn --batch-mode --update-snapshots verify
|
||||
|
||||
pack-artifacts:
|
||||
runs-on: ubuntu-latest
|
||||
needs: tests
|
||||
if: github.ref == 'refs/heads/master'
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Set up JDK 11
|
||||
uses: actions/setup-java@v2
|
||||
with:
|
||||
java-version: '11'
|
||||
distribution: 'adopt'
|
||||
- name: Build with Maven
|
||||
run: mvn --batch-mode --update-snapshots verify
|
||||
- uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: artifacts
|
||||
path: ${{ github.workspace }}/*/target/*.jar
|
202
LICENSE
202
LICENSE
@ -1,202 +0,0 @@
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
64
README.md
64
README.md
@ -1,17 +1,54 @@
|
||||
# Russian Morphology for Apache Lucene
|
||||
# Russian Morphology for lucene
|
||||
|
||||
Russian and English morphology for Java and [Apache Lucene](http://lucene.apache.org) 9.3 framework based on open source dictionary from site [АОТ](http://aot.ru). It uses dictionary base morphology with some heuristics for unknown words. It supports a homonym for example for a Russian word "вина" it gives two variants "вино" and "вина".
|
||||
Russian and English morphology for java and lucene 3.0 framework based on open source dictionary from site [АОТ](http://aot.ru). It use dictionary base morphology with some heuristics for unknown words. It support homonym for example for Russian word "вина" it gives two variants "вино" and "вина".
|
||||
|
||||
|
||||
### How to use
|
||||
|
||||
Build project, by running `mvn clean package`, this will provide you the latest versions of the artifacts - 1.5, add it to your classpath. You could select which version to use - Russian or English.
|
||||
First download
|
||||
[morph-1.0.jar](https://bintray.com/artifact/download/akuznetsov/russianmorphology/org/apache/lucene/morphology/morph/1.1/morph-1.1.jar)
|
||||
and add it to your class path. When download [Russian](https://bintray.com/artifact/download/akuznetsov/russianmorphology/org/apache/lucene/morphology/russian/1.1/russian-1.1.jar) or
|
||||
[English](https://bintray.com/artifact/download/akuznetsov/russianmorphology/org/apache/lucene/morphology/english/1.1/english-1.1.jar) package.
|
||||
|
||||
If you use maven you can add dependency
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<artifactId>russian</artifactId>
|
||||
<version>1.1</version>
|
||||
</dependency>
|
||||
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<artifactId>english</artifactId>
|
||||
<version>1.1</version>
|
||||
</dependency>
|
||||
|
||||
Don't forget to add a link to the repository
|
||||
|
||||
|
||||
<repositories>
|
||||
...............
|
||||
<repository>
|
||||
<snapshots>
|
||||
<enabled>false</enabled>
|
||||
</snapshots>
|
||||
<id>bintray-akuznetsov-russianmorphology</id>
|
||||
<name>bintray</name>
|
||||
<url>http://dl.bintray.com/akuznetsov/russianmorphology</url>
|
||||
</repository>
|
||||
</repositories>
|
||||
|
||||
|
||||
|
||||
Now you can create a Lucene Analyzer
|
||||
|
||||
Now you can create a Lucene Analyzer:
|
||||
|
||||
RussianAnalayzer russian = new RussianAnalayzer();
|
||||
EnglishAnalayzer english = new EnglishAnalayzer();
|
||||
|
||||
|
||||
You can write your own analyzer using a filter that converts words into their correct forms.
|
||||
|
||||
LuceneMorphology luceneMorph = new EnglishLuceneMorphology();
|
||||
@ -25,28 +62,9 @@ Also if you need get a list of base forms of word, you can use following example
|
||||
LuceneMorphology luceneMorph = new EnglishLuceneMorphology();
|
||||
List<String> wordBaseForms = luceneMorph.getMorphInfo(word);
|
||||
|
||||
### Solr
|
||||
|
||||
You can use the LuceneMorphology as morphology filter in a Solr _schema.xml_ using a **MorphologyFilterFactory:**
|
||||
|
||||
```xml
|
||||
<fieldType name="content" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="org.apache.lucene.analysis.morphology.MorphologyFilterFactory" language="Russian"/>
|
||||
<filter class="org.apache.lucene.analysis.morphology.MorphologyFilterFactory" language="English"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
```
|
||||
|
||||
Just add _morphology-1.5.jar_ in your Solr lib-directories
|
||||
|
||||
### Restrictions
|
||||
|
||||
* It works only with UTF-8.
|
||||
* It assumes that the letters е and ё are the same.
|
||||
* Word forms with prefixes like "наибольший" are treated as separate words.
|
||||
|
||||
### License
|
||||
|
||||
Apache License, Version 2.0
|
||||
|
@ -1,40 +1,36 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>morphology</artifactId>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<version>1.5</version>
|
||||
</parent>
|
||||
<?xml version="1.0"?>
|
||||
<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
|
||||
xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<groupId>org.apache.lucene.analysis</groupId>
|
||||
<artifactId>morphology</artifactId>
|
||||
<name>solr-morphology-analysis</name>
|
||||
<version>${morphology.version}</version>
|
||||
<parent>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<artifactId>morphology</artifactId>
|
||||
<version>1.2-SNAPSHOT</version>
|
||||
</parent>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<artifactId>context</artifactId>
|
||||
<version>1.0-SNAPSHOT</version>
|
||||
<name>context</name>
|
||||
<url>http://maven.apache.org</url>
|
||||
|
||||
<dependencies>
|
||||
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>4.8.2</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<artifactId>russian</artifactId>
|
||||
<version>${morphology.version}</version>
|
||||
<version>1.2-SNAPSHOT</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<artifactId>english</artifactId>
|
||||
<version>${morphology.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>${junit.version}</version>
|
||||
<version>1.2-SNAPSHOT</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
</project>
|
||||
</project>
|
@ -0,0 +1,52 @@
|
||||
/**
|
||||
* Copyright 2015 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morphology.context;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
public class CalculateContextItem {
|
||||
|
||||
public List<ContextItem> createContextItems(String text) throws IOException {
|
||||
Analyzer statAnalyzer = new StatAnalyzer();
|
||||
InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год? - и что я жду тебя, где вино".getBytes()), "UTF-8");
|
||||
|
||||
|
||||
// new RussianMorphology();
|
||||
|
||||
TokenStream tokenStream = statAnalyzer.tokenStream(null, reader);
|
||||
tokenStream.reset();
|
||||
|
||||
List<List<String>> listedLink = new LinkedList<>();
|
||||
while (tokenStream.incrementToken()) {
|
||||
CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
|
||||
PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
}
|
@ -0,0 +1,80 @@
|
||||
/**
|
||||
* Copyright 2015 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morphology.context;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class ContextItem implements Comparable<ContextItem> {
|
||||
String[][] morphInfo;
|
||||
|
||||
public ContextItem(String[][] morphInfo) {
|
||||
this.morphInfo = morphInfo;
|
||||
}
|
||||
|
||||
public String[][] getMorphInfo() {
|
||||
return morphInfo;
|
||||
}
|
||||
|
||||
public void setMorphInfo(String[][] morphInfo) {
|
||||
this.morphInfo = morphInfo;
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
int h = 0;
|
||||
for (String[] m : morphInfo) {
|
||||
for (String s : m) {
|
||||
h = 31 * h + s.hashCode();
|
||||
}
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
|
||||
ContextItem that = (ContextItem) o;
|
||||
|
||||
if (that.morphInfo.length != this.morphInfo.length) {
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < morphInfo.length; i++) {
|
||||
if (!Arrays.equals(morphInfo[i], that.morphInfo[i])) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int compareTo(ContextItem o) {
|
||||
int i = o.morphInfo.length - morphInfo.length;
|
||||
if (i != 0) return i;
|
||||
for (int j = 0; j < morphInfo.length; j++) {
|
||||
i = o.morphInfo[j].length - morphInfo[j].length;
|
||||
if (i != 0) return i;
|
||||
for (int k = 0; k < morphInfo[j].length; k++) {
|
||||
i = morphInfo[j][k].compareTo(o.morphInfo[j][k]);
|
||||
if (i != 0) return i;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
@ -0,0 +1,37 @@
|
||||
/**
|
||||
* Copyright 2015 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morphology.context;
|
||||
|
||||
/**
 * Value holder pairing a set of morphological tags with the probability
 * computed for them. Plain mutable bean; not thread-safe.
 */
public class ContextStats {
    // Morphological tags this statistic refers to.
    String[] morphInfo;
    // Probability value associated with the tags above.
    double prob;

    /** @return the morphological tags, or {@code null} if not yet set */
    public String[] getMorphInfo() {
        return this.morphInfo;
    }

    public void setMorphInfo(String[] morphInfo) {
        this.morphInfo = morphInfo;
    }

    /** @return the probability associated with {@link #getMorphInfo()} */
    public double getProb() {
        return this.prob;
    }

    public void setProb(double prob) {
        this.prob = prob;
    }
}
|
@ -0,0 +1,21 @@
|
||||
/**
|
||||
* Copyright 2015 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morphology.context;
|
||||
|
||||
/**
 * Placeholder for probability-calculation logic; contains no behavior yet.
 *
 * <p>NOTE(review): the class name looks like a typo for "ProbCalculator",
 * but renaming would break any external references, so it is kept as-is —
 * confirm before renaming.
 */
public class ProbClalucator {
}
|
@ -0,0 +1,116 @@
|
||||
/**
|
||||
* Copyright 2015 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morphology.context;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.util.AttributeFactory;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Set;
|
||||
|
||||
public class SimpleTokenizer extends Tokenizer {
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
LinkedList<String> terms;
|
||||
|
||||
public final static Set<Character> SEPARATION_LETTERS = new HashSet<>(Arrays.asList(' ', '(', ')', ',', '|', '\t',
|
||||
'\n', '"', ':', '!', '?', ',', ';', '•'));
|
||||
|
||||
public final static Set<Character> MEANING_CHARS = new HashSet<>(Arrays.asList('(', ')', ',', '|',
|
||||
'"', ':', '!', '?', ',', ';', '•', '.'));
|
||||
|
||||
public SimpleTokenizer() {
|
||||
}
|
||||
|
||||
public SimpleTokenizer(AttributeFactory factory) {
|
||||
super(factory);
|
||||
}
|
||||
|
||||
@Override
|
||||
final public boolean incrementToken() throws IOException {
|
||||
if (terms == null) {
|
||||
createTeams();
|
||||
}
|
||||
if (terms.size() > 0) {
|
||||
String str = terms.poll();
|
||||
termAtt.setEmpty();
|
||||
termAtt.append(str);
|
||||
posAtt.setPositionIncrement(1);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private void createTeams() throws IOException {
|
||||
terms = new LinkedList<>();
|
||||
|
||||
BufferedReader br = new BufferedReader(input);
|
||||
StringBuilder sb = new StringBuilder();
|
||||
String s = "";
|
||||
while ((s = br.readLine()) != null) {
|
||||
sb.append(s).append(" ");
|
||||
}
|
||||
|
||||
s = sb.toString();
|
||||
CharTermAttributeImpl currentTerm = new CharTermAttributeImpl();
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
if (checkIsCharSepartor(s, i)) {
|
||||
if (checkIsCharHasMeaning(s, i)) {
|
||||
terms.add(s.substring(i, i + 1));
|
||||
}
|
||||
String term = currentTerm.toString();
|
||||
currentTerm.clear();
|
||||
if (term.length() > 0) {
|
||||
terms.add(term);
|
||||
}
|
||||
} else {
|
||||
currentTerm.append(s.charAt(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean checkIsCharHasMeaning(String s, int i) {
|
||||
return MEANING_CHARS.contains(s.charAt(i));
|
||||
}
|
||||
|
||||
private boolean checkIsCharSepartor(String s, int i) {
|
||||
char c = s.charAt(i);
|
||||
if (SEPARATION_LETTERS.contains(c)) {
|
||||
return true;
|
||||
}
|
||||
if ('.' == c
|
||||
&& s.length() > i + 1
|
||||
&& SEPARATION_LETTERS.contains(s.charAt(i + 1))) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
this.terms = null;
|
||||
super.reset();
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,34 @@
|
||||
package org.apache.lucene.morphology.context;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
/**
|
||||
* Created by akuznetsov on 6/24/15.
|
||||
*/
|
||||
public class StatAnalyzer extends Analyzer {
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String s) {
|
||||
|
||||
SimpleTokenizer src = new SimpleTokenizer();
|
||||
TokenFilter filter = new StandardFilter(src);
|
||||
filter = new LowerCaseFilter(filter);
|
||||
|
||||
return new TokenStreamComponents(src, filter) {
|
||||
@Override
|
||||
protected void setReader(final Reader reader) throws IOException {
|
||||
super.setReader(reader);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -0,0 +1,32 @@
|
||||
package org.apache.lucene.morphology.context;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
|
||||
public class SimpleTokenizerTest {
|
||||
|
||||
@Test
|
||||
public void testSimpleTokenizer() throws IOException {
|
||||
Analyzer statAnalyzer = new StatAnalyzer();
|
||||
InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год? - и что я жду тебя, где вино".getBytes()), "UTF-8");
|
||||
|
||||
TokenStream tokenStream = statAnalyzer.tokenStream(null, reader);
|
||||
tokenStream.reset();
|
||||
|
||||
boolean wordSeen = false;
|
||||
while (tokenStream.incrementToken()) {
|
||||
CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
|
||||
PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class);
|
||||
System.out.println(charTerm.toString());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -3,26 +3,27 @@
|
||||
<parent>
|
||||
<artifactId>morphology</artifactId>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<version>1.5</version>
|
||||
<version>1.2-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<artifactId>dictionary-reader</artifactId>
|
||||
<name>dictionary-reader</name>
|
||||
<version>1.5</version>
|
||||
<version>1.2-SNAPSHOT</version>
|
||||
<url>http://maven.apache.org</url>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<artifactId>russian</artifactId>
|
||||
<version>1.5</version>
|
||||
<version>1.2-SNAPSHOT</version>
|
||||
</dependency>
|
||||
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<artifactId>english</artifactId>
|
||||
<version>1.5</version>
|
||||
<version>1.2-SNAPSHOT</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
|
@ -22,19 +22,20 @@ import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
|
||||
/**
|
||||
* This class contain logic how read
|
||||
* dictionary and produce word with it all forms.
|
||||
* dictonary and produce word with it all forms.
|
||||
*/
|
||||
public class DictionaryReader {
|
||||
private String fileName;
|
||||
private String fileEncoding = "windows-1251";
|
||||
private List<List<FlexiaModel>> wordsFlexias = new ArrayList<>();
|
||||
private Set<String> ignoredForm;
|
||||
private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
|
||||
private Set<String> ignoredForm = new HashSet<String>();
|
||||
|
||||
public DictionaryReader(String fileName, Set<String> ignoredForm) {
|
||||
this.fileName = fileName;
|
||||
@ -54,7 +55,7 @@ public class DictionaryReader {
|
||||
|
||||
private void readWords(BufferedReader reader, WordProcessor wordProcessor) throws IOException {
|
||||
String s = reader.readLine();
|
||||
int count = Integer.parseInt(s);
|
||||
int count = Integer.valueOf(s);
|
||||
int actual = 0;
|
||||
for (int i = 0; i < count; i++) {
|
||||
s = reader.readLine();
|
||||
@ -78,7 +79,7 @@ public class DictionaryReader {
|
||||
String wordBase = wd[0].toLowerCase();
|
||||
if (wordBase.startsWith("-")) return null;
|
||||
wordBase = "#".equals(wordBase) ? "" : wordBase;
|
||||
List<FlexiaModel> models = wordsFlexias.get(Integer.parseInt(wd[1]));
|
||||
List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1]));
|
||||
FlexiaModel flexiaModel = models.get(0);
|
||||
if (models.size() == 0 || ignoredForm.contains(flexiaModel.getCode())) {
|
||||
return null;
|
||||
@ -95,7 +96,7 @@ public class DictionaryReader {
|
||||
|
||||
private void skipBlock(BufferedReader reader) throws IOException {
|
||||
String s = reader.readLine();
|
||||
int count = Integer.parseInt(s);
|
||||
int count = Integer.valueOf(s);
|
||||
for (int i = 0; i < count; i++) {
|
||||
reader.readLine();
|
||||
}
|
||||
@ -104,7 +105,7 @@ public class DictionaryReader {
|
||||
|
||||
private void readPrefix(BufferedReader reader) throws IOException {
|
||||
String s = reader.readLine();
|
||||
int count = Integer.parseInt(s);
|
||||
int count = Integer.valueOf(s);
|
||||
for (int i = 0; i < count; i++) {
|
||||
reader.readLine();
|
||||
}
|
||||
@ -112,10 +113,10 @@ public class DictionaryReader {
|
||||
|
||||
private void readFlexias(BufferedReader reader) throws IOException {
|
||||
String s = reader.readLine();
|
||||
int count = Integer.parseInt(s);
|
||||
int count = Integer.valueOf(s);
|
||||
for (int i = 0; i < count; i++) {
|
||||
s = reader.readLine();
|
||||
ArrayList<FlexiaModel> flexiaModelArrayList = new ArrayList<>();
|
||||
ArrayList<FlexiaModel> flexiaModelArrayList = new ArrayList<FlexiaModel>();
|
||||
wordsFlexias.add(flexiaModelArrayList);
|
||||
for (String line : s.split("%")) {
|
||||
addFlexia(flexiaModelArrayList, line);
|
||||
|
@ -16,8 +16,6 @@
|
||||
|
||||
package org.apache.lucene.morphology.dictionary;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Represent information of how word form created form it imutible part.
|
||||
*/
|
||||
@ -76,9 +74,11 @@ public class FlexiaModel {
|
||||
|
||||
FlexiaModel that = (FlexiaModel) o;
|
||||
|
||||
if (!Objects.equals(code, that.code)) return false;
|
||||
if (!Objects.equals(prefix, that.prefix)) return false;
|
||||
return Objects.equals(suffix, that.suffix);
|
||||
if (code != null ? !code.equals(that.code) : that.code != null) return false;
|
||||
if (prefix != null ? !prefix.equals(that.prefix) : that.prefix != null) return false;
|
||||
if (suffix != null ? !suffix.equals(that.suffix) : that.suffix != null) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -29,8 +29,8 @@ import java.util.Map;
|
||||
public class GrammarReader {
|
||||
private String fileName;
|
||||
private String fileEncoding = "windows-1251";
|
||||
private List<String> grammarInfo = new ArrayList<>();
|
||||
private Map<String, Integer> inverseIndex = new HashMap<>();
|
||||
private List<String> grammarInfo = new ArrayList<String>();
|
||||
private Map<String, Integer> inverseIndex = new HashMap<String, Integer>();
|
||||
|
||||
public GrammarReader(String fileName) throws IOException {
|
||||
this.fileName = fileName;
|
||||
@ -50,7 +50,7 @@ public class GrammarReader {
|
||||
line = line.trim();
|
||||
if (!line.startsWith("//") && line.length() > 0) {
|
||||
String[] strings = line.split(" ", 2);
|
||||
int i = grammarInfo.size();
|
||||
Integer i = grammarInfo.size();
|
||||
inverseIndex.put(strings[0], i);
|
||||
grammarInfo.add(i, strings[1]);
|
||||
}
|
||||
@ -63,7 +63,7 @@ public class GrammarReader {
|
||||
}
|
||||
|
||||
public String[] getGrammarInfoAsArray() {
|
||||
return grammarInfo.toArray(new String[0]);
|
||||
return grammarInfo.toArray(new String[grammarInfo.size()]);
|
||||
}
|
||||
|
||||
public Map<String, Integer> getGrammarInverseIndex() {
|
||||
|
@ -15,7 +15,7 @@
|
||||
*/
|
||||
package org.apache.lucene.morphology.dictionary;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
@ -29,7 +29,7 @@ public class RemoveFlexiaWithPrefixes extends WordFilter {
|
||||
@Override
|
||||
public List<WordCard> transform(WordCard wordCard) {
|
||||
|
||||
List<FlexiaModel> flexiaModelsToRemove = new LinkedList<>();
|
||||
List<FlexiaModel> flexiaModelsToRemove = new LinkedList<FlexiaModel>();
|
||||
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
||||
if (fm.getPrefix().length() > 0) {
|
||||
flexiaModelsToRemove.add(fm);
|
||||
@ -39,6 +39,6 @@ public class RemoveFlexiaWithPrefixes extends WordFilter {
|
||||
wordCard.removeFlexia(fm);
|
||||
}
|
||||
|
||||
return new LinkedList<>(Collections.singletonList(wordCard));
|
||||
return new LinkedList<WordCard>(Arrays.asList(wordCard));
|
||||
}
|
||||
}
|
||||
|
@ -32,13 +32,13 @@ public class RussianAdvSplitterFilter extends WordFilter {
|
||||
|
||||
@Override
|
||||
public List<WordCard> transform(WordCard wordCard) {
|
||||
LinkedList<WordCard> result = new LinkedList<>();
|
||||
LinkedList<WordCard> result = new LinkedList<WordCard>();
|
||||
result.add(wordCard);
|
||||
|
||||
String baseWord = "";
|
||||
String canonicalForm = "";
|
||||
String canonicalSuffix = "";
|
||||
List<FlexiaModel> flexiaModels = new LinkedList<>();
|
||||
List<FlexiaModel> flexiaModels = new LinkedList<FlexiaModel>();
|
||||
for (FlexiaModel flexiaModel : wordCard.getWordsForms()) {
|
||||
if (flexiaModel.getPrefix().length() > 0) {
|
||||
flexiaModels.add(new FlexiaModel(flexiaModel.getCode(), flexiaModel.getSuffix(), ""));
|
||||
|
@ -27,9 +27,9 @@ import java.util.*;
|
||||
|
||||
//todo made refactoring this class
|
||||
public class StatisticsCollector implements WordProcessor {
|
||||
private TreeMap<String, Set<Heuristic>> inverseIndex = new TreeMap<>();
|
||||
private Map<Set<Heuristic>, Integer> ruleInverseIndex = new HashMap<>();
|
||||
private List<Set<Heuristic>> rules = new ArrayList<>();
|
||||
private TreeMap<String, Set<Heuristic>> inverseIndex = new TreeMap<String, Set<Heuristic>>();
|
||||
private Map<Set<Heuristic>, Integer> ruleInverseIndex = new HashMap<Set<Heuristic>, Integer>();
|
||||
private List<Set<Heuristic>> rules = new ArrayList<Set<Heuristic>>();
|
||||
private GrammarReader grammarReader;
|
||||
private LetterDecoderEncoder decoderEncoder;
|
||||
|
||||
@ -39,14 +39,18 @@ public class StatisticsCollector implements WordProcessor {
|
||||
this.decoderEncoder = decoderEncoder;
|
||||
}
|
||||
|
||||
public void process(WordCard wordCard) {
|
||||
public void process(WordCard wordCard) throws IOException {
|
||||
cleanWordCard(wordCard);
|
||||
String normalStringMorph = wordCard.getWordsForms().get(0).getCode();
|
||||
|
||||
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
||||
Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph);
|
||||
String form = revertWord(fm.create(wordCard.getBase()));
|
||||
Set<Heuristic> suffixHeuristics = inverseIndex.computeIfAbsent(form, k -> new HashSet<>());
|
||||
Set<Heuristic> suffixHeuristics = inverseIndex.get(form);
|
||||
if (suffixHeuristics == null) {
|
||||
suffixHeuristics = new HashSet<Heuristic>();
|
||||
inverseIndex.put(form, suffixHeuristics);
|
||||
}
|
||||
suffixHeuristics.add(heuristic);
|
||||
}
|
||||
}
|
||||
@ -65,7 +69,7 @@ public class StatisticsCollector implements WordProcessor {
|
||||
|
||||
public void saveHeuristic(String fileName) throws IOException {
|
||||
|
||||
Map<Integer, Integer> dist = new TreeMap<>();
|
||||
Map<Integer, Integer> dist = new TreeMap<Integer, Integer>();
|
||||
Set<Heuristic> prevSet = null;
|
||||
int count = 0;
|
||||
for (String key : inverseIndex.keySet()) {
|
||||
@ -116,11 +120,11 @@ public class StatisticsCollector implements WordProcessor {
|
||||
}
|
||||
|
||||
private String revertWord(String s) {
|
||||
StringBuilder result = new StringBuilder();
|
||||
String result = "";
|
||||
for (int i = 1; i <= s.length(); i++) {
|
||||
result.append(s.charAt(s.length() - i));
|
||||
result += s.charAt(s.length() - i);
|
||||
}
|
||||
return result.toString();
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
@ -128,15 +132,15 @@ public class StatisticsCollector implements WordProcessor {
|
||||
String form = fm.create(wordBase);
|
||||
String normalForm = wordBase + canonicalSuffix;
|
||||
Integer length = getCommonLength(form, normalForm);
|
||||
int actualSuffixLengh = form.length() - length;
|
||||
Integer actualSuffixLengh = form.length() - length;
|
||||
String actualNormalSuffix = normalForm.substring(length);
|
||||
Integer integer = grammarReader.getGrammarInverseIndex().get(fm.getCode());
|
||||
Integer nf = grammarReader.getGrammarInverseIndex().get(normalSuffixForm);
|
||||
return new Heuristic((byte) actualSuffixLengh, actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue());
|
||||
return new Heuristic((byte) actualSuffixLengh.intValue(), actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue());
|
||||
}
|
||||
|
||||
public static Integer getCommonLength(String s1, String s2) {
|
||||
int length = Math.min(s1.length(), s2.length());
|
||||
Integer length = Math.min(s1.length(), s2.length());
|
||||
for (int i = 0; i < length; i++) {
|
||||
if (s1.charAt(i) != s2.charAt(i)) return i;
|
||||
}
|
||||
|
@ -26,7 +26,7 @@ public class WordCard {
|
||||
private String canonicalForm;
|
||||
private String base;
|
||||
private String canonicalSuffix;
|
||||
private List<FlexiaModel> wordsForms = new ArrayList<>();
|
||||
private List<FlexiaModel> wordsForms = new ArrayList<FlexiaModel>();
|
||||
|
||||
public WordCard(String canonicalForm, String base, String canonicalSuffix) {
|
||||
this.canonicalForm = canonicalForm;
|
||||
|
@ -17,6 +17,7 @@ package org.apache.lucene.morphology.dictionary;
|
||||
|
||||
import org.apache.lucene.morphology.LetterDecoderEncoder;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
@ -37,7 +38,7 @@ public class WordCleaner extends WordFilter {
|
||||
if (word.contains("-")) return Collections.emptyList();
|
||||
if (!decoderEncoder.checkString(word)) return Collections.emptyList();
|
||||
|
||||
List<FlexiaModel> flexiaModelsToRemove = new LinkedList<>();
|
||||
List<FlexiaModel> flexiaModelsToRemove = new LinkedList<FlexiaModel>();
|
||||
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
||||
if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) {
|
||||
flexiaModelsToRemove.add(fm);
|
||||
@ -47,6 +48,6 @@ public class WordCleaner extends WordFilter {
|
||||
wordCard.removeFlexia(fm);
|
||||
}
|
||||
|
||||
return new LinkedList<>(Collections.singletonList(wordCard));
|
||||
return new LinkedList<WordCard>(Arrays.asList(wordCard));
|
||||
}
|
||||
}
|
||||
|
@ -23,5 +23,5 @@ import java.io.IOException;
|
||||
*/
|
||||
public interface WordProcessor {
|
||||
|
||||
void process(WordCard wordCard) throws IOException;
|
||||
public void process(WordCard wordCard) throws IOException;
|
||||
}
|
||||
|
@ -17,7 +17,7 @@ package org.apache.lucene.morphology.dictionary;
|
||||
|
||||
import org.apache.lucene.morphology.LetterDecoderEncoder;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
@ -42,7 +42,7 @@ public class WordStringCleaner extends WordFilter {
|
||||
//made correct code
|
||||
m.setCode(m.getCode().substring(0, 2));
|
||||
}
|
||||
return new LinkedList<>(Collections.singletonList(wordCard));
|
||||
return new LinkedList<WordCard>(Arrays.asList(wordCard));
|
||||
}
|
||||
|
||||
|
||||
|
@ -29,7 +29,7 @@ public class EnglishHeuristicBuilder {
|
||||
GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/egramtab.tab");
|
||||
EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
|
||||
|
||||
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<>());
|
||||
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>());
|
||||
|
||||
StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder);
|
||||
WordCleaner wordCleaner = new WordCleaner(decoderEncoder, statisticsCollector);
|
||||
@ -39,4 +39,4 @@ public class EnglishHeuristicBuilder {
|
||||
statisticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info");
|
||||
|
||||
}
|
||||
}
|
||||
}
|
@ -28,7 +28,7 @@ public class RussianHeuristicBuilder {
|
||||
GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/rgramtab.tab");
|
||||
RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
|
||||
|
||||
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<>());
|
||||
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());
|
||||
|
||||
StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder);
|
||||
WordCleaner wordCleaner = new WordCleaner(decoderEncoder, statisticsCollector);
|
||||
|
@ -23,7 +23,6 @@ import org.apache.lucene.morphology.english.EnglishMorphology;
|
||||
import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder;
|
||||
import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
|
||||
import org.apache.lucene.morphology.russian.RussianMorphology;
|
||||
import org.hamcrest.MatcherAssert;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
@ -34,6 +33,7 @@ import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
import static org.hamcrest.Matchers.hasItem;
|
||||
import static org.junit.Assert.assertThat;
|
||||
|
||||
|
||||
public class TestAllWords {
|
||||
@ -73,19 +73,21 @@ public class TestAllWords {
|
||||
final List<String> morphInfo = grammarInfo.getGrammarInfo();
|
||||
final Map<String, Integer> inversIndex = grammarInfo.getGrammarInverseIndex();
|
||||
|
||||
DictionaryReader dictionaryReader = new DictionaryReader(pathToDict, new HashSet<>());
|
||||
DictionaryReader dictionaryReader = new DictionaryReader(pathToDict, new HashSet<String>());
|
||||
|
||||
final AtomicLong wordCount = new AtomicLong(0);
|
||||
long startTime = System.currentTimeMillis();
|
||||
Long startTime = System.currentTimeMillis();
|
||||
|
||||
WordProcessor wordProcessor = wordCard -> {
|
||||
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
|
||||
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
||||
String wordForm = wordCard.getBase() + fm.getSuffix();
|
||||
String morph = morphInfo.get(inversIndex.get(fm.getCode()));
|
||||
MatcherAssert.assertThat(morphology.getMorphInfo(wordForm), hasItem(word + "|" + morph));
|
||||
MatcherAssert.assertThat(morphology.getNormalForms(wordForm), hasItem(word));
|
||||
wordCount.set(2L + wordCount.get());
|
||||
WordProcessor wordProcessor = new WordProcessor() {
|
||||
public void process(WordCard wordCard) throws IOException {
|
||||
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
|
||||
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
||||
String wordForm = wordCard.getBase() + fm.getSuffix();
|
||||
String morph = morphInfo.get(inversIndex.get(fm.getCode()));
|
||||
assertThat(morphology.getMorphInfo(wordForm), hasItem(word + "|" + morph));
|
||||
assertThat(morphology.getNormalForms(wordForm), hasItem(word));
|
||||
wordCount.set(2L + wordCount.get());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@ -121,15 +123,17 @@ public class TestAllWords {
|
||||
|
||||
private void testAllWordForLucene(final LuceneMorphology morphology, LetterDecoderEncoder decoderEncoder, String pathToDic) throws IOException {
|
||||
final AtomicLong wordCount = new AtomicLong(0);
|
||||
long startTime = System.currentTimeMillis();
|
||||
Long startTime = System.currentTimeMillis();
|
||||
|
||||
DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet<>());
|
||||
WordProcessor wordProcessor = wordCard -> {
|
||||
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
|
||||
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
||||
String wordForm = wordCard.getBase() + fm.getSuffix();
|
||||
MatcherAssert.assertThat(morphology.getNormalForms(wordForm), hasItem(word));
|
||||
wordCount.set(1L + wordCount.get());
|
||||
DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet<String>());
|
||||
WordProcessor wordProcessor = new WordProcessor() {
|
||||
public void process(WordCard wordCard) throws IOException {
|
||||
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
|
||||
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
||||
String wordForm = wordCard.getBase() + fm.getSuffix();
|
||||
assertThat(morphology.getNormalForms(wordForm), hasItem(word));
|
||||
wordCount.set(1L + wordCount.get());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -16,12 +16,6 @@
|
||||
package org.apache.lucene.morphology;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.morphology.analyzer.MorphologyAnalyzer;
|
||||
@ -31,17 +25,16 @@ import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.morphology.english.EnglishLuceneMorphology;
|
||||
import org.apache.lucene.morphology.russian.RussianAnalyzer;
|
||||
import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
|
||||
import org.hamcrest.MatcherAssert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.*;
|
||||
|
||||
import static org.hamcrest.Matchers.equalTo;
|
||||
import static org.junit.Assert.assertThat;
|
||||
|
||||
|
||||
public class TestAnalyzers extends BaseTokenStreamTestCase {
|
||||
public class AnalyzersTest {
|
||||
|
||||
@Test
|
||||
public void shouldGiveCorrectWordsForEnglish() throws IOException {
|
||||
@ -67,24 +60,24 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
|
||||
LuceneMorphology englishLuceneMorphology = new EnglishLuceneMorphology();
|
||||
|
||||
MorphologyAnalyzer russianAnalyzer = new MorphologyAnalyzer(russianLuceneMorphology);
|
||||
InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("тест пм тест".getBytes()), StandardCharsets.UTF_8);
|
||||
InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("тест пм тест".getBytes()), "UTF-8");
|
||||
TokenStream stream = russianAnalyzer.tokenStream(null, reader);
|
||||
MorphologyFilter englishFilter = new MorphologyFilter(stream, englishLuceneMorphology);
|
||||
|
||||
englishFilter.reset();
|
||||
while (englishFilter.incrementToken()) {
|
||||
System.out.println(englishFilter);
|
||||
System.out.println(englishFilter.toString());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldProvideCorrectIndentForWordWithMelitaForm() throws IOException {
|
||||
Analyzer morphlogyAnalyzer = new RussianAnalyzer();
|
||||
InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год".getBytes()), StandardCharsets.UTF_8);
|
||||
InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год".getBytes()), "UTF-8");
|
||||
|
||||
TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader);
|
||||
tokenStream.reset();
|
||||
Set<String> foromsOfWine = new HashSet<>();
|
||||
Set<String> foromsOfWine = new HashSet<String>();
|
||||
foromsOfWine.add("вина");
|
||||
foromsOfWine.add("винo");
|
||||
boolean wordSeen = false;
|
||||
@ -92,7 +85,7 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
|
||||
CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
|
||||
PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class);
|
||||
if(foromsOfWine.contains(charTerm.toString()) && wordSeen){
|
||||
MatcherAssert.assertThat(position.getPositionIncrement(),equalTo(0));
|
||||
assertThat(position.getPositionIncrement(),equalTo(0));
|
||||
}
|
||||
if(foromsOfWine.contains(charTerm.toString())){
|
||||
wordSeen = true;
|
||||
@ -102,18 +95,18 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
|
||||
|
||||
private void testAnalayzer(Analyzer morphlogyAnalyzer, String answerPath, String testPath) throws IOException {
|
||||
InputStream stream = this.getClass().getResourceAsStream(answerPath);
|
||||
BufferedReader breader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
|
||||
BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||
String[] strings = breader.readLine().replaceAll(" +", " ").trim().split(" ");
|
||||
HashSet<String> answer = new HashSet<>(Arrays.asList(strings));
|
||||
HashSet<String> answer = new HashSet<String>(Arrays.asList(strings));
|
||||
stream.close();
|
||||
|
||||
stream = this.getClass().getResourceAsStream(testPath);
|
||||
|
||||
InputStreamReader reader = new InputStreamReader(stream, StandardCharsets.UTF_8);
|
||||
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
|
||||
|
||||
TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader);
|
||||
tokenStream.reset();
|
||||
HashSet<String> result = new HashSet<>();
|
||||
HashSet<String> result = new HashSet<String>();
|
||||
while (tokenStream.incrementToken()) {
|
||||
CharTermAttribute attribute1 = tokenStream.getAttribute(CharTermAttribute.class);
|
||||
result.add(attribute1.toString());
|
||||
@ -121,45 +114,6 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
|
||||
|
||||
stream.close();
|
||||
|
||||
MatcherAssert.assertThat(result, equalTo(answer));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPositionIncrement() throws IOException {
|
||||
EnglishAnalyzer englishAnalyzer = new EnglishAnalyzer();
|
||||
assertTokenStreamContents(
|
||||
englishAnalyzer.tokenStream("test", "There are tests!"),
|
||||
new String[]{"there", "are", "be", "test"},
|
||||
new int[]{0, 6, 6, 10},
|
||||
new int[]{5, 9, 9, 15},
|
||||
new String[]{"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>"},
|
||||
new int[]{1, 1, 0, 1}
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testKeywordHandling() throws IOException {
|
||||
Analyzer analyzer = new EnglishKeywordTestAnalyzer();
|
||||
assertTokenStreamContents(
|
||||
analyzer.tokenStream("test", "Tests shouldn't be stemmed, but tests should!"),
|
||||
new String[]{"tests", "shouldn't", "be", "stem", "but", "test", "shall"}
|
||||
);
|
||||
}
|
||||
|
||||
private static class EnglishKeywordTestAnalyzer extends Analyzer {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String s) {
|
||||
StandardTokenizer src = new StandardTokenizer();
|
||||
CharArraySet dontStem = new CharArraySet(1, false);
|
||||
dontStem.add("Tests");
|
||||
TokenFilter filter = new SetKeywordMarkerFilter(src, dontStem);
|
||||
filter = new LowerCaseFilter(filter);
|
||||
try {
|
||||
filter = new MorphologyFilter(filter, new EnglishLuceneMorphology());
|
||||
} catch (IOException ex) {
|
||||
throw new RuntimeException("cannot create EnglishLuceneMorphology", ex);
|
||||
}
|
||||
return new TokenStreamComponents(src, filter);
|
||||
}
|
||||
assertThat(result, equalTo(answer));
|
||||
}
|
||||
}
|
@ -17,23 +17,22 @@ package org.apache.lucene.morphology;
|
||||
|
||||
import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
|
||||
import org.apache.lucene.morphology.english.EnglishLuceneMorphology;
|
||||
import org.hamcrest.MatcherAssert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import static org.hamcrest.CoreMatchers.equalTo;
|
||||
import static org.junit.Assert.assertThat;
|
||||
|
||||
|
||||
public class TestLuceneMorph {
|
||||
public class LuceneMorphTest {
|
||||
|
||||
@Test
|
||||
public void englishMorphologyShouldGetCorrectNormalForm() throws IOException {
|
||||
@ -53,13 +52,14 @@ public class TestLuceneMorph {
|
||||
|
||||
private void testMorphology(LuceneMorphology luceneMorph, String pathToTestData) throws IOException {
|
||||
InputStream stream = this.getClass().getResourceAsStream(pathToTestData);
|
||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
|
||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||
String s = bufferedReader.readLine();
|
||||
while (s != null) {
|
||||
String[] qa = s.trim().split(" ");
|
||||
Set<String> result = new HashSet<>(Arrays.asList(qa).subList(1, qa.length));
|
||||
Set<String> stringList = new HashSet<>(luceneMorph.getNormalForms(qa[0]));
|
||||
MatcherAssert.assertThat(stringList, equalTo(result));
|
||||
Set<String> result = new HashSet<String>();
|
||||
result.addAll(Arrays.asList(qa).subList(1, qa.length));
|
||||
Set<String> stringList = new HashSet<String>(luceneMorph.getNormalForms(qa[0]));
|
||||
assertThat(stringList, equalTo(result));
|
||||
s = bufferedReader.readLine();
|
||||
}
|
||||
}
|
@ -3,20 +3,27 @@
|
||||
<parent>
|
||||
<artifactId>morphology</artifactId>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<version>1.5</version>
|
||||
<version>1.2-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<artifactId>english</artifactId>
|
||||
<name>english</name>
|
||||
<version>1.5</version>
|
||||
<version>1.2-SNAPSHOT</version>
|
||||
<url>http://maven.apache.org</url>
|
||||
<dependencies>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<artifactId>morph</artifactId>
|
||||
<version>1.5</version>
|
||||
<version>1.2-SNAPSHOT</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>4.8.2</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
||||
</project>
|
@ -32,7 +32,7 @@ public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder {
|
||||
if (string.length() > 6) throw new SuffixToLongException("Suffix length should not be greater then " + 12);
|
||||
int result = 0;
|
||||
for (int i = 0; i < string.length(); i++) {
|
||||
int c = string.charAt(i) - ENGLISH_SMALL_LETTER_OFFSET;
|
||||
int c = 0 + string.charAt(i) - ENGLISH_SMALL_LETTER_OFFSET;
|
||||
if (c == 45 - ENGLISH_SMALL_LETTER_OFFSET) {
|
||||
c = DASH_CODE;
|
||||
}
|
||||
@ -48,7 +48,7 @@ public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder {
|
||||
|
||||
public int[] encodeToArray(String s) {
|
||||
|
||||
ArrayList<Integer> integers = new ArrayList<>();
|
||||
ArrayList<Integer> integers = new ArrayList<Integer>();
|
||||
while (s.length() > 6) {
|
||||
integers.add(encode(s.substring(0, 6)));
|
||||
s = s.substring(6);
|
||||
@ -64,16 +64,16 @@ public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder {
|
||||
}
|
||||
|
||||
public String decodeArray(int[] array) {
|
||||
StringBuilder result = new StringBuilder();
|
||||
String result = "";
|
||||
for (int i : array) {
|
||||
result.append(decode(i));
|
||||
result += decode(i);
|
||||
}
|
||||
return result.toString();
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
public String decode(Integer suffixN) {
|
||||
StringBuilder result = new StringBuilder();
|
||||
String result = "";
|
||||
while (suffixN > 27) {
|
||||
int c = suffixN % 28 + ENGLISH_SMALL_LETTER_OFFSET;
|
||||
if (c == ENGLISH_SMALL_LETTER_OFFSET) {
|
||||
@ -81,20 +81,21 @@ public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder {
|
||||
continue;
|
||||
}
|
||||
if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR;
|
||||
result.insert(0, (char) c);
|
||||
result = (char) c + result;
|
||||
suffixN /= 28;
|
||||
}
|
||||
long c = suffixN + ENGLISH_SMALL_LETTER_OFFSET;
|
||||
if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR;
|
||||
result.insert(0, (char) c);
|
||||
return result.toString();
|
||||
result = (char) c + result;
|
||||
return result;
|
||||
}
|
||||
|
||||
public boolean checkCharacter(char c) {
|
||||
int code = c;
|
||||
int code = 0 + c;
|
||||
if (code == 45) return true;
|
||||
code -= ENGLISH_SMALL_LETTER_OFFSET;
|
||||
return code > 0 && code < 27;
|
||||
if (code > 0 && code < 27) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
@ -16,8 +16,7 @@
|
||||
package org.apache.lucene.morphology.english;
|
||||
|
||||
import static org.hamcrest.core.IsEqual.equalTo;
|
||||
|
||||
import org.hamcrest.MatcherAssert;
|
||||
import static org.junit.Assert.assertThat;
|
||||
import org.junit.Before;
|
||||
|
||||
|
||||
@ -31,11 +30,11 @@ public class EnglishLetterDecoderEncoderTest {
|
||||
|
||||
@org.junit.Test
|
||||
public void testDecodeEncodeToArray() {
|
||||
MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("abcdefghijklmnopqrstuvwxyz")), equalTo("abcdefghijklmnopqrstuvwxyz"));
|
||||
MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("xyz")), equalTo("xyz"));
|
||||
MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrty")), equalTo("ytrrty"));
|
||||
MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrtyz")), equalTo("ytrrtyz"));
|
||||
MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrtyzqwqwe")), equalTo("ytrrtyzqwqwe"));
|
||||
assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("abcdefghijklmnopqrstuvwxyz")), equalTo("abcdefghijklmnopqrstuvwxyz"));
|
||||
assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("xyz")), equalTo("xyz"));
|
||||
assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrty")), equalTo("ytrrty"));
|
||||
assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrtyz")), equalTo("ytrrtyz"));
|
||||
assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrtyzqwqwe")), equalTo("ytrrtyzqwqwe"));
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -16,9 +16,9 @@
|
||||
package org.apache.lucene.morphology.english.stemmer;
|
||||
|
||||
import org.apache.lucene.morphology.english.EnglishLuceneMorphology;
|
||||
import org.hamcrest.MatcherAssert;
|
||||
import org.junit.Test;
|
||||
import static org.hamcrest.core.IsEqual.equalTo;
|
||||
import static org.junit.Assert.assertThat;
|
||||
|
||||
|
||||
public class EnglishStemmerTest {
|
||||
@ -26,24 +26,24 @@ public class EnglishStemmerTest {
|
||||
public void testGetStemmedWord() throws Exception {
|
||||
EnglishLuceneMorphology englishLuceneMorphology = new EnglishLuceneMorphology();
|
||||
EnglishStemmer englishStemmer = new EnglishStemmer(englishLuceneMorphology);
|
||||
MatcherAssert.assertThat(englishStemmer.getStemmedWord("running"),equalTo("run"));
|
||||
MatcherAssert.assertThat(englishStemmer.getStemmedWord("run"),equalTo("run"));
|
||||
MatcherAssert.assertThat(englishStemmer.getStemmedWord("killed"),equalTo("kill"));
|
||||
MatcherAssert.assertThat(englishStemmer.getStemmedWord("kill"),equalTo("kill"));
|
||||
MatcherAssert.assertThat(englishStemmer.getStemmedWord("networking"),equalTo("network"));
|
||||
MatcherAssert.assertThat(englishStemmer.getStemmedWord("network"),equalTo("network"));
|
||||
MatcherAssert.assertThat(englishStemmer.getStemmedWord("statistics"),equalTo("statistic"));
|
||||
MatcherAssert.assertThat(englishStemmer.getStemmedWord("statistic"),equalTo("statistic"));
|
||||
MatcherAssert.assertThat(englishStemmer.getStemmedWord("stats"),equalTo("stat"));
|
||||
MatcherAssert.assertThat(englishStemmer.getStemmedWord("stat"),equalTo("stat"));
|
||||
MatcherAssert.assertThat(englishStemmer.getStemmedWord("countries"),equalTo("country"));
|
||||
MatcherAssert.assertThat(englishStemmer.getStemmedWord("country"),equalTo("country"));
|
||||
MatcherAssert.assertThat(englishStemmer.getStemmedWord("delete"),equalTo("delete"));
|
||||
MatcherAssert.assertThat(englishStemmer.getStemmedWord("ended"),equalTo("end"));
|
||||
MatcherAssert.assertThat(englishStemmer.getStemmedWord("end"),equalTo("end"));
|
||||
MatcherAssert.assertThat(englishStemmer.getStemmedWord("ends"),equalTo("end"));
|
||||
MatcherAssert.assertThat(englishStemmer.getStemmedWord("given"),equalTo("give"));
|
||||
MatcherAssert.assertThat(englishStemmer.getStemmedWord("give"),equalTo("give"));
|
||||
MatcherAssert.assertThat(englishStemmer.getStemmedWord("log4j"),equalTo("log4j"));
|
||||
assertThat(englishStemmer.getStemmedWord("running"),equalTo("run"));
|
||||
assertThat(englishStemmer.getStemmedWord("run"),equalTo("run"));
|
||||
assertThat(englishStemmer.getStemmedWord("killed"),equalTo("kill"));
|
||||
assertThat(englishStemmer.getStemmedWord("kill"),equalTo("kill"));
|
||||
assertThat(englishStemmer.getStemmedWord("networking"),equalTo("network"));
|
||||
assertThat(englishStemmer.getStemmedWord("network"),equalTo("network"));
|
||||
assertThat(englishStemmer.getStemmedWord("statistics"),equalTo("statistic"));
|
||||
assertThat(englishStemmer.getStemmedWord("statistic"),equalTo("statistic"));
|
||||
assertThat(englishStemmer.getStemmedWord("stats"),equalTo("stat"));
|
||||
assertThat(englishStemmer.getStemmedWord("stat"),equalTo("stat"));
|
||||
assertThat(englishStemmer.getStemmedWord("countries"),equalTo("country"));
|
||||
assertThat(englishStemmer.getStemmedWord("country"),equalTo("country"));
|
||||
assertThat(englishStemmer.getStemmedWord("delete"),equalTo("delete"));
|
||||
assertThat(englishStemmer.getStemmedWord("ended"),equalTo("end"));
|
||||
assertThat(englishStemmer.getStemmedWord("end"),equalTo("end"));
|
||||
assertThat(englishStemmer.getStemmedWord("ends"),equalTo("end"));
|
||||
assertThat(englishStemmer.getStemmedWord("given"),equalTo("give"));
|
||||
assertThat(englishStemmer.getStemmedWord("give"),equalTo("give"));
|
||||
assertThat(englishStemmer.getStemmedWord("log4j"),equalTo("log4j"));
|
||||
}
|
||||
}
|
||||
|
@ -1,4 +1,4 @@
|
||||
Copyright 2009 Alexander Kuznetsov
|
||||
Copyright ${project.inceptionYear} ${owner}
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
|
@ -3,12 +3,13 @@
|
||||
<parent>
|
||||
<artifactId>morphology</artifactId>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<version>1.5</version>
|
||||
<version>1.2-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<artifactId>morph</artifactId>
|
||||
<name>morph</name>
|
||||
<version>1.5</version>
|
||||
<version>1.2-SNAPSHOT</version>
|
||||
<url>http://maven.apache.org</url>
|
||||
|
||||
</project>
|
||||
|
@ -21,7 +21,7 @@ import java.util.ArrayList;
|
||||
|
||||
public abstract class BaseLetterDecoderEncoder implements LetterDecoderEncoder {
|
||||
public int[] encodeToArray(String s) {
|
||||
ArrayList<Integer> integers = new ArrayList<>();
|
||||
ArrayList<Integer> integers = new ArrayList<Integer>();
|
||||
while (s.length() > 6) {
|
||||
integers.add(encode(s.substring(0, 6)));
|
||||
s = s.substring(6);
|
||||
@ -37,11 +37,11 @@ public abstract class BaseLetterDecoderEncoder implements LetterDecoderEncoder {
|
||||
}
|
||||
|
||||
public String decodeArray(int[] array) {
|
||||
StringBuilder result = new StringBuilder();
|
||||
String result = "";
|
||||
for (int i : array) {
|
||||
result.append(decode(i));
|
||||
result += decode(i);
|
||||
}
|
||||
return result.toString();
|
||||
return result;
|
||||
}
|
||||
|
||||
public boolean checkString(String word) {
|
||||
|
@ -16,7 +16,6 @@
|
||||
package org.apache.lucene.morphology;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Objects;
|
||||
|
||||
|
||||
public class Heuristic implements Serializable {
|
||||
@ -27,10 +26,10 @@ public class Heuristic implements Serializable {
|
||||
|
||||
public Heuristic(String s) {
|
||||
String[] strings = s.split("\\|");
|
||||
actualSuffixLength = Byte.parseByte(strings[0]);
|
||||
actualSuffixLength = Byte.valueOf(strings[0]);
|
||||
actualNormalSuffix = strings[1];
|
||||
formMorphInfo = Short.parseShort(strings[2]);
|
||||
normalFormMorphInfo = Short.parseShort(strings[3]);
|
||||
formMorphInfo = Short.valueOf(strings[2]);
|
||||
normalFormMorphInfo = Short.valueOf(strings[3]);
|
||||
}
|
||||
|
||||
public Heuristic(byte actualSuffixLength, String actualNormalSuffix, short formMorphInfo, short normalFormMorphInfo) {
|
||||
@ -71,12 +70,15 @@ public class Heuristic implements Serializable {
|
||||
if (actualSuffixLength != heuristic.actualSuffixLength) return false;
|
||||
if (formMorphInfo != heuristic.formMorphInfo) return false;
|
||||
if (normalFormMorphInfo != heuristic.normalFormMorphInfo) return false;
|
||||
return Objects.equals(actualNormalSuffix, heuristic.actualNormalSuffix);
|
||||
if (actualNormalSuffix != null ? !actualNormalSuffix.equals(heuristic.actualNormalSuffix) : heuristic.actualNormalSuffix != null)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int result = actualSuffixLength;
|
||||
int result = (int) actualSuffixLength;
|
||||
result = 31 * result + (actualNormalSuffix != null ? actualNormalSuffix.hashCode() : 0);
|
||||
result = 31 * result + (int) formMorphInfo;
|
||||
result = 31 * result + (int) normalFormMorphInfo;
|
||||
|
@ -17,17 +17,17 @@ package org.apache.lucene.morphology;
|
||||
|
||||
|
||||
public interface LetterDecoderEncoder {
|
||||
Integer encode(String string);
|
||||
public Integer encode(String string);
|
||||
|
||||
int[] encodeToArray(String s);
|
||||
public int[] encodeToArray(String s);
|
||||
|
||||
String decodeArray(int[] array);
|
||||
public String decodeArray(int[] array);
|
||||
|
||||
String decode(Integer suffixN);
|
||||
public String decode(Integer suffixN);
|
||||
|
||||
boolean checkCharacter(char c);
|
||||
public boolean checkCharacter(char c);
|
||||
|
||||
boolean checkString(String word);
|
||||
public boolean checkString(String word);
|
||||
|
||||
String cleanString(String s);
|
||||
public String cleanString(String s);
|
||||
}
|
||||
|
@ -34,13 +34,13 @@ public class LuceneMorphology extends MorphologyImpl {
|
||||
|
||||
protected void readRules(BufferedReader bufferedReader) throws IOException {
|
||||
String s;
|
||||
int amount;
|
||||
Integer amount;
|
||||
s = bufferedReader.readLine();
|
||||
amount = Integer.parseInt(s);
|
||||
amount = Integer.valueOf(s);
|
||||
rules = new Heuristic[amount][];
|
||||
for (int i = 0; i < amount; i++) {
|
||||
String s1 = bufferedReader.readLine();
|
||||
int ruleLenght = Integer.parseInt(s1);
|
||||
Integer ruleLenght = Integer.valueOf(s1);
|
||||
Heuristic[] heuristics = new Heuristic[ruleLenght];
|
||||
for (int j = 0; j < ruleLenght; j++) {
|
||||
heuristics[j] = new Heuristic(bufferedReader.readLine());
|
||||
@ -51,7 +51,7 @@ public class LuceneMorphology extends MorphologyImpl {
|
||||
|
||||
|
||||
private Heuristic[] modeifyHeuristic(Heuristic[] heuristics) {
|
||||
ArrayList<Heuristic> result = new ArrayList<>();
|
||||
ArrayList<Heuristic> result = new ArrayList<Heuristic>();
|
||||
for (Heuristic heuristic : heuristics) {
|
||||
boolean isAdded = true;
|
||||
for (Heuristic ch : result) {
|
||||
@ -61,7 +61,7 @@ public class LuceneMorphology extends MorphologyImpl {
|
||||
result.add(heuristic);
|
||||
}
|
||||
}
|
||||
return result.toArray(new Heuristic[0]);
|
||||
return result.toArray(new Heuristic[result.size()]);
|
||||
}
|
||||
|
||||
public boolean checkString(String s) {
|
||||
|
@ -17,7 +17,6 @@ package org.apache.lucene.morphology;
|
||||
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@ -48,7 +47,7 @@ public class MorphologyImpl implements Morphology {
|
||||
}
|
||||
|
||||
public List<String> getNormalForms(String s) {
|
||||
ArrayList<String> result = new ArrayList<>();
|
||||
ArrayList<String> result = new ArrayList<String>();
|
||||
int[] ints = decoderEncoder.encodeToArray(revertWord(s));
|
||||
int ruleId = findRuleId(ints);
|
||||
boolean notSeenEmptyString = true;
|
||||
@ -65,7 +64,7 @@ public class MorphologyImpl implements Morphology {
|
||||
}
|
||||
|
||||
public List<String> getMorphInfo(String s) {
|
||||
ArrayList<String> result = new ArrayList<>();
|
||||
ArrayList<String> result = new ArrayList<String>();
|
||||
int[] ints = decoderEncoder.encodeToArray(revertWord(s));
|
||||
int ruleId = findRuleId(ints);
|
||||
for (Heuristic h : rules[rulesId[ruleId]]) {
|
||||
@ -101,14 +100,14 @@ public class MorphologyImpl implements Morphology {
|
||||
private int compareToInts(int[] i1, int[] i2) {
|
||||
int minLength = Math.min(i1.length, i2.length);
|
||||
for (int i = 0; i < minLength; i++) {
|
||||
int i3 = Integer.compare(i1[i], i2[i]);
|
||||
int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1);
|
||||
if (i3 != 0) return i3;
|
||||
}
|
||||
return i1.length - i2.length;
|
||||
}
|
||||
|
||||
public void writeToFile(String fileName) throws IOException {
|
||||
OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8);
|
||||
OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8");
|
||||
writer.write(separators.length + "\n");
|
||||
for (int[] i : separators) {
|
||||
writer.write(i.length + "\n");
|
||||
@ -139,7 +138,7 @@ public class MorphologyImpl implements Morphology {
|
||||
}
|
||||
|
||||
private void readFromInputStream(InputStream inputStream) throws IOException {
|
||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8));
|
||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
|
||||
String s = bufferedReader.readLine();
|
||||
Integer amount = Integer.valueOf(s);
|
||||
|
||||
@ -154,9 +153,9 @@ public class MorphologyImpl implements Morphology {
|
||||
|
||||
private void readGrammaInfo(BufferedReader bufferedReader) throws IOException {
|
||||
String s;
|
||||
int amount;
|
||||
Integer amount;
|
||||
s = bufferedReader.readLine();
|
||||
amount = Integer.parseInt(s);
|
||||
amount = Integer.valueOf(s);
|
||||
grammarInfo = new String[amount];
|
||||
for (int i = 0; i < amount; i++) {
|
||||
grammarInfo[i] = bufferedReader.readLine();
|
||||
@ -165,13 +164,13 @@ public class MorphologyImpl implements Morphology {
|
||||
|
||||
protected void readRules(BufferedReader bufferedReader) throws IOException {
|
||||
String s;
|
||||
int amount;
|
||||
Integer amount;
|
||||
s = bufferedReader.readLine();
|
||||
amount = Integer.parseInt(s);
|
||||
amount = Integer.valueOf(s);
|
||||
rules = new Heuristic[amount][];
|
||||
for (int i = 0; i < amount; i++) {
|
||||
String s1 = bufferedReader.readLine();
|
||||
int ruleLength = Integer.parseInt(s1);
|
||||
Integer ruleLength = Integer.valueOf(s1);
|
||||
rules[i] = new Heuristic[ruleLength];
|
||||
for (int j = 0; j < ruleLength; j++) {
|
||||
rules[i][j] = new Heuristic(bufferedReader.readLine());
|
||||
@ -183,7 +182,7 @@ public class MorphologyImpl implements Morphology {
|
||||
rulesId = new short[amount];
|
||||
for (int i = 0; i < amount; i++) {
|
||||
String s1 = bufferedReader.readLine();
|
||||
rulesId[i] = Short.parseShort(s1);
|
||||
rulesId[i] = Short.valueOf(s1);
|
||||
}
|
||||
}
|
||||
|
||||
@ -191,10 +190,10 @@ public class MorphologyImpl implements Morphology {
|
||||
separators = new int[amount][];
|
||||
for (int i = 0; i < amount; i++) {
|
||||
String s1 = bufferedReader.readLine();
|
||||
int wordLenght = Integer.parseInt(s1);
|
||||
Integer wordLenght = Integer.valueOf(s1);
|
||||
separators[i] = new int[wordLenght];
|
||||
for (int j = 0; j < wordLenght; j++) {
|
||||
separators[i][j] = Integer.parseInt(bufferedReader.readLine());
|
||||
separators[i][j] = Integer.valueOf(bufferedReader.readLine());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -17,10 +17,11 @@
|
||||
package org.apache.lucene.morphology.analyzer;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.payloads.PayloadEncoder;
|
||||
import org.apache.lucene.analysis.payloads.PayloadHelper;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.morphology.LetterDecoderEncoder;
|
||||
import org.apache.lucene.morphology.LuceneMorphology;
|
||||
@ -28,7 +29,7 @@ import org.apache.lucene.util.BytesRef;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
public class MorphologyAnalyzer extends Analyzer {
|
||||
private LuceneMorphology luceneMorph;
|
||||
@ -50,29 +51,17 @@ public class MorphologyAnalyzer extends Analyzer {
|
||||
protected TokenStreamComponents createComponents(String s) {
|
||||
|
||||
StandardTokenizer src = new StandardTokenizer();
|
||||
final PayloadEncoder encoder = new PayloadEncoder() {
|
||||
@Override
|
||||
public BytesRef encode(char[] buffer) {
|
||||
final Float payload = Float.valueOf(new String(buffer));
|
||||
System.out.println(payload);
|
||||
final byte[] bytes = PayloadHelper.encodeFloat(payload);
|
||||
return new BytesRef(bytes, 0, bytes.length);
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef encode(char[] buffer, int offset, int length) {
|
||||
|
||||
final Float payload = Float.valueOf(new String(buffer, offset, length));
|
||||
System.out.println(payload);
|
||||
final byte[] bytes = PayloadHelper.encodeFloat(payload);
|
||||
|
||||
return new BytesRef(bytes, 0, bytes.length);
|
||||
}
|
||||
};
|
||||
|
||||
TokenFilter filter = new LowerCaseFilter(src);
|
||||
TokenFilter filter = new StandardFilter(src);
|
||||
filter = new LowerCaseFilter(filter);
|
||||
filter = new MorphologyFilter(filter, luceneMorph);
|
||||
|
||||
return new TokenStreamComponents(src::setReader, filter);
|
||||
return new TokenStreamComponents(src, filter) {
|
||||
@Override
|
||||
protected void setReader(final Reader reader) throws IOException {
|
||||
super.setReader(reader);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@ -19,22 +19,18 @@ package org.apache.lucene.morphology.analyzer;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.morphology.LuceneMorphology;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class MorphologyFilter extends TokenFilter {
|
||||
private LuceneMorphology luceneMorph;
|
||||
private Iterator<String> iterator;
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||
private final PositionIncrementAttribute position = addAttribute(PositionIncrementAttribute.class);
|
||||
private State state = null;
|
||||
|
||||
public MorphologyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) {
|
||||
super(tokenStream);
|
||||
@ -43,45 +39,27 @@ public class MorphologyFilter extends TokenFilter {
|
||||
|
||||
|
||||
final public boolean incrementToken() throws IOException {
|
||||
if (iterator != null) {
|
||||
if (iterator.hasNext()) {
|
||||
restoreState(state);
|
||||
position.setPositionIncrement(0);
|
||||
termAtt.setEmpty().append(iterator.next());
|
||||
return true;
|
||||
} else {
|
||||
state = null;
|
||||
iterator = null;
|
||||
}
|
||||
}
|
||||
while (true) {
|
||||
boolean oldToken = true;
|
||||
while (iterator == null || !iterator.hasNext()) {
|
||||
boolean b = input.incrementToken();
|
||||
if (!b) {
|
||||
return false;
|
||||
}
|
||||
if (!keywordAttr.isKeyword() && termAtt.length() > 0) {
|
||||
String s = new String(termAtt.buffer(), 0, termAtt.length());
|
||||
if (luceneMorph.checkString(s)) {
|
||||
List<String> forms = luceneMorph.getNormalForms(s);
|
||||
if (forms.isEmpty()) {
|
||||
continue;
|
||||
} else if (forms.size() == 1) {
|
||||
termAtt.setEmpty().append(forms.get(0));
|
||||
} else {
|
||||
state = captureState();
|
||||
iterator = forms.iterator();
|
||||
termAtt.setEmpty().append(iterator.next());
|
||||
}
|
||||
}
|
||||
String s = new String(termAtt.buffer(), 0, termAtt.length());
|
||||
if (luceneMorph.checkString(s)) {
|
||||
oldToken = false;
|
||||
iterator = luceneMorph.getNormalForms(s).iterator();
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
String s = iterator.next();
|
||||
termAtt.setEmpty();
|
||||
termAtt.append(s);
|
||||
if (oldToken) {
|
||||
position.setPositionIncrement(0);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
state = null;
|
||||
iterator = null;
|
||||
}
|
||||
}
|
||||
|
122
pom.xml
122
pom.xml
@ -1,10 +1,11 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<artifactId>morphology</artifactId>
|
||||
<packaging>pom</packaging>
|
||||
<version>1.5</version>
|
||||
<version>1.2-SNAPSHOT</version>
|
||||
<name>morphology</name>
|
||||
<url>http://maven.apache.org</url>
|
||||
|
||||
@ -15,12 +16,6 @@
|
||||
<tag>HEAD</tag>
|
||||
</scm>
|
||||
|
||||
<properties>
|
||||
<lucene.version>9.3.0</lucene.version>
|
||||
<morphology.version>1.5</morphology.version>
|
||||
<junit.version>4.13</junit.version>
|
||||
</properties>
|
||||
|
||||
<distributionManagement>
|
||||
<repository>
|
||||
<id>bintray</id>
|
||||
@ -28,36 +23,28 @@
|
||||
</repository>
|
||||
</distributionManagement>
|
||||
|
||||
<licenses>
|
||||
<license>
|
||||
<name>Apache License, Version 2.0</name>
|
||||
<url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
|
||||
<distribution>repo</distribution>
|
||||
</license>
|
||||
</licenses>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-test-framework</artifactId>
|
||||
<version>${lucene.version}</version>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>4.8.2</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.hamcrest</groupId>
|
||||
<artifactId>hamcrest-all</artifactId>
|
||||
<version>1.3</version>
|
||||
<version>1.1</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-core</artifactId>
|
||||
<version>${lucene.version}</version>
|
||||
<version>5.1.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-analysis-common</artifactId>
|
||||
<version>${lucene.version}</version>
|
||||
<artifactId>lucene-analyzers-common</artifactId>
|
||||
<version>5.1.0</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
@ -65,11 +52,11 @@
|
||||
<repository>
|
||||
<id>maven2-repository.dev.java.net</id>
|
||||
<name>Java.net Repository for Maven</name>
|
||||
<url>https://download.java.net/maven/2/</url>
|
||||
<url>http://download.java.net/maven/2/</url>
|
||||
</repository>
|
||||
<repository>
|
||||
<id>bintray</id>
|
||||
<url>https://dl.bintray.com/akuznetsov/russianmorphology</url>
|
||||
<url>http://dl.bintray.com/akuznetsov/russianmorphology</url>
|
||||
<releases>
|
||||
<enabled>true</enabled>
|
||||
</releases>
|
||||
@ -78,25 +65,13 @@
|
||||
</snapshots>
|
||||
</repository>
|
||||
</repositories>
|
||||
|
||||
<pluginRepositories>
|
||||
<pluginRepository>
|
||||
<id>mc-release</id>
|
||||
<name>maven-license-plugin repository of releases</name>
|
||||
<url>https://mc-repo.googlecode.com/svn/maven2/releases</url>
|
||||
<snapshots>
|
||||
<enabled>false</enabled>
|
||||
</snapshots>
|
||||
<releases>
|
||||
<enabled>true</enabled>
|
||||
</releases>
|
||||
</pluginRepository>
|
||||
</pluginRepositories>
|
||||
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<artifactId>maven-release-plugin</artifactId>
|
||||
<version>2.5.3</version>
|
||||
<version>2.5.2</version>
|
||||
<configuration>
|
||||
<useReleaseProfile>false</useReleaseProfile>
|
||||
<releaseProfiles>release</releaseProfiles>
|
||||
@ -107,37 +82,42 @@
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<version>3.8.1</version>
|
||||
<configuration>
|
||||
<source>11</source>
|
||||
<target>11</target>
|
||||
<source>1.7</source>
|
||||
<target>1.7</target>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin> <!-- usage: http://code.google.com/p/maven-license-plugin/wiki/HowTo -->
|
||||
<artifactId>maven-license-plugin</artifactId>
|
||||
<groupId>com.google.code.maven-license-plugin</groupId>
|
||||
<version>1.4.0</version>
|
||||
<configuration>
|
||||
<basedir>${project.parent.basedir}</basedir>
|
||||
<header>etc/header.txt</header>
|
||||
<excludes>
|
||||
<exclude>**/*.txt</exclude>
|
||||
<exclude>**/*.info</exclude>
|
||||
<exclude>**/pom.xml</exclude>
|
||||
</excludes>
|
||||
<includes>
|
||||
<include>**/src/**</include>
|
||||
</includes>
|
||||
</configuration>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>test</phase>
|
||||
<goals>
|
||||
<goal>check</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<!--<plugin> <!– usage: http://code.google.com/p/maven-license-plugin/wiki/HowTo –>-->
|
||||
|
||||
<!--<groupId>com.mycila</groupId>-->
|
||||
<!--<artifactId>license-maven-plugin</artifactId>-->
|
||||
<!--<version>2.11</version>-->
|
||||
|
||||
<!--<configuration>-->
|
||||
<!--<properties>-->
|
||||
<!--<owner>Alexander Kuznetsov</owner>-->
|
||||
<!--<!–<email>mathieu.carbou@gmail.com</email>–>-->
|
||||
<!--</properties>-->
|
||||
<!--<basedir>${project.parent.basedir}</basedir>-->
|
||||
<!--<header>etc/header.txt</header>-->
|
||||
<!--<excludes>-->
|
||||
<!--<exclude>**/*.txt</exclude>-->
|
||||
<!--<exclude>**/*.info</exclude>-->
|
||||
<!--<exclude>**/pom.xml</exclude>-->
|
||||
<!--</excludes>-->
|
||||
<!--<includes>-->
|
||||
<!--<include>**/src/**</include>-->
|
||||
<!--</includes>-->
|
||||
<!--</configuration>-->
|
||||
<!--<executions>-->
|
||||
<!--<execution>-->
|
||||
<!--<phase>test</phase>-->
|
||||
<!--<goals>-->
|
||||
<!--<goal>check</goal>-->
|
||||
<!--</goals>-->
|
||||
<!--</execution>-->
|
||||
<!--</executions>-->
|
||||
<!--</plugin>-->
|
||||
</plugins>
|
||||
</build>
|
||||
<profiles>
|
||||
@ -147,7 +127,6 @@
|
||||
<plugins>
|
||||
<plugin>
|
||||
<artifactId>maven-source-plugin</artifactId>
|
||||
<version>3.2.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>attach-sources</id>
|
||||
@ -159,7 +138,6 @@
|
||||
</plugin>
|
||||
<plugin>
|
||||
<artifactId>maven-javadoc-plugin</artifactId>
|
||||
<version>3.3.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>attach-javadocs</id>
|
||||
@ -178,6 +156,6 @@
|
||||
<module>dictionary-reader</module>
|
||||
<module>russian</module>
|
||||
<module>english</module>
|
||||
<module>solr-morphology-analysis</module>
|
||||
<module>context</module>
|
||||
</modules>
|
||||
</project>
|
||||
</project>
|
@ -3,12 +3,13 @@
|
||||
<parent>
|
||||
<artifactId>morphology</artifactId>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<version>1.5</version>
|
||||
<version>1.2-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<artifactId>russian</artifactId>
|
||||
<name>russian</name>
|
||||
<version>1.5</version>
|
||||
<version>1.2-SNAPSHOT</version>
|
||||
<url>http://maven.apache.org</url>
|
||||
<dependencies>
|
||||
|
||||
@ -16,15 +17,15 @@
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<artifactId>morph</artifactId>
|
||||
<version>1.5</version>
|
||||
<version>1.2-SNAPSHOT</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>${junit.version}</version>
|
||||
<version>4.8.2</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
</project>
|
||||
</project>
|
@ -20,6 +20,7 @@ import org.apache.lucene.morphology.LetterDecoderEncoder;
|
||||
import org.apache.lucene.morphology.SuffixToLongException;
|
||||
import org.apache.lucene.morphology.WrongCharaterException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
|
||||
/**
|
||||
@ -41,7 +42,7 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
|
||||
throw new SuffixToLongException("Suffix length should not be greater then " + WORD_PART_LENGHT + " " + string);
|
||||
int result = 0;
|
||||
for (int i = 0; i < string.length(); i++) {
|
||||
int c = string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET;
|
||||
int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET;
|
||||
if (c == 45 - RUSSIAN_SMALL_LETTER_OFFSET) {
|
||||
c = DASH_CODE;
|
||||
}
|
||||
@ -57,7 +58,7 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
|
||||
}
|
||||
|
||||
public int[] encodeToArray(String s) {
|
||||
LinkedList<Integer> integers = new LinkedList<>();
|
||||
LinkedList<Integer> integers = new LinkedList<Integer>();
|
||||
while (s.length() > WORD_PART_LENGHT) {
|
||||
integers.add(encode(s.substring(0, WORD_PART_LENGHT)));
|
||||
s = s.substring(WORD_PART_LENGHT);
|
||||
@ -73,16 +74,16 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
|
||||
}
|
||||
|
||||
public String decodeArray(int[] array) {
|
||||
StringBuilder result = new StringBuilder();
|
||||
String result = "";
|
||||
for (int i : array) {
|
||||
result.append(decode(i));
|
||||
result += decode(i);
|
||||
}
|
||||
return result.toString();
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
public String decode(Integer suffixN) {
|
||||
StringBuilder result = new StringBuilder();
|
||||
String result = "";
|
||||
while (suffixN > 33) {
|
||||
int c = suffixN % 34 + RUSSIAN_SMALL_LETTER_OFFSET;
|
||||
if (c == RUSSIAN_SMALL_LETTER_OFFSET) {
|
||||
@ -90,20 +91,21 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
|
||||
continue;
|
||||
}
|
||||
if (c == DASH_CODE + RUSSIAN_SMALL_LETTER_OFFSET) c = DASH_CHAR;
|
||||
result.insert(0, (char) c);
|
||||
result = (char) c + result;
|
||||
suffixN /= 34;
|
||||
}
|
||||
long c = suffixN + RUSSIAN_SMALL_LETTER_OFFSET;
|
||||
if (c == DASH_CODE + RUSSIAN_SMALL_LETTER_OFFSET) c = DASH_CHAR;
|
||||
result.insert(0, (char) c);
|
||||
return result.toString();
|
||||
result = (char) c + result;
|
||||
return result;
|
||||
}
|
||||
|
||||
public boolean checkCharacter(char c) {
|
||||
int code = c;
|
||||
int code = 0 + c;
|
||||
if (code == 45) return true;
|
||||
code -= RUSSIAN_SMALL_LETTER_OFFSET;
|
||||
return code > 0 && code < 33;
|
||||
if (code > 0 && code < 33) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean checkString(String word) {
|
||||
|
@ -17,7 +17,6 @@ package org.apache.lucene.morphology.russian;
|
||||
|
||||
import org.apache.lucene.morphology.SuffixToLongException;
|
||||
import org.apache.lucene.morphology.WrongCharaterException;
|
||||
import org.hamcrest.MatcherAssert;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
@ -25,9 +24,9 @@ import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import static org.hamcrest.core.IsEqual.equalTo;
|
||||
import static org.junit.Assert.assertThat;
|
||||
|
||||
public class RussianLetterDecoderEncoderTest {
|
||||
private RussianLetterDecoderEncoder decoderEncoder;
|
||||
@ -41,12 +40,12 @@ public class RussianLetterDecoderEncoderTest {
|
||||
@Test
|
||||
public void testShouldPreserverStringComporision() throws IOException {
|
||||
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-monotonic.txt");
|
||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
|
||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||
String s = bufferedReader.readLine();
|
||||
while (s != null) {
|
||||
String[] qa = s.trim().split(" ");
|
||||
if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT && qa[1].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) {
|
||||
MatcherAssert.assertThat(decoderEncoder.encode(qa[1]) > decoderEncoder.encode(qa[0]), equalTo(true));
|
||||
assertThat(decoderEncoder.encode(qa[1]) > decoderEncoder.encode(qa[0]), equalTo(true));
|
||||
}
|
||||
s = bufferedReader.readLine();
|
||||
}
|
||||
@ -56,13 +55,13 @@ public class RussianLetterDecoderEncoderTest {
|
||||
@Test
|
||||
public void testShouldCorrectDecodeEncode() throws IOException {
|
||||
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data.txt");
|
||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
|
||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||
String s = bufferedReader.readLine();
|
||||
while (s != null) {
|
||||
String[] qa = s.trim().split(" ");
|
||||
if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) {
|
||||
Integer encodedSuffix = decoderEncoder.encode(qa[0]);
|
||||
MatcherAssert.assertThat(decoderEncoder.decode(encodedSuffix), equalTo(qa[1]));
|
||||
assertThat(decoderEncoder.decode(encodedSuffix), equalTo(qa[1]));
|
||||
}
|
||||
s = bufferedReader.readLine();
|
||||
}
|
||||
@ -71,12 +70,12 @@ public class RussianLetterDecoderEncoderTest {
|
||||
@Test
|
||||
public void testShouldCorrectDecodeEncodeStringToArray() throws IOException {
|
||||
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data-for-array.txt");
|
||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
|
||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||
String s = bufferedReader.readLine();
|
||||
while (s != null) {
|
||||
String[] qa = s.trim().split(" ");
|
||||
int[] ecodedSuffix = decoderEncoder.encodeToArray(qa[0]);
|
||||
MatcherAssert.assertThat(decoderEncoder.decodeArray(ecodedSuffix), equalTo(qa[1]));
|
||||
assertThat(decoderEncoder.decodeArray(ecodedSuffix), equalTo(qa[1]));
|
||||
s = bufferedReader.readLine();
|
||||
}
|
||||
}
|
||||
|
@ -1,70 +0,0 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.morphology;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilterFactory;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
import org.apache.lucene.morphology.LuceneMorphology;
|
||||
import org.apache.lucene.morphology.analyzer.MorphologyFilter;
|
||||
import org.apache.lucene.util.ResourceLoader;
|
||||
import org.apache.lucene.util.ResourceLoaderAware;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Factory for {@link MorphologyFilter}, with configurable language
|
||||
* <p>
|
||||
* <b>Note:</b> Two languages are available now: English (default value) and Russian.
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="content" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
* <filter class="solr.LowerCaseFilterFactory"/>
|
||||
* <filter class="solr.MorphologyFilterFactory" language="English"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*/
|
||||
public class MorphologyFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
|
||||
|
||||
private static final String LANGUAGE_KEY = "language";
|
||||
|
||||
private String language;
|
||||
private LuceneMorphology luceneMorphology;
|
||||
|
||||
public MorphologyFilterFactory(Map<String, String> args) {
|
||||
super(args);
|
||||
|
||||
language = get(args, LANGUAGE_KEY, "English");
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
}
|
||||
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new MorphologyFilter(input, luceneMorphology);
|
||||
}
|
||||
|
||||
public void inform(ResourceLoader loader) {
|
||||
|
||||
String className = "org.apache.lucene.morphology." + language.toLowerCase() + "." + language + "LuceneMorphology";
|
||||
luceneMorphology = loader.newInstance(className, LuceneMorphology.class);
|
||||
}
|
||||
|
||||
public LuceneMorphology getLuceneMorphology() {
|
||||
return luceneMorphology;
|
||||
}
|
||||
}
|
@ -1,75 +0,0 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.morphology;
|
||||
|
||||
import org.apache.lucene.morphology.LuceneMorphology;
|
||||
import org.apache.lucene.morphology.english.EnglishLuceneMorphology;
|
||||
import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
|
||||
import org.apache.lucene.util.ClasspathResourceLoader;
|
||||
import org.apache.lucene.util.ResourceLoader;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class MorphologyFilterFactoryTest {
|
||||
|
||||
private static final String LANGUAGE_KEY = "language";
|
||||
private ResourceLoader loader = new ClasspathResourceLoader(MorphologyFilterFactoryTest.class);
|
||||
private Map<String, String> args;
|
||||
|
||||
@Before
|
||||
public void setUp() {
|
||||
args = new HashMap<>();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void if_RussianLanguageKey_then_CreateRussianMorphologyFilter() {
|
||||
|
||||
args.put(LANGUAGE_KEY, "Russian");
|
||||
MorphologyFilterFactory morphologyFilterFactory = new MorphologyFilterFactory(args);
|
||||
morphologyFilterFactory.inform(loader);
|
||||
|
||||
LuceneMorphology luceneMorphology = morphologyFilterFactory.getLuceneMorphology();
|
||||
|
||||
Assert.assertTrue("Creation the MorphologyFilterFactory with a Russian language key", luceneMorphology instanceof RussianLuceneMorphology);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void if_EnglishLanguageKey_then_CreateEnglishMorphologyFilter() {
|
||||
|
||||
args.put(LANGUAGE_KEY, "English");
|
||||
MorphologyFilterFactory morphologyFilterFactory = new MorphologyFilterFactory(args);
|
||||
morphologyFilterFactory.inform(loader);
|
||||
|
||||
LuceneMorphology luceneMorphology = morphologyFilterFactory.getLuceneMorphology();
|
||||
|
||||
Assert.assertTrue("Creation the MorphologyFilterFactory with a English language key", luceneMorphology instanceof EnglishLuceneMorphology);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void if_NoLanguageKey_then_CreateEnglishMorphologyFilter() {
|
||||
|
||||
MorphologyFilterFactory morphologyFilterFactory = new MorphologyFilterFactory(args);
|
||||
morphologyFilterFactory.inform(loader);
|
||||
|
||||
LuceneMorphology luceneMorphology = morphologyFilterFactory.getLuceneMorphology();
|
||||
|
||||
Assert.assertTrue("Creation the MorphologyFilterFactory without any language keys", luceneMorphology instanceof EnglishLuceneMorphology);
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user