Merge pull request #25 from MysterionRise/master

Clean up and update PR
This commit is contained in:
Alexander Kuznetsov 2021-12-13 15:11:41 +03:00 committed by GitHub
commit 40fe59dd02
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
33 changed files with 186 additions and 200 deletions

View File

@ -1,6 +1,6 @@
name: Java CI name: Java CI
on: [push] on: [push, pull_request]
jobs: jobs:
tests: tests:

View File

@ -6,7 +6,6 @@
<version>1.5</version> <version>1.5</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>org.apache.lucene.morphology</groupId>
<artifactId>dictionary-reader</artifactId> <artifactId>dictionary-reader</artifactId>
<name>dictionary-reader</name> <name>dictionary-reader</name>
<version>1.5</version> <version>1.5</version>

View File

@ -22,20 +22,19 @@ import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
/** /**
* This class contains the logic for reading a * This class contains the logic for reading a
* dictonary and producing each word with all its forms. * dictionary and producing each word with all its forms.
*/ */
public class DictionaryReader { public class DictionaryReader {
private String fileName; private String fileName;
private String fileEncoding = "windows-1251"; private String fileEncoding = "windows-1251";
private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>(); private List<List<FlexiaModel>> wordsFlexias = new ArrayList<>();
private Set<String> ignoredForm = new HashSet<String>(); private Set<String> ignoredForm;
public DictionaryReader(String fileName, Set<String> ignoredForm) { public DictionaryReader(String fileName, Set<String> ignoredForm) {
this.fileName = fileName; this.fileName = fileName;
@ -55,7 +54,7 @@ public class DictionaryReader {
private void readWords(BufferedReader reader, WordProcessor wordProcessor) throws IOException { private void readWords(BufferedReader reader, WordProcessor wordProcessor) throws IOException {
String s = reader.readLine(); String s = reader.readLine();
int count = Integer.valueOf(s); int count = Integer.parseInt(s);
int actual = 0; int actual = 0;
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
s = reader.readLine(); s = reader.readLine();
@ -79,7 +78,7 @@ public class DictionaryReader {
String wordBase = wd[0].toLowerCase(); String wordBase = wd[0].toLowerCase();
if (wordBase.startsWith("-")) return null; if (wordBase.startsWith("-")) return null;
wordBase = "#".equals(wordBase) ? "" : wordBase; wordBase = "#".equals(wordBase) ? "" : wordBase;
List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1])); List<FlexiaModel> models = wordsFlexias.get(Integer.parseInt(wd[1]));
FlexiaModel flexiaModel = models.get(0); FlexiaModel flexiaModel = models.get(0);
if (models.size() == 0 || ignoredForm.contains(flexiaModel.getCode())) { if (models.size() == 0 || ignoredForm.contains(flexiaModel.getCode())) {
return null; return null;
@ -96,7 +95,7 @@ public class DictionaryReader {
private void skipBlock(BufferedReader reader) throws IOException { private void skipBlock(BufferedReader reader) throws IOException {
String s = reader.readLine(); String s = reader.readLine();
int count = Integer.valueOf(s); int count = Integer.parseInt(s);
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
reader.readLine(); reader.readLine();
} }
@ -105,7 +104,7 @@ public class DictionaryReader {
private void readPrefix(BufferedReader reader) throws IOException { private void readPrefix(BufferedReader reader) throws IOException {
String s = reader.readLine(); String s = reader.readLine();
int count = Integer.valueOf(s); int count = Integer.parseInt(s);
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
reader.readLine(); reader.readLine();
} }
@ -113,10 +112,10 @@ public class DictionaryReader {
private void readFlexias(BufferedReader reader) throws IOException { private void readFlexias(BufferedReader reader) throws IOException {
String s = reader.readLine(); String s = reader.readLine();
int count = Integer.valueOf(s); int count = Integer.parseInt(s);
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
s = reader.readLine(); s = reader.readLine();
ArrayList<FlexiaModel> flexiaModelArrayList = new ArrayList<FlexiaModel>(); ArrayList<FlexiaModel> flexiaModelArrayList = new ArrayList<>();
wordsFlexias.add(flexiaModelArrayList); wordsFlexias.add(flexiaModelArrayList);
for (String line : s.split("%")) { for (String line : s.split("%")) {
addFlexia(flexiaModelArrayList, line); addFlexia(flexiaModelArrayList, line);

View File

@ -16,6 +16,8 @@
package org.apache.lucene.morphology.dictionary; package org.apache.lucene.morphology.dictionary;
import java.util.Objects;
/** /**
* Represents information about how a word form is created from its immutable part. * Represents information about how a word form is created from its immutable part.
*/ */
@ -74,11 +76,9 @@ public class FlexiaModel {
FlexiaModel that = (FlexiaModel) o; FlexiaModel that = (FlexiaModel) o;
if (code != null ? !code.equals(that.code) : that.code != null) return false; if (!Objects.equals(code, that.code)) return false;
if (prefix != null ? !prefix.equals(that.prefix) : that.prefix != null) return false; if (!Objects.equals(prefix, that.prefix)) return false;
if (suffix != null ? !suffix.equals(that.suffix) : that.suffix != null) return false; return Objects.equals(suffix, that.suffix);
return true;
} }
@Override @Override

View File

@ -29,8 +29,8 @@ import java.util.Map;
public class GrammarReader { public class GrammarReader {
private String fileName; private String fileName;
private String fileEncoding = "windows-1251"; private String fileEncoding = "windows-1251";
private List<String> grammarInfo = new ArrayList<String>(); private List<String> grammarInfo = new ArrayList<>();
private Map<String, Integer> inverseIndex = new HashMap<String, Integer>(); private Map<String, Integer> inverseIndex = new HashMap<>();
public GrammarReader(String fileName) throws IOException { public GrammarReader(String fileName) throws IOException {
this.fileName = fileName; this.fileName = fileName;
@ -50,7 +50,7 @@ public class GrammarReader {
line = line.trim(); line = line.trim();
if (!line.startsWith("//") && line.length() > 0) { if (!line.startsWith("//") && line.length() > 0) {
String[] strings = line.split(" ", 2); String[] strings = line.split(" ", 2);
Integer i = grammarInfo.size(); int i = grammarInfo.size();
inverseIndex.put(strings[0], i); inverseIndex.put(strings[0], i);
grammarInfo.add(i, strings[1]); grammarInfo.add(i, strings[1]);
} }
@ -63,7 +63,7 @@ public class GrammarReader {
} }
public String[] getGrammarInfoAsArray() { public String[] getGrammarInfoAsArray() {
return grammarInfo.toArray(new String[grammarInfo.size()]); return grammarInfo.toArray(new String[0]);
} }
public Map<String, Integer> getGrammarInverseIndex() { public Map<String, Integer> getGrammarInverseIndex() {

View File

@ -15,7 +15,7 @@
*/ */
package org.apache.lucene.morphology.dictionary; package org.apache.lucene.morphology.dictionary;
import java.util.Arrays; import java.util.Collections;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
@ -29,7 +29,7 @@ public class RemoveFlexiaWithPrefixes extends WordFilter {
@Override @Override
public List<WordCard> transform(WordCard wordCard) { public List<WordCard> transform(WordCard wordCard) {
List<FlexiaModel> flexiaModelsToRemove = new LinkedList<FlexiaModel>(); List<FlexiaModel> flexiaModelsToRemove = new LinkedList<>();
for (FlexiaModel fm : wordCard.getWordsForms()) { for (FlexiaModel fm : wordCard.getWordsForms()) {
if (fm.getPrefix().length() > 0) { if (fm.getPrefix().length() > 0) {
flexiaModelsToRemove.add(fm); flexiaModelsToRemove.add(fm);
@ -39,6 +39,6 @@ public class RemoveFlexiaWithPrefixes extends WordFilter {
wordCard.removeFlexia(fm); wordCard.removeFlexia(fm);
} }
return new LinkedList<WordCard>(Arrays.asList(wordCard)); return new LinkedList<>(Collections.singletonList(wordCard));
} }
} }

View File

@ -32,13 +32,13 @@ public class RussianAdvSplitterFilter extends WordFilter {
@Override @Override
public List<WordCard> transform(WordCard wordCard) { public List<WordCard> transform(WordCard wordCard) {
LinkedList<WordCard> result = new LinkedList<WordCard>(); LinkedList<WordCard> result = new LinkedList<>();
result.add(wordCard); result.add(wordCard);
String baseWord = ""; String baseWord = "";
String canonicalForm = ""; String canonicalForm = "";
String canonicalSuffix = ""; String canonicalSuffix = "";
List<FlexiaModel> flexiaModels = new LinkedList<FlexiaModel>(); List<FlexiaModel> flexiaModels = new LinkedList<>();
for (FlexiaModel flexiaModel : wordCard.getWordsForms()) { for (FlexiaModel flexiaModel : wordCard.getWordsForms()) {
if (flexiaModel.getPrefix().length() > 0) { if (flexiaModel.getPrefix().length() > 0) {
flexiaModels.add(new FlexiaModel(flexiaModel.getCode(), flexiaModel.getSuffix(), "")); flexiaModels.add(new FlexiaModel(flexiaModel.getCode(), flexiaModel.getSuffix(), ""));

View File

@ -27,9 +27,9 @@ import java.util.*;
// TODO: refactor this class // TODO: refactor this class
public class StatisticsCollector implements WordProcessor { public class StatisticsCollector implements WordProcessor {
private TreeMap<String, Set<Heuristic>> inverseIndex = new TreeMap<String, Set<Heuristic>>(); private TreeMap<String, Set<Heuristic>> inverseIndex = new TreeMap<>();
private Map<Set<Heuristic>, Integer> ruleInverseIndex = new HashMap<Set<Heuristic>, Integer>(); private Map<Set<Heuristic>, Integer> ruleInverseIndex = new HashMap<>();
private List<Set<Heuristic>> rules = new ArrayList<Set<Heuristic>>(); private List<Set<Heuristic>> rules = new ArrayList<>();
private GrammarReader grammarReader; private GrammarReader grammarReader;
private LetterDecoderEncoder decoderEncoder; private LetterDecoderEncoder decoderEncoder;
@ -39,18 +39,14 @@ public class StatisticsCollector implements WordProcessor {
this.decoderEncoder = decoderEncoder; this.decoderEncoder = decoderEncoder;
} }
public void process(WordCard wordCard) throws IOException { public void process(WordCard wordCard) {
cleanWordCard(wordCard); cleanWordCard(wordCard);
String normalStringMorph = wordCard.getWordsForms().get(0).getCode(); String normalStringMorph = wordCard.getWordsForms().get(0).getCode();
for (FlexiaModel fm : wordCard.getWordsForms()) { for (FlexiaModel fm : wordCard.getWordsForms()) {
Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph); Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph);
String form = revertWord(fm.create(wordCard.getBase())); String form = revertWord(fm.create(wordCard.getBase()));
Set<Heuristic> suffixHeuristics = inverseIndex.get(form); Set<Heuristic> suffixHeuristics = inverseIndex.computeIfAbsent(form, k -> new HashSet<>());
if (suffixHeuristics == null) {
suffixHeuristics = new HashSet<Heuristic>();
inverseIndex.put(form, suffixHeuristics);
}
suffixHeuristics.add(heuristic); suffixHeuristics.add(heuristic);
} }
} }
@ -69,7 +65,7 @@ public class StatisticsCollector implements WordProcessor {
public void saveHeuristic(String fileName) throws IOException { public void saveHeuristic(String fileName) throws IOException {
Map<Integer, Integer> dist = new TreeMap<Integer, Integer>(); Map<Integer, Integer> dist = new TreeMap<>();
Set<Heuristic> prevSet = null; Set<Heuristic> prevSet = null;
int count = 0; int count = 0;
for (String key : inverseIndex.keySet()) { for (String key : inverseIndex.keySet()) {
@ -120,11 +116,11 @@ public class StatisticsCollector implements WordProcessor {
} }
private String revertWord(String s) { private String revertWord(String s) {
String result = ""; StringBuilder result = new StringBuilder();
for (int i = 1; i <= s.length(); i++) { for (int i = 1; i <= s.length(); i++) {
result += s.charAt(s.length() - i); result.append(s.charAt(s.length() - i));
} }
return result; return result.toString();
} }
@ -132,15 +128,15 @@ public class StatisticsCollector implements WordProcessor {
String form = fm.create(wordBase); String form = fm.create(wordBase);
String normalForm = wordBase + canonicalSuffix; String normalForm = wordBase + canonicalSuffix;
Integer length = getCommonLength(form, normalForm); Integer length = getCommonLength(form, normalForm);
Integer actualSuffixLengh = form.length() - length; int actualSuffixLengh = form.length() - length;
String actualNormalSuffix = normalForm.substring(length); String actualNormalSuffix = normalForm.substring(length);
Integer integer = grammarReader.getGrammarInverseIndex().get(fm.getCode()); Integer integer = grammarReader.getGrammarInverseIndex().get(fm.getCode());
Integer nf = grammarReader.getGrammarInverseIndex().get(normalSuffixForm); Integer nf = grammarReader.getGrammarInverseIndex().get(normalSuffixForm);
return new Heuristic((byte) actualSuffixLengh.intValue(), actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue()); return new Heuristic((byte) actualSuffixLengh, actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue());
} }
public static Integer getCommonLength(String s1, String s2) { public static Integer getCommonLength(String s1, String s2) {
Integer length = Math.min(s1.length(), s2.length()); int length = Math.min(s1.length(), s2.length());
for (int i = 0; i < length; i++) { for (int i = 0; i < length; i++) {
if (s1.charAt(i) != s2.charAt(i)) return i; if (s1.charAt(i) != s2.charAt(i)) return i;
} }

View File

@ -26,7 +26,7 @@ public class WordCard {
private String canonicalForm; private String canonicalForm;
private String base; private String base;
private String canonicalSuffix; private String canonicalSuffix;
private List<FlexiaModel> wordsForms = new ArrayList<FlexiaModel>(); private List<FlexiaModel> wordsForms = new ArrayList<>();
public WordCard(String canonicalForm, String base, String canonicalSuffix) { public WordCard(String canonicalForm, String base, String canonicalSuffix) {
this.canonicalForm = canonicalForm; this.canonicalForm = canonicalForm;

View File

@ -17,7 +17,6 @@ package org.apache.lucene.morphology.dictionary;
import org.apache.lucene.morphology.LetterDecoderEncoder; import org.apache.lucene.morphology.LetterDecoderEncoder;
import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
@ -38,7 +37,7 @@ public class WordCleaner extends WordFilter {
if (word.contains("-")) return Collections.emptyList(); if (word.contains("-")) return Collections.emptyList();
if (!decoderEncoder.checkString(word)) return Collections.emptyList(); if (!decoderEncoder.checkString(word)) return Collections.emptyList();
List<FlexiaModel> flexiaModelsToRemove = new LinkedList<FlexiaModel>(); List<FlexiaModel> flexiaModelsToRemove = new LinkedList<>();
for (FlexiaModel fm : wordCard.getWordsForms()) { for (FlexiaModel fm : wordCard.getWordsForms()) {
if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) { if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) {
flexiaModelsToRemove.add(fm); flexiaModelsToRemove.add(fm);
@ -48,6 +47,6 @@ public class WordCleaner extends WordFilter {
wordCard.removeFlexia(fm); wordCard.removeFlexia(fm);
} }
return new LinkedList<WordCard>(Arrays.asList(wordCard)); return new LinkedList<>(Collections.singletonList(wordCard));
} }
} }

View File

@ -23,5 +23,5 @@ import java.io.IOException;
*/ */
public interface WordProcessor { public interface WordProcessor {
public void process(WordCard wordCard) throws IOException; void process(WordCard wordCard) throws IOException;
} }

View File

@ -17,7 +17,7 @@ package org.apache.lucene.morphology.dictionary;
import org.apache.lucene.morphology.LetterDecoderEncoder; import org.apache.lucene.morphology.LetterDecoderEncoder;
import java.util.Arrays; import java.util.Collections;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
@ -42,7 +42,7 @@ public class WordStringCleaner extends WordFilter {
//made correct code //made correct code
m.setCode(m.getCode().substring(0, 2)); m.setCode(m.getCode().substring(0, 2));
} }
return new LinkedList<WordCard>(Arrays.asList(wordCard)); return new LinkedList<>(Collections.singletonList(wordCard));
} }

View File

@ -29,7 +29,7 @@ public class EnglishHeuristicBuilder {
GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/egramtab.tab"); GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/egramtab.tab");
EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder(); EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>()); DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<>());
StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder); StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder);
WordCleaner wordCleaner = new WordCleaner(decoderEncoder, statisticsCollector); WordCleaner wordCleaner = new WordCleaner(decoderEncoder, statisticsCollector);

View File

@ -28,7 +28,7 @@ public class RussianHeuristicBuilder {
GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/rgramtab.tab"); GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/rgramtab.tab");
RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder(); RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>()); DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<>());
StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder); StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder);
WordCleaner wordCleaner = new WordCleaner(decoderEncoder, statisticsCollector); WordCleaner wordCleaner = new WordCleaner(decoderEncoder, statisticsCollector);

View File

@ -23,6 +23,7 @@ import org.apache.lucene.morphology.english.EnglishMorphology;
import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder; import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder;
import org.apache.lucene.morphology.russian.RussianLuceneMorphology; import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
import org.apache.lucene.morphology.russian.RussianMorphology; import org.apache.lucene.morphology.russian.RussianMorphology;
import org.hamcrest.MatcherAssert;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
@ -33,7 +34,6 @@ import java.util.Map;
import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicLong;
import static org.hamcrest.Matchers.hasItem; import static org.hamcrest.Matchers.hasItem;
import static org.junit.Assert.assertThat;
public class TestAllWords { public class TestAllWords {
@ -73,22 +73,20 @@ public class TestAllWords {
final List<String> morphInfo = grammarInfo.getGrammarInfo(); final List<String> morphInfo = grammarInfo.getGrammarInfo();
final Map<String, Integer> inversIndex = grammarInfo.getGrammarInverseIndex(); final Map<String, Integer> inversIndex = grammarInfo.getGrammarInverseIndex();
DictionaryReader dictionaryReader = new DictionaryReader(pathToDict, new HashSet<String>()); DictionaryReader dictionaryReader = new DictionaryReader(pathToDict, new HashSet<>());
final AtomicLong wordCount = new AtomicLong(0); final AtomicLong wordCount = new AtomicLong(0);
Long startTime = System.currentTimeMillis(); long startTime = System.currentTimeMillis();
WordProcessor wordProcessor = new WordProcessor() { WordProcessor wordProcessor = wordCard -> {
public void process(WordCard wordCard) throws IOException {
String word = wordCard.getBase() + wordCard.getCanonicalSuffix(); String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
for (FlexiaModel fm : wordCard.getWordsForms()) { for (FlexiaModel fm : wordCard.getWordsForms()) {
String wordForm = wordCard.getBase() + fm.getSuffix(); String wordForm = wordCard.getBase() + fm.getSuffix();
String morph = morphInfo.get(inversIndex.get(fm.getCode())); String morph = morphInfo.get(inversIndex.get(fm.getCode()));
assertThat(morphology.getMorphInfo(wordForm), hasItem(word + "|" + morph)); MatcherAssert.assertThat(morphology.getMorphInfo(wordForm), hasItem(word + "|" + morph));
assertThat(morphology.getNormalForms(wordForm), hasItem(word)); MatcherAssert.assertThat(morphology.getNormalForms(wordForm), hasItem(word));
wordCount.set(2L + wordCount.get()); wordCount.set(2L + wordCount.get());
} }
}
}; };
WordCleaner wordCleaner = new WordCleaner(decoderEncoder, wordProcessor); WordCleaner wordCleaner = new WordCleaner(decoderEncoder, wordProcessor);
@ -123,18 +121,16 @@ public class TestAllWords {
private void testAllWordForLucene(final LuceneMorphology morphology, LetterDecoderEncoder decoderEncoder, String pathToDic) throws IOException { private void testAllWordForLucene(final LuceneMorphology morphology, LetterDecoderEncoder decoderEncoder, String pathToDic) throws IOException {
final AtomicLong wordCount = new AtomicLong(0); final AtomicLong wordCount = new AtomicLong(0);
Long startTime = System.currentTimeMillis(); long startTime = System.currentTimeMillis();
DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet<String>()); DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet<>());
WordProcessor wordProcessor = new WordProcessor() { WordProcessor wordProcessor = wordCard -> {
public void process(WordCard wordCard) throws IOException {
String word = wordCard.getBase() + wordCard.getCanonicalSuffix(); String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
for (FlexiaModel fm : wordCard.getWordsForms()) { for (FlexiaModel fm : wordCard.getWordsForms()) {
String wordForm = wordCard.getBase() + fm.getSuffix(); String wordForm = wordCard.getBase() + fm.getSuffix();
assertThat(morphology.getNormalForms(wordForm), hasItem(word)); MatcherAssert.assertThat(morphology.getNormalForms(wordForm), hasItem(word));
wordCount.set(1L + wordCount.get()); wordCount.set(1L + wordCount.get());
} }
}
}; };
WordCleaner wordCleaner = new WordCleaner(decoderEncoder, wordProcessor); WordCleaner wordCleaner = new WordCleaner(decoderEncoder, wordProcessor);

View File

@ -31,9 +31,11 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.morphology.english.EnglishLuceneMorphology; import org.apache.lucene.morphology.english.EnglishLuceneMorphology;
import org.apache.lucene.morphology.russian.RussianAnalyzer; import org.apache.lucene.morphology.russian.RussianAnalyzer;
import org.apache.lucene.morphology.russian.RussianLuceneMorphology; import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
import org.hamcrest.MatcherAssert;
import org.junit.Test; import org.junit.Test;
import java.io.*; import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*; import java.util.*;
import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.equalTo;
@ -65,24 +67,24 @@ public class AnalyzersTest extends BaseTokenStreamTestCase {
LuceneMorphology englishLuceneMorphology = new EnglishLuceneMorphology(); LuceneMorphology englishLuceneMorphology = new EnglishLuceneMorphology();
MorphologyAnalyzer russianAnalyzer = new MorphologyAnalyzer(russianLuceneMorphology); MorphologyAnalyzer russianAnalyzer = new MorphologyAnalyzer(russianLuceneMorphology);
InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("тест пм тест".getBytes()), "UTF-8"); InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("тест пм тест".getBytes()), StandardCharsets.UTF_8);
TokenStream stream = russianAnalyzer.tokenStream(null, reader); TokenStream stream = russianAnalyzer.tokenStream(null, reader);
MorphologyFilter englishFilter = new MorphologyFilter(stream, englishLuceneMorphology); MorphologyFilter englishFilter = new MorphologyFilter(stream, englishLuceneMorphology);
englishFilter.reset(); englishFilter.reset();
while (englishFilter.incrementToken()) { while (englishFilter.incrementToken()) {
System.out.println(englishFilter.toString()); System.out.println(englishFilter);
} }
} }
@Test @Test
public void shouldProvideCorrectIndentForWordWithMelitaForm() throws IOException { public void shouldProvideCorrectIndentForWordWithMelitaForm() throws IOException {
Analyzer morphlogyAnalyzer = new RussianAnalyzer(); Analyzer morphlogyAnalyzer = new RussianAnalyzer();
InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год".getBytes()), "UTF-8"); InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год".getBytes()), StandardCharsets.UTF_8);
TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader); TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader);
tokenStream.reset(); tokenStream.reset();
Set<String> foromsOfWine = new HashSet<String>(); Set<String> foromsOfWine = new HashSet<>();
foromsOfWine.add("вина"); foromsOfWine.add("вина");
foromsOfWine.add("винo"); foromsOfWine.add("винo");
boolean wordSeen = false; boolean wordSeen = false;
@ -90,7 +92,7 @@ public class AnalyzersTest extends BaseTokenStreamTestCase {
CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class); CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class); PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class);
if(foromsOfWine.contains(charTerm.toString()) && wordSeen){ if(foromsOfWine.contains(charTerm.toString()) && wordSeen){
assertThat(position.getPositionIncrement(),equalTo(0)); MatcherAssert.assertThat(position.getPositionIncrement(),equalTo(0));
} }
if(foromsOfWine.contains(charTerm.toString())){ if(foromsOfWine.contains(charTerm.toString())){
wordSeen = true; wordSeen = true;
@ -100,18 +102,18 @@ public class AnalyzersTest extends BaseTokenStreamTestCase {
private void testAnalayzer(Analyzer morphlogyAnalyzer, String answerPath, String testPath) throws IOException { private void testAnalayzer(Analyzer morphlogyAnalyzer, String answerPath, String testPath) throws IOException {
InputStream stream = this.getClass().getResourceAsStream(answerPath); InputStream stream = this.getClass().getResourceAsStream(answerPath);
BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); BufferedReader breader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
String[] strings = breader.readLine().replaceAll(" +", " ").trim().split(" "); String[] strings = breader.readLine().replaceAll(" +", " ").trim().split(" ");
HashSet<String> answer = new HashSet<String>(Arrays.asList(strings)); HashSet<String> answer = new HashSet<>(Arrays.asList(strings));
stream.close(); stream.close();
stream = this.getClass().getResourceAsStream(testPath); stream = this.getClass().getResourceAsStream(testPath);
InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); InputStreamReader reader = new InputStreamReader(stream, StandardCharsets.UTF_8);
TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader); TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader);
tokenStream.reset(); tokenStream.reset();
HashSet<String> result = new HashSet<String>(); HashSet<String> result = new HashSet<>();
while (tokenStream.incrementToken()) { while (tokenStream.incrementToken()) {
CharTermAttribute attribute1 = tokenStream.getAttribute(CharTermAttribute.class); CharTermAttribute attribute1 = tokenStream.getAttribute(CharTermAttribute.class);
result.add(attribute1.toString()); result.add(attribute1.toString());
@ -119,7 +121,7 @@ public class AnalyzersTest extends BaseTokenStreamTestCase {
stream.close(); stream.close();
assertThat(result, equalTo(answer)); MatcherAssert.assertThat(result, equalTo(answer));
} }
@Test @Test

View File

@ -17,19 +17,20 @@ package org.apache.lucene.morphology;
import org.apache.lucene.morphology.russian.RussianLuceneMorphology; import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
import org.apache.lucene.morphology.english.EnglishLuceneMorphology; import org.apache.lucene.morphology.english.EnglishLuceneMorphology;
import org.hamcrest.MatcherAssert;
import org.junit.Test; import org.junit.Test;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
import static org.hamcrest.CoreMatchers.equalTo; import static org.hamcrest.CoreMatchers.equalTo;
import static org.junit.Assert.assertThat;
public class LuceneMorphTest { public class LuceneMorphTest {
@ -52,14 +53,13 @@ public class LuceneMorphTest {
private void testMorphology(LuceneMorphology luceneMorph, String pathToTestData) throws IOException { private void testMorphology(LuceneMorphology luceneMorph, String pathToTestData) throws IOException {
InputStream stream = this.getClass().getResourceAsStream(pathToTestData); InputStream stream = this.getClass().getResourceAsStream(pathToTestData);
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
String s = bufferedReader.readLine(); String s = bufferedReader.readLine();
while (s != null) { while (s != null) {
String[] qa = s.trim().split(" "); String[] qa = s.trim().split(" ");
Set<String> result = new HashSet<String>(); Set<String> result = new HashSet<>(Arrays.asList(qa).subList(1, qa.length));
result.addAll(Arrays.asList(qa).subList(1, qa.length)); Set<String> stringList = new HashSet<>(luceneMorph.getNormalForms(qa[0]));
Set<String> stringList = new HashSet<String>(luceneMorph.getNormalForms(qa[0])); MatcherAssert.assertThat(stringList, equalTo(result));
assertThat(stringList, equalTo(result));
s = bufferedReader.readLine(); s = bufferedReader.readLine();
} }
} }

View File

@ -6,7 +6,6 @@
<version>1.5</version> <version>1.5</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>org.apache.lucene.morphology</groupId>
<artifactId>english</artifactId> <artifactId>english</artifactId>
<name>english</name> <name>english</name>
<version>1.5</version> <version>1.5</version>

View File

@ -32,7 +32,7 @@ public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder {
if (string.length() > 6) throw new SuffixToLongException("Suffix length should not be greater then " + 12); if (string.length() > 6) throw new SuffixToLongException("Suffix length should not be greater then " + 12);
int result = 0; int result = 0;
for (int i = 0; i < string.length(); i++) { for (int i = 0; i < string.length(); i++) {
int c = 0 + string.charAt(i) - ENGLISH_SMALL_LETTER_OFFSET; int c = string.charAt(i) - ENGLISH_SMALL_LETTER_OFFSET;
if (c == 45 - ENGLISH_SMALL_LETTER_OFFSET) { if (c == 45 - ENGLISH_SMALL_LETTER_OFFSET) {
c = DASH_CODE; c = DASH_CODE;
} }
@ -48,7 +48,7 @@ public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder {
public int[] encodeToArray(String s) { public int[] encodeToArray(String s) {
ArrayList<Integer> integers = new ArrayList<Integer>(); ArrayList<Integer> integers = new ArrayList<>();
while (s.length() > 6) { while (s.length() > 6) {
integers.add(encode(s.substring(0, 6))); integers.add(encode(s.substring(0, 6)));
s = s.substring(6); s = s.substring(6);
@ -64,16 +64,16 @@ public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder {
} }
public String decodeArray(int[] array) { public String decodeArray(int[] array) {
String result = ""; StringBuilder result = new StringBuilder();
for (int i : array) { for (int i : array) {
result += decode(i); result.append(decode(i));
} }
return result; return result.toString();
} }
public String decode(Integer suffixN) { public String decode(Integer suffixN) {
String result = ""; StringBuilder result = new StringBuilder();
while (suffixN > 27) { while (suffixN > 27) {
int c = suffixN % 28 + ENGLISH_SMALL_LETTER_OFFSET; int c = suffixN % 28 + ENGLISH_SMALL_LETTER_OFFSET;
if (c == ENGLISH_SMALL_LETTER_OFFSET) { if (c == ENGLISH_SMALL_LETTER_OFFSET) {
@ -81,21 +81,20 @@ public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder {
continue; continue;
} }
if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR; if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR;
result = (char) c + result; result.insert(0, (char) c);
suffixN /= 28; suffixN /= 28;
} }
long c = suffixN + ENGLISH_SMALL_LETTER_OFFSET; long c = suffixN + ENGLISH_SMALL_LETTER_OFFSET;
if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR; if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR;
result = (char) c + result; result.insert(0, (char) c);
return result; return result.toString();
} }
public boolean checkCharacter(char c) { public boolean checkCharacter(char c) {
int code = 0 + c; int code = c;
if (code == 45) return true; if (code == 45) return true;
code -= ENGLISH_SMALL_LETTER_OFFSET; code -= ENGLISH_SMALL_LETTER_OFFSET;
if (code > 0 && code < 27) return true; return code > 0 && code < 27;
return false;
} }

View File

@ -16,7 +16,8 @@
package org.apache.lucene.morphology.english; package org.apache.lucene.morphology.english;
import static org.hamcrest.core.IsEqual.equalTo; import static org.hamcrest.core.IsEqual.equalTo;
import static org.junit.Assert.assertThat;
import org.hamcrest.MatcherAssert;
import org.junit.Before; import org.junit.Before;
@ -30,11 +31,11 @@ public class EnglishLetterDecoderEncoderTest {
@org.junit.Test @org.junit.Test
public void testDecodeEncodeToArray() { public void testDecodeEncodeToArray() {
assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("abcdefghijklmnopqrstuvwxyz")), equalTo("abcdefghijklmnopqrstuvwxyz")); MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("abcdefghijklmnopqrstuvwxyz")), equalTo("abcdefghijklmnopqrstuvwxyz"));
assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("xyz")), equalTo("xyz")); MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("xyz")), equalTo("xyz"));
assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrty")), equalTo("ytrrty")); MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrty")), equalTo("ytrrty"));
assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrtyz")), equalTo("ytrrtyz")); MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrtyz")), equalTo("ytrrtyz"));
assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrtyzqwqwe")), equalTo("ytrrtyzqwqwe")); MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrtyzqwqwe")), equalTo("ytrrtyzqwqwe"));
} }
} }

View File

@ -16,9 +16,9 @@
package org.apache.lucene.morphology.english.stemmer; package org.apache.lucene.morphology.english.stemmer;
import org.apache.lucene.morphology.english.EnglishLuceneMorphology; import org.apache.lucene.morphology.english.EnglishLuceneMorphology;
import org.hamcrest.MatcherAssert;
import org.junit.Test; import org.junit.Test;
import static org.hamcrest.core.IsEqual.equalTo; import static org.hamcrest.core.IsEqual.equalTo;
import static org.junit.Assert.assertThat;
public class EnglishStemmerTest { public class EnglishStemmerTest {
@ -26,24 +26,24 @@ public class EnglishStemmerTest {
public void testGetStemmedWord() throws Exception { public void testGetStemmedWord() throws Exception {
EnglishLuceneMorphology englishLuceneMorphology = new EnglishLuceneMorphology(); EnglishLuceneMorphology englishLuceneMorphology = new EnglishLuceneMorphology();
EnglishStemmer englishStemmer = new EnglishStemmer(englishLuceneMorphology); EnglishStemmer englishStemmer = new EnglishStemmer(englishLuceneMorphology);
assertThat(englishStemmer.getStemmedWord("running"),equalTo("run")); MatcherAssert.assertThat(englishStemmer.getStemmedWord("running"),equalTo("run"));
assertThat(englishStemmer.getStemmedWord("run"),equalTo("run")); MatcherAssert.assertThat(englishStemmer.getStemmedWord("run"),equalTo("run"));
assertThat(englishStemmer.getStemmedWord("killed"),equalTo("kill")); MatcherAssert.assertThat(englishStemmer.getStemmedWord("killed"),equalTo("kill"));
assertThat(englishStemmer.getStemmedWord("kill"),equalTo("kill")); MatcherAssert.assertThat(englishStemmer.getStemmedWord("kill"),equalTo("kill"));
assertThat(englishStemmer.getStemmedWord("networking"),equalTo("network")); MatcherAssert.assertThat(englishStemmer.getStemmedWord("networking"),equalTo("network"));
assertThat(englishStemmer.getStemmedWord("network"),equalTo("network")); MatcherAssert.assertThat(englishStemmer.getStemmedWord("network"),equalTo("network"));
assertThat(englishStemmer.getStemmedWord("statistics"),equalTo("statistic")); MatcherAssert.assertThat(englishStemmer.getStemmedWord("statistics"),equalTo("statistic"));
assertThat(englishStemmer.getStemmedWord("statistic"),equalTo("statistic")); MatcherAssert.assertThat(englishStemmer.getStemmedWord("statistic"),equalTo("statistic"));
assertThat(englishStemmer.getStemmedWord("stats"),equalTo("stat")); MatcherAssert.assertThat(englishStemmer.getStemmedWord("stats"),equalTo("stat"));
assertThat(englishStemmer.getStemmedWord("stat"),equalTo("stat")); MatcherAssert.assertThat(englishStemmer.getStemmedWord("stat"),equalTo("stat"));
assertThat(englishStemmer.getStemmedWord("countries"),equalTo("country")); MatcherAssert.assertThat(englishStemmer.getStemmedWord("countries"),equalTo("country"));
assertThat(englishStemmer.getStemmedWord("country"),equalTo("country")); MatcherAssert.assertThat(englishStemmer.getStemmedWord("country"),equalTo("country"));
assertThat(englishStemmer.getStemmedWord("delete"),equalTo("delete")); MatcherAssert.assertThat(englishStemmer.getStemmedWord("delete"),equalTo("delete"));
assertThat(englishStemmer.getStemmedWord("ended"),equalTo("end")); MatcherAssert.assertThat(englishStemmer.getStemmedWord("ended"),equalTo("end"));
assertThat(englishStemmer.getStemmedWord("end"),equalTo("end")); MatcherAssert.assertThat(englishStemmer.getStemmedWord("end"),equalTo("end"));
assertThat(englishStemmer.getStemmedWord("ends"),equalTo("end")); MatcherAssert.assertThat(englishStemmer.getStemmedWord("ends"),equalTo("end"));
assertThat(englishStemmer.getStemmedWord("given"),equalTo("give")); MatcherAssert.assertThat(englishStemmer.getStemmedWord("given"),equalTo("give"));
assertThat(englishStemmer.getStemmedWord("give"),equalTo("give")); MatcherAssert.assertThat(englishStemmer.getStemmedWord("give"),equalTo("give"));
assertThat(englishStemmer.getStemmedWord("log4j"),equalTo("log4j")); MatcherAssert.assertThat(englishStemmer.getStemmedWord("log4j"),equalTo("log4j"));
} }
} }

View File

@ -6,7 +6,6 @@
<version>1.5</version> <version>1.5</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>org.apache.lucene.morphology</groupId>
<artifactId>morph</artifactId> <artifactId>morph</artifactId>
<name>morph</name> <name>morph</name>
<version>1.5</version> <version>1.5</version>

View File

@ -21,7 +21,7 @@ import java.util.ArrayList;
public abstract class BaseLetterDecoderEncoder implements LetterDecoderEncoder { public abstract class BaseLetterDecoderEncoder implements LetterDecoderEncoder {
public int[] encodeToArray(String s) { public int[] encodeToArray(String s) {
ArrayList<Integer> integers = new ArrayList<Integer>(); ArrayList<Integer> integers = new ArrayList<>();
while (s.length() > 6) { while (s.length() > 6) {
integers.add(encode(s.substring(0, 6))); integers.add(encode(s.substring(0, 6)));
s = s.substring(6); s = s.substring(6);
@ -37,11 +37,11 @@ public abstract class BaseLetterDecoderEncoder implements LetterDecoderEncoder {
} }
public String decodeArray(int[] array) { public String decodeArray(int[] array) {
String result = ""; StringBuilder result = new StringBuilder();
for (int i : array) { for (int i : array) {
result += decode(i); result.append(decode(i));
} }
return result; return result.toString();
} }
public boolean checkString(String word) { public boolean checkString(String word) {

View File

@ -16,6 +16,7 @@
package org.apache.lucene.morphology; package org.apache.lucene.morphology;
import java.io.Serializable; import java.io.Serializable;
import java.util.Objects;
public class Heuristic implements Serializable { public class Heuristic implements Serializable {
@ -26,10 +27,10 @@ public class Heuristic implements Serializable {
public Heuristic(String s) { public Heuristic(String s) {
String[] strings = s.split("\\|"); String[] strings = s.split("\\|");
actualSuffixLength = Byte.valueOf(strings[0]); actualSuffixLength = Byte.parseByte(strings[0]);
actualNormalSuffix = strings[1]; actualNormalSuffix = strings[1];
formMorphInfo = Short.valueOf(strings[2]); formMorphInfo = Short.parseShort(strings[2]);
normalFormMorphInfo = Short.valueOf(strings[3]); normalFormMorphInfo = Short.parseShort(strings[3]);
} }
public Heuristic(byte actualSuffixLength, String actualNormalSuffix, short formMorphInfo, short normalFormMorphInfo) { public Heuristic(byte actualSuffixLength, String actualNormalSuffix, short formMorphInfo, short normalFormMorphInfo) {
@ -70,15 +71,12 @@ public class Heuristic implements Serializable {
if (actualSuffixLength != heuristic.actualSuffixLength) return false; if (actualSuffixLength != heuristic.actualSuffixLength) return false;
if (formMorphInfo != heuristic.formMorphInfo) return false; if (formMorphInfo != heuristic.formMorphInfo) return false;
if (normalFormMorphInfo != heuristic.normalFormMorphInfo) return false; if (normalFormMorphInfo != heuristic.normalFormMorphInfo) return false;
if (actualNormalSuffix != null ? !actualNormalSuffix.equals(heuristic.actualNormalSuffix) : heuristic.actualNormalSuffix != null) return Objects.equals(actualNormalSuffix, heuristic.actualNormalSuffix);
return false;
return true;
} }
@Override @Override
public int hashCode() { public int hashCode() {
int result = (int) actualSuffixLength; int result = actualSuffixLength;
result = 31 * result + (actualNormalSuffix != null ? actualNormalSuffix.hashCode() : 0); result = 31 * result + (actualNormalSuffix != null ? actualNormalSuffix.hashCode() : 0);
result = 31 * result + (int) formMorphInfo; result = 31 * result + (int) formMorphInfo;
result = 31 * result + (int) normalFormMorphInfo; result = 31 * result + (int) normalFormMorphInfo;

View File

@ -17,17 +17,17 @@ package org.apache.lucene.morphology;
public interface LetterDecoderEncoder { public interface LetterDecoderEncoder {
public Integer encode(String string); Integer encode(String string);
public int[] encodeToArray(String s); int[] encodeToArray(String s);
public String decodeArray(int[] array); String decodeArray(int[] array);
public String decode(Integer suffixN); String decode(Integer suffixN);
public boolean checkCharacter(char c); boolean checkCharacter(char c);
public boolean checkString(String word); boolean checkString(String word);
public String cleanString(String s); String cleanString(String s);
} }

View File

@ -34,13 +34,13 @@ public class LuceneMorphology extends MorphologyImpl {
protected void readRules(BufferedReader bufferedReader) throws IOException { protected void readRules(BufferedReader bufferedReader) throws IOException {
String s; String s;
Integer amount; int amount;
s = bufferedReader.readLine(); s = bufferedReader.readLine();
amount = Integer.valueOf(s); amount = Integer.parseInt(s);
rules = new Heuristic[amount][]; rules = new Heuristic[amount][];
for (int i = 0; i < amount; i++) { for (int i = 0; i < amount; i++) {
String s1 = bufferedReader.readLine(); String s1 = bufferedReader.readLine();
Integer ruleLenght = Integer.valueOf(s1); int ruleLenght = Integer.parseInt(s1);
Heuristic[] heuristics = new Heuristic[ruleLenght]; Heuristic[] heuristics = new Heuristic[ruleLenght];
for (int j = 0; j < ruleLenght; j++) { for (int j = 0; j < ruleLenght; j++) {
heuristics[j] = new Heuristic(bufferedReader.readLine()); heuristics[j] = new Heuristic(bufferedReader.readLine());
@ -51,7 +51,7 @@ public class LuceneMorphology extends MorphologyImpl {
private Heuristic[] modeifyHeuristic(Heuristic[] heuristics) { private Heuristic[] modeifyHeuristic(Heuristic[] heuristics) {
ArrayList<Heuristic> result = new ArrayList<Heuristic>(); ArrayList<Heuristic> result = new ArrayList<>();
for (Heuristic heuristic : heuristics) { for (Heuristic heuristic : heuristics) {
boolean isAdded = true; boolean isAdded = true;
for (Heuristic ch : result) { for (Heuristic ch : result) {
@ -61,7 +61,7 @@ public class LuceneMorphology extends MorphologyImpl {
result.add(heuristic); result.add(heuristic);
} }
} }
return result.toArray(new Heuristic[result.size()]); return result.toArray(new Heuristic[0]);
} }
public boolean checkString(String s) { public boolean checkString(String s) {

View File

@ -17,6 +17,7 @@ package org.apache.lucene.morphology;
import java.io.*; import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@ -47,7 +48,7 @@ public class MorphologyImpl implements Morphology {
} }
public List<String> getNormalForms(String s) { public List<String> getNormalForms(String s) {
ArrayList<String> result = new ArrayList<String>(); ArrayList<String> result = new ArrayList<>();
int[] ints = decoderEncoder.encodeToArray(revertWord(s)); int[] ints = decoderEncoder.encodeToArray(revertWord(s));
int ruleId = findRuleId(ints); int ruleId = findRuleId(ints);
boolean notSeenEmptyString = true; boolean notSeenEmptyString = true;
@ -64,7 +65,7 @@ public class MorphologyImpl implements Morphology {
} }
public List<String> getMorphInfo(String s) { public List<String> getMorphInfo(String s) {
ArrayList<String> result = new ArrayList<String>(); ArrayList<String> result = new ArrayList<>();
int[] ints = decoderEncoder.encodeToArray(revertWord(s)); int[] ints = decoderEncoder.encodeToArray(revertWord(s));
int ruleId = findRuleId(ints); int ruleId = findRuleId(ints);
for (Heuristic h : rules[rulesId[ruleId]]) { for (Heuristic h : rules[rulesId[ruleId]]) {
@ -100,14 +101,14 @@ public class MorphologyImpl implements Morphology {
private int compareToInts(int[] i1, int[] i2) { private int compareToInts(int[] i1, int[] i2) {
int minLength = Math.min(i1.length, i2.length); int minLength = Math.min(i1.length, i2.length);
for (int i = 0; i < minLength; i++) { for (int i = 0; i < minLength; i++) {
int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1); int i3 = Integer.compare(i1[i], i2[i]);
if (i3 != 0) return i3; if (i3 != 0) return i3;
} }
return i1.length - i2.length; return i1.length - i2.length;
} }
public void writeToFile(String fileName) throws IOException { public void writeToFile(String fileName) throws IOException {
OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8"); OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8);
writer.write(separators.length + "\n"); writer.write(separators.length + "\n");
for (int[] i : separators) { for (int[] i : separators) {
writer.write(i.length + "\n"); writer.write(i.length + "\n");
@ -138,7 +139,7 @@ public class MorphologyImpl implements Morphology {
} }
private void readFromInputStream(InputStream inputStream) throws IOException { private void readFromInputStream(InputStream inputStream) throws IOException {
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8")); BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8));
String s = bufferedReader.readLine(); String s = bufferedReader.readLine();
Integer amount = Integer.valueOf(s); Integer amount = Integer.valueOf(s);
@ -153,9 +154,9 @@ public class MorphologyImpl implements Morphology {
private void readGrammaInfo(BufferedReader bufferedReader) throws IOException { private void readGrammaInfo(BufferedReader bufferedReader) throws IOException {
String s; String s;
Integer amount; int amount;
s = bufferedReader.readLine(); s = bufferedReader.readLine();
amount = Integer.valueOf(s); amount = Integer.parseInt(s);
grammarInfo = new String[amount]; grammarInfo = new String[amount];
for (int i = 0; i < amount; i++) { for (int i = 0; i < amount; i++) {
grammarInfo[i] = bufferedReader.readLine(); grammarInfo[i] = bufferedReader.readLine();
@ -164,13 +165,13 @@ public class MorphologyImpl implements Morphology {
protected void readRules(BufferedReader bufferedReader) throws IOException { protected void readRules(BufferedReader bufferedReader) throws IOException {
String s; String s;
Integer amount; int amount;
s = bufferedReader.readLine(); s = bufferedReader.readLine();
amount = Integer.valueOf(s); amount = Integer.parseInt(s);
rules = new Heuristic[amount][]; rules = new Heuristic[amount][];
for (int i = 0; i < amount; i++) { for (int i = 0; i < amount; i++) {
String s1 = bufferedReader.readLine(); String s1 = bufferedReader.readLine();
Integer ruleLength = Integer.valueOf(s1); int ruleLength = Integer.parseInt(s1);
rules[i] = new Heuristic[ruleLength]; rules[i] = new Heuristic[ruleLength];
for (int j = 0; j < ruleLength; j++) { for (int j = 0; j < ruleLength; j++) {
rules[i][j] = new Heuristic(bufferedReader.readLine()); rules[i][j] = new Heuristic(bufferedReader.readLine());
@ -182,7 +183,7 @@ public class MorphologyImpl implements Morphology {
rulesId = new short[amount]; rulesId = new short[amount];
for (int i = 0; i < amount; i++) { for (int i = 0; i < amount; i++) {
String s1 = bufferedReader.readLine(); String s1 = bufferedReader.readLine();
rulesId[i] = Short.valueOf(s1); rulesId[i] = Short.parseShort(s1);
} }
} }
@ -190,10 +191,10 @@ public class MorphologyImpl implements Morphology {
separators = new int[amount][]; separators = new int[amount][];
for (int i = 0; i < amount; i++) { for (int i = 0; i < amount; i++) {
String s1 = bufferedReader.readLine(); String s1 = bufferedReader.readLine();
Integer wordLenght = Integer.valueOf(s1); int wordLenght = Integer.parseInt(s1);
separators[i] = new int[wordLenght]; separators[i] = new int[wordLenght];
for (int j = 0; j < wordLenght; j++) { for (int j = 0; j < wordLenght; j++) {
separators[i][j] = Integer.valueOf(bufferedReader.readLine()); separators[i][j] = Integer.parseInt(bufferedReader.readLine());
} }
} }
} }

View File

@ -73,6 +73,6 @@ public class MorphologyAnalyzer extends Analyzer {
TokenFilter filter = new LowerCaseFilter(src); TokenFilter filter = new LowerCaseFilter(src);
filter = new MorphologyFilter(filter, luceneMorph); filter = new MorphologyFilter(filter, luceneMorph);
return new TokenStreamComponents(r -> src.setReader(r), filter); return new TokenStreamComponents(src::setReader, filter);
} }
} }

12
pom.xml
View File

@ -16,7 +16,7 @@
</scm> </scm>
<properties> <properties>
<lucene.version>8.7.0</lucene.version> <lucene.version>8.11.0</lucene.version>
<morphology.version>1.5</morphology.version> <morphology.version>1.5</morphology.version>
<junit.version>4.13</junit.version> <junit.version>4.13</junit.version>
</properties> </properties>
@ -46,7 +46,7 @@
<dependency> <dependency>
<groupId>org.hamcrest</groupId> <groupId>org.hamcrest</groupId>
<artifactId>hamcrest-all</artifactId> <artifactId>hamcrest-all</artifactId>
<version>1.1</version> <version>1.3</version>
<scope>test</scope> <scope>test</scope>
</dependency> </dependency>
<dependency> <dependency>
@ -109,8 +109,8 @@
<artifactId>maven-compiler-plugin</artifactId> <artifactId>maven-compiler-plugin</artifactId>
<version>3.8.1</version> <version>3.8.1</version>
<configuration> <configuration>
<source>1.8</source> <source>11</source>
<target>1.8</target> <target>11</target>
</configuration> </configuration>
</plugin> </plugin>
<plugin> <!-- usage: http://code.google.com/p/maven-license-plugin/wiki/HowTo --> <plugin> <!-- usage: http://code.google.com/p/maven-license-plugin/wiki/HowTo -->
@ -147,7 +147,7 @@
<plugins> <plugins>
<plugin> <plugin>
<artifactId>maven-source-plugin</artifactId> <artifactId>maven-source-plugin</artifactId>
<version>3.0.1</version> <version>3.2.1</version>
<executions> <executions>
<execution> <execution>
<id>attach-sources</id> <id>attach-sources</id>
@ -159,7 +159,7 @@
</plugin> </plugin>
<plugin> <plugin>
<artifactId>maven-javadoc-plugin</artifactId> <artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.4</version> <version>3.3.1</version>
<executions> <executions>
<execution> <execution>
<id>attach-javadocs</id> <id>attach-javadocs</id>

View File

@ -6,7 +6,6 @@
<version>1.5</version> <version>1.5</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>org.apache.lucene.morphology</groupId>
<artifactId>russian</artifactId> <artifactId>russian</artifactId>
<name>russian</name> <name>russian</name>
<version>1.5</version> <version>1.5</version>
@ -23,7 +22,7 @@
<dependency> <dependency>
<groupId>junit</groupId> <groupId>junit</groupId>
<artifactId>junit</artifactId> <artifactId>junit</artifactId>
<version>4.13.1</version> <version>${junit.version}</version>
<scope>test</scope> <scope>test</scope>
</dependency> </dependency>

View File

@ -20,7 +20,6 @@ import org.apache.lucene.morphology.LetterDecoderEncoder;
import org.apache.lucene.morphology.SuffixToLongException; import org.apache.lucene.morphology.SuffixToLongException;
import org.apache.lucene.morphology.WrongCharaterException; import org.apache.lucene.morphology.WrongCharaterException;
import java.util.ArrayList;
import java.util.LinkedList; import java.util.LinkedList;
/** /**
@ -42,7 +41,7 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
throw new SuffixToLongException("Suffix length should not be greater then " + WORD_PART_LENGHT + " " + string); throw new SuffixToLongException("Suffix length should not be greater then " + WORD_PART_LENGHT + " " + string);
int result = 0; int result = 0;
for (int i = 0; i < string.length(); i++) { for (int i = 0; i < string.length(); i++) {
int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET; int c = string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET;
if (c == 45 - RUSSIAN_SMALL_LETTER_OFFSET) { if (c == 45 - RUSSIAN_SMALL_LETTER_OFFSET) {
c = DASH_CODE; c = DASH_CODE;
} }
@ -58,7 +57,7 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
} }
public int[] encodeToArray(String s) { public int[] encodeToArray(String s) {
LinkedList<Integer> integers = new LinkedList<Integer>(); LinkedList<Integer> integers = new LinkedList<>();
while (s.length() > WORD_PART_LENGHT) { while (s.length() > WORD_PART_LENGHT) {
integers.add(encode(s.substring(0, WORD_PART_LENGHT))); integers.add(encode(s.substring(0, WORD_PART_LENGHT)));
s = s.substring(WORD_PART_LENGHT); s = s.substring(WORD_PART_LENGHT);
@ -74,16 +73,16 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
} }
public String decodeArray(int[] array) { public String decodeArray(int[] array) {
String result = ""; StringBuilder result = new StringBuilder();
for (int i : array) { for (int i : array) {
result += decode(i); result.append(decode(i));
} }
return result; return result.toString();
} }
public String decode(Integer suffixN) { public String decode(Integer suffixN) {
String result = ""; StringBuilder result = new StringBuilder();
while (suffixN > 33) { while (suffixN > 33) {
int c = suffixN % 34 + RUSSIAN_SMALL_LETTER_OFFSET; int c = suffixN % 34 + RUSSIAN_SMALL_LETTER_OFFSET;
if (c == RUSSIAN_SMALL_LETTER_OFFSET) { if (c == RUSSIAN_SMALL_LETTER_OFFSET) {
@ -91,21 +90,20 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
continue; continue;
} }
if (c == DASH_CODE + RUSSIAN_SMALL_LETTER_OFFSET) c = DASH_CHAR; if (c == DASH_CODE + RUSSIAN_SMALL_LETTER_OFFSET) c = DASH_CHAR;
result = (char) c + result; result.insert(0, (char) c);
suffixN /= 34; suffixN /= 34;
} }
long c = suffixN + RUSSIAN_SMALL_LETTER_OFFSET; long c = suffixN + RUSSIAN_SMALL_LETTER_OFFSET;
if (c == DASH_CODE + RUSSIAN_SMALL_LETTER_OFFSET) c = DASH_CHAR; if (c == DASH_CODE + RUSSIAN_SMALL_LETTER_OFFSET) c = DASH_CHAR;
result = (char) c + result; result.insert(0, (char) c);
return result; return result.toString();
} }
public boolean checkCharacter(char c) { public boolean checkCharacter(char c) {
int code = 0 + c; int code = c;
if (code == 45) return true; if (code == 45) return true;
code -= RUSSIAN_SMALL_LETTER_OFFSET; code -= RUSSIAN_SMALL_LETTER_OFFSET;
if (code > 0 && code < 33) return true; return code > 0 && code < 33;
return false;
} }
public boolean checkString(String word) { public boolean checkString(String word) {

View File

@ -17,6 +17,7 @@ package org.apache.lucene.morphology.russian;
import org.apache.lucene.morphology.SuffixToLongException; import org.apache.lucene.morphology.SuffixToLongException;
import org.apache.lucene.morphology.WrongCharaterException; import org.apache.lucene.morphology.WrongCharaterException;
import org.hamcrest.MatcherAssert;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
@ -24,9 +25,9 @@ import java.io.BufferedReader;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import static org.hamcrest.core.IsEqual.equalTo; import static org.hamcrest.core.IsEqual.equalTo;
import static org.junit.Assert.assertThat;
public class RussianLetterDecoderEncoderTest { public class RussianLetterDecoderEncoderTest {
private RussianLetterDecoderEncoder decoderEncoder; private RussianLetterDecoderEncoder decoderEncoder;
@ -40,12 +41,12 @@ public class RussianLetterDecoderEncoderTest {
@Test @Test
public void testShouldPreserverStringComporision() throws IOException { public void testShouldPreserverStringComporision() throws IOException {
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-monotonic.txt"); InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-monotonic.txt");
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
String s = bufferedReader.readLine(); String s = bufferedReader.readLine();
while (s != null) { while (s != null) {
String[] qa = s.trim().split(" "); String[] qa = s.trim().split(" ");
if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT && qa[1].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) { if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT && qa[1].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) {
assertThat(decoderEncoder.encode(qa[1]) > decoderEncoder.encode(qa[0]), equalTo(true)); MatcherAssert.assertThat(decoderEncoder.encode(qa[1]) > decoderEncoder.encode(qa[0]), equalTo(true));
} }
s = bufferedReader.readLine(); s = bufferedReader.readLine();
} }
@ -55,13 +56,13 @@ public class RussianLetterDecoderEncoderTest {
@Test @Test
public void testShouldCorrectDecodeEncode() throws IOException { public void testShouldCorrectDecodeEncode() throws IOException {
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data.txt"); InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data.txt");
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
String s = bufferedReader.readLine(); String s = bufferedReader.readLine();
while (s != null) { while (s != null) {
String[] qa = s.trim().split(" "); String[] qa = s.trim().split(" ");
if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) { if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) {
Integer encodedSuffix = decoderEncoder.encode(qa[0]); Integer encodedSuffix = decoderEncoder.encode(qa[0]);
assertThat(decoderEncoder.decode(encodedSuffix), equalTo(qa[1])); MatcherAssert.assertThat(decoderEncoder.decode(encodedSuffix), equalTo(qa[1]));
} }
s = bufferedReader.readLine(); s = bufferedReader.readLine();
} }
@ -70,12 +71,12 @@ public class RussianLetterDecoderEncoderTest {
@Test @Test
public void testShouldCorrectDecodeEncodeStringToArray() throws IOException { public void testShouldCorrectDecodeEncodeStringToArray() throws IOException {
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data-for-array.txt"); InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data-for-array.txt");
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
String s = bufferedReader.readLine(); String s = bufferedReader.readLine();
while (s != null) { while (s != null) {
String[] qa = s.trim().split(" "); String[] qa = s.trim().split(" ");
int[] ecodedSuffix = decoderEncoder.encodeToArray(qa[0]); int[] ecodedSuffix = decoderEncoder.encodeToArray(qa[0]);
assertThat(decoderEncoder.decodeArray(ecodedSuffix), equalTo(qa[1])); MatcherAssert.assertThat(decoderEncoder.decodeArray(ecodedSuffix), equalTo(qa[1]));
s = bufferedReader.readLine(); s = bufferedReader.readLine();
} }
} }

View File

@ -30,7 +30,7 @@ import java.util.Map;
public class MorphologyFilterFactoryTest { public class MorphologyFilterFactoryTest {
private static final String LANGUAGE_KEY = "language"; private static final String LANGUAGE_KEY = "language";
private ResourceLoader loader = new ClasspathResourceLoader(); private ResourceLoader loader = new ClasspathResourceLoader(MorphologyFilterFactoryTest.class);
private Map<String, String> args; private Map<String, String> args;
@Before @Before