From 96900e334e065b0bc95f5b8e6a3f4f4162c9b69f Mon Sep 17 00:00:00 2001 From: Konstantin Perikov Date: Fri, 10 Dec 2021 18:15:47 +0000 Subject: [PATCH 1/2] update to jdk11, fix migration issues, some typos, deprecated apis, bump up Lucene to 8.11 --- dictionary-reader/pom.xml | 1 - .../dictionary/DictionaryReader.java | 19 ++++----- .../morphology/dictionary/FlexiaModel.java | 10 ++--- .../morphology/dictionary/GrammarReader.java | 8 ++-- .../dictionary/RemoveFlexiaWithPrefixes.java | 6 +-- .../dictionary/RussianAdvSplitterFilter.java | 4 +- .../dictionary/StatisticsCollector.java | 28 ++++++------- .../morphology/dictionary/WordCard.java | 2 +- .../morphology/dictionary/WordCleaner.java | 5 +-- .../morphology/dictionary/WordProcessor.java | 2 +- .../dictionary/WordStringCleaner.java | 4 +- .../generator/EnglishHeuristicBuilder.java | 4 +- .../generator/RussianHeuristicBuilder.java | 2 +- .../java/org/apache/lucene/TestAllWords.java | 42 +++++++++---------- .../lucene/morphology/AnalyzersTest.java | 22 +++++----- .../lucene/morphology/LuceneMorphTest.java | 12 +++--- english/pom.xml | 3 +- .../english/EnglishLetterDecoderEncoder.java | 23 +++++----- .../EnglishLetterDecoderEncoderTest.java | 13 +++--- .../english/stemmer/EnglishStemmerTest.java | 40 +++++++++--------- morph/pom.xml | 1 - .../morphology/BaseLetterDecoderEncoder.java | 8 ++-- .../apache/lucene/morphology/Heuristic.java | 14 +++---- .../morphology/LetterDecoderEncoder.java | 14 +++---- .../lucene/morphology/LuceneMorphology.java | 10 ++--- .../lucene/morphology/MorphologyImpl.java | 27 ++++++------ .../analyzer/MorphologyAnalyzer.java | 2 +- pom.xml | 12 +++--- russian/pom.xml | 3 +- .../russian/RussianLetterDecoderEncoder.java | 24 +++++------ .../RussianLetterDecoderEncoderTest.java | 15 +++---- .../MorphologyFilterFactoryTest.java | 2 +- 32 files changed, 184 insertions(+), 198 deletions(-) diff --git a/dictionary-reader/pom.xml b/dictionary-reader/pom.xml index fb4d65a..abf4f2e 100644 --- a/dictionary-reader/pom.xml +++ b/dictionary-reader/pom.xml @@ -6,7 +6,6 @@ 1.5 4.0.0 - org.apache.lucene.morphology dictionary-reader dictionary-reader 1.5 diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictionaryReader.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictionaryReader.java index 37a4794..27d3da1 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictionaryReader.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/DictionaryReader.java @@ -22,20 +22,19 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; -import java.util.HashSet; import java.util.List; import java.util.Set; /** * This class contain logic how read - * dictonary and produce word with it all forms. + * dictionary and produce word with it all forms. */ public class DictionaryReader { private String fileName; private String fileEncoding = "windows-1251"; - private List> wordsFlexias = new ArrayList>(); - private Set ignoredForm = new HashSet(); + private List> wordsFlexias = new ArrayList<>(); + private Set ignoredForm; public DictionaryReader(String fileName, Set ignoredForm) { this.fileName = fileName; @@ -55,7 +54,7 @@ public class DictionaryReader { private void readWords(BufferedReader reader, WordProcessor wordProcessor) throws IOException { String s = reader.readLine(); - int count = Integer.valueOf(s); + int count = Integer.parseInt(s); int actual = 0; for (int i = 0; i < count; i++) { s = reader.readLine(); @@ -79,7 +78,7 @@ public class DictionaryReader { String wordBase = wd[0].toLowerCase(); if (wordBase.startsWith("-")) return null; wordBase = "#".equals(wordBase) ? "" : wordBase; - List models = wordsFlexias.get(Integer.valueOf(wd[1])); + List models = wordsFlexias.get(Integer.parseInt(wd[1])); FlexiaModel flexiaModel = models.get(0); if (models.size() == 0 || ignoredForm.contains(flexiaModel.getCode())) { return null; @@ -96,7 +95,7 @@ public class DictionaryReader { private void skipBlock(BufferedReader reader) throws IOException { String s = reader.readLine(); - int count = Integer.valueOf(s); + int count = Integer.parseInt(s); for (int i = 0; i < count; i++) { reader.readLine(); } @@ -105,7 +104,7 @@ public class DictionaryReader { private void readPrefix(BufferedReader reader) throws IOException { String s = reader.readLine(); - int count = Integer.valueOf(s); + int count = Integer.parseInt(s); for (int i = 0; i < count; i++) { reader.readLine(); } @@ -113,10 +112,10 @@ public class DictionaryReader { private void readFlexias(BufferedReader reader) throws IOException { String s = reader.readLine(); - int count = Integer.valueOf(s); + int count = Integer.parseInt(s); for (int i = 0; i < count; i++) { s = reader.readLine(); - ArrayList flexiaModelArrayList = new ArrayList(); + ArrayList flexiaModelArrayList = new ArrayList<>(); wordsFlexias.add(flexiaModelArrayList); for (String line : s.split("%")) { addFlexia(flexiaModelArrayList, line); diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/FlexiaModel.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/FlexiaModel.java index 6906a61..510f119 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/FlexiaModel.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/FlexiaModel.java @@ -16,6 +16,8 @@ package org.apache.lucene.morphology.dictionary; +import java.util.Objects; + /** * Represent information of how word form created form it imutible part. */ @@ -74,11 +76,9 @@ public class FlexiaModel { FlexiaModel that = (FlexiaModel) o; - if (code != null ? !code.equals(that.code) : that.code != null) return false; - if (prefix != null ? !prefix.equals(that.prefix) : that.prefix != null) return false; - if (suffix != null ? !suffix.equals(that.suffix) : that.suffix != null) return false; - - return true; + if (!Objects.equals(code, that.code)) return false; + if (!Objects.equals(prefix, that.prefix)) return false; + return Objects.equals(suffix, that.suffix); } @Override diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/GrammarReader.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/GrammarReader.java index d56da6a..43f2ca2 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/GrammarReader.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/GrammarReader.java @@ -29,8 +29,8 @@ import java.util.Map; public class GrammarReader { private String fileName; private String fileEncoding = "windows-1251"; - private List grammarInfo = new ArrayList(); - private Map inverseIndex = new HashMap(); + private List grammarInfo = new ArrayList<>(); + private Map inverseIndex = new HashMap<>(); public GrammarReader(String fileName) throws IOException { this.fileName = fileName; @@ -50,7 +50,7 @@ public class GrammarReader { line = line.trim(); if (!line.startsWith("//") && line.length() > 0) { String[] strings = line.split(" ", 2); - Integer i = grammarInfo.size(); + int i = grammarInfo.size(); inverseIndex.put(strings[0], i); grammarInfo.add(i, strings[1]); } @@ -63,7 +63,7 @@ public class GrammarReader { } public String[] getGrammarInfoAsArray() { - return grammarInfo.toArray(new String[grammarInfo.size()]); + return grammarInfo.toArray(new String[0]); } public Map getGrammarInverseIndex() { diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/RemoveFlexiaWithPrefixes.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/RemoveFlexiaWithPrefixes.java index 612896f..cf96823 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/RemoveFlexiaWithPrefixes.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/RemoveFlexiaWithPrefixes.java @@ -15,7 +15,7 @@ */ package org.apache.lucene.morphology.dictionary; -import java.util.Arrays; +import java.util.Collections; import java.util.LinkedList; import java.util.List; @@ -29,7 +29,7 @@ public class RemoveFlexiaWithPrefixes extends WordFilter { @Override public List transform(WordCard wordCard) { - List flexiaModelsToRemove = new LinkedList(); + List flexiaModelsToRemove = new LinkedList<>(); for (FlexiaModel fm : wordCard.getWordsForms()) { if (fm.getPrefix().length() > 0) { flexiaModelsToRemove.add(fm); @@ -39,6 +39,6 @@ public class RemoveFlexiaWithPrefixes extends WordFilter { wordCard.removeFlexia(fm); } - return new LinkedList(Arrays.asList(wordCard)); + return new LinkedList<>(Collections.singletonList(wordCard)); } } diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/RussianAdvSplitterFilter.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/RussianAdvSplitterFilter.java index 6818190..cf07b77 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/RussianAdvSplitterFilter.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/RussianAdvSplitterFilter.java @@ -32,13 +32,13 @@ public class RussianAdvSplitterFilter extends WordFilter { @Override public List transform(WordCard wordCard) { - LinkedList result = new LinkedList(); + LinkedList result = new LinkedList<>(); result.add(wordCard); String baseWord = ""; String canonicalForm = ""; String canonicalSuffix = ""; - List flexiaModels = new LinkedList(); + List flexiaModels = new LinkedList<>(); for (FlexiaModel flexiaModel : wordCard.getWordsForms()) { if (flexiaModel.getPrefix().length() > 0) { flexiaModels.add(new FlexiaModel(flexiaModel.getCode(), flexiaModel.getSuffix(), "")); diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/StatisticsCollector.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/StatisticsCollector.java index be65edc..9cf3882 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/StatisticsCollector.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/StatisticsCollector.java @@ -27,9 +27,9 @@ import java.util.*; //todo made refactoring this class public class StatisticsCollector implements WordProcessor { - private TreeMap> inverseIndex = new TreeMap>(); - private Map, Integer> ruleInverseIndex = new HashMap, Integer>(); - private List> rules = new ArrayList>(); + private TreeMap> inverseIndex = new TreeMap<>(); + private Map, Integer> ruleInverseIndex = new HashMap<>(); + private List> rules = new ArrayList<>(); private GrammarReader grammarReader; private LetterDecoderEncoder decoderEncoder; @@ -39,18 +39,14 @@ public class StatisticsCollector implements WordProcessor { this.decoderEncoder = decoderEncoder; } - public void process(WordCard wordCard) throws IOException { + public void process(WordCard wordCard) { cleanWordCard(wordCard); String normalStringMorph = wordCard.getWordsForms().get(0).getCode(); for (FlexiaModel fm : wordCard.getWordsForms()) { Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph); String form = revertWord(fm.create(wordCard.getBase())); - Set suffixHeuristics = inverseIndex.get(form); - if (suffixHeuristics == null) { - suffixHeuristics = new HashSet(); - inverseIndex.put(form, suffixHeuristics); - } + Set suffixHeuristics = inverseIndex.computeIfAbsent(form, k -> new HashSet<>()); suffixHeuristics.add(heuristic); } } @@ -69,7 +65,7 @@ public class StatisticsCollector implements WordProcessor { public void saveHeuristic(String fileName) throws IOException { - Map dist = new TreeMap(); + Map dist = new TreeMap<>(); Set prevSet = null; int count = 0; for (String key : inverseIndex.keySet()) { @@ -120,11 +116,11 @@ public class StatisticsCollector implements WordProcessor { } private String revertWord(String s) { - String result = ""; + StringBuilder result = new StringBuilder(); for (int i = 1; i <= s.length(); i++) { - result += s.charAt(s.length() - i); + result.append(s.charAt(s.length() - i)); } - return result; + return result.toString(); } @@ -132,15 +128,15 @@ public class StatisticsCollector implements WordProcessor { String form = fm.create(wordBase); String normalForm = wordBase + canonicalSuffix; Integer length = getCommonLength(form, normalForm); - Integer actualSuffixLengh = form.length() - length; + int actualSuffixLengh = form.length() - length; String actualNormalSuffix = normalForm.substring(length); Integer integer = grammarReader.getGrammarInverseIndex().get(fm.getCode()); Integer nf = grammarReader.getGrammarInverseIndex().get(normalSuffixForm); - return new Heuristic((byte) actualSuffixLengh.intValue(), actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue()); + return new Heuristic((byte) actualSuffixLengh, actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue()); } public static Integer getCommonLength(String s1, String s2) { - Integer length = Math.min(s1.length(), s2.length()); + int length = Math.min(s1.length(), s2.length()); for (int i = 0; i < length; i++) { if (s1.charAt(i) != s2.charAt(i)) return i; } diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCard.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCard.java index 7d10229..2f958a5 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCard.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCard.java @@ -26,7 +26,7 @@ public class WordCard { private String canonicalForm; private String base; private String canonicalSuffix; - private List wordsForms = new ArrayList(); + private List wordsForms = new ArrayList<>(); public WordCard(String canonicalForm, String base, String canonicalSuffix) { this.canonicalForm = canonicalForm; diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCleaner.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCleaner.java index 4e6ae45..6fa6484 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCleaner.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordCleaner.java @@ -17,7 +17,6 @@ package org.apache.lucene.morphology.dictionary; import org.apache.lucene.morphology.LetterDecoderEncoder; -import java.util.Arrays; import java.util.Collections; import java.util.LinkedList; import java.util.List; @@ -38,7 +37,7 @@ public class WordCleaner extends WordFilter { if (word.contains("-")) return Collections.emptyList(); if (!decoderEncoder.checkString(word)) return Collections.emptyList(); - List flexiaModelsToRemove = new LinkedList(); + List flexiaModelsToRemove = new LinkedList<>(); for (FlexiaModel fm : wordCard.getWordsForms()) { if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) { flexiaModelsToRemove.add(fm); @@ -48,6 +47,6 @@ public class WordCleaner extends WordFilter { wordCard.removeFlexia(fm); } - return new LinkedList(Arrays.asList(wordCard)); + return new LinkedList<>(Collections.singletonList(wordCard)); } } diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordProcessor.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordProcessor.java index 709bc7a..69cd9fe 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordProcessor.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordProcessor.java @@ -23,5 +23,5 @@ import java.io.IOException; */ public interface WordProcessor { - public void process(WordCard wordCard) throws IOException; + void process(WordCard wordCard) throws IOException; } diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordStringCleaner.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordStringCleaner.java index d6f2c2e..3abb28c 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordStringCleaner.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/dictionary/WordStringCleaner.java @@ -17,7 +17,7 @@ package org.apache.lucene.morphology.dictionary; import org.apache.lucene.morphology.LetterDecoderEncoder; -import java.util.Arrays; +import java.util.Collections; import java.util.LinkedList; import java.util.List; @@ -42,7 +42,7 @@ public class WordStringCleaner extends WordFilter { //made correct code m.setCode(m.getCode().substring(0, 2)); } - return new LinkedList(Arrays.asList(wordCard)); + return new LinkedList<>(Collections.singletonList(wordCard)); } diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/EnglishHeuristicBuilder.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/EnglishHeuristicBuilder.java index db65d15..5370d31 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/EnglishHeuristicBuilder.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/EnglishHeuristicBuilder.java @@ -29,7 +29,7 @@ public class EnglishHeuristicBuilder { GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/egramtab.tab"); EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder(); - DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet()); + DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<>()); StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder); WordCleaner wordCleaner = new WordCleaner(decoderEncoder, statisticsCollector); @@ -39,4 +39,4 @@ public class EnglishHeuristicBuilder { statisticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info"); } -} \ No newline at end of file +} diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianHeuristicBuilder.java b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianHeuristicBuilder.java index 9c68d88..2d1273d 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianHeuristicBuilder.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morphology/generator/RussianHeuristicBuilder.java @@ -28,7 +28,7 @@ public class RussianHeuristicBuilder { GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/rgramtab.tab"); RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder(); - DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet()); + DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<>()); StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder); WordCleaner wordCleaner = new WordCleaner(decoderEncoder, statisticsCollector); diff --git a/dictionary-reader/src/test/java/org/apache/lucene/TestAllWords.java b/dictionary-reader/src/test/java/org/apache/lucene/TestAllWords.java index 7ab67db..f58e5d9 100644 --- a/dictionary-reader/src/test/java/org/apache/lucene/TestAllWords.java +++ b/dictionary-reader/src/test/java/org/apache/lucene/TestAllWords.java @@ -23,6 +23,7 @@ import org.apache.lucene.morphology.english.EnglishMorphology; import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder; import org.apache.lucene.morphology.russian.RussianLuceneMorphology; import org.apache.lucene.morphology.russian.RussianMorphology; +import org.hamcrest.MatcherAssert; import org.junit.Before; import org.junit.Test; @@ -33,7 +34,6 @@ import java.util.Map; import java.util.concurrent.atomic.AtomicLong; import static org.hamcrest.Matchers.hasItem; -import static org.junit.Assert.assertThat; public class TestAllWords { @@ -73,21 +73,19 @@ public class TestAllWords { final List morphInfo = grammarInfo.getGrammarInfo(); final Map inversIndex = grammarInfo.getGrammarInverseIndex(); - DictionaryReader dictionaryReader = new DictionaryReader(pathToDict, new HashSet()); + DictionaryReader dictionaryReader = new DictionaryReader(pathToDict, new HashSet<>()); final AtomicLong wordCount = new AtomicLong(0); - Long startTime = System.currentTimeMillis(); + long startTime = System.currentTimeMillis(); - WordProcessor wordProcessor = new WordProcessor() { - public void process(WordCard wordCard) throws IOException { - String word = wordCard.getBase() + wordCard.getCanonicalSuffix(); - for (FlexiaModel fm : wordCard.getWordsForms()) { - String wordForm = wordCard.getBase() + fm.getSuffix(); - String morph = morphInfo.get(inversIndex.get(fm.getCode())); - assertThat(morphology.getMorphInfo(wordForm), hasItem(word + "|" + morph)); - assertThat(morphology.getNormalForms(wordForm), hasItem(word)); - wordCount.set(2L + wordCount.get()); - } + WordProcessor wordProcessor = wordCard -> { + String word = wordCard.getBase() + wordCard.getCanonicalSuffix(); + for (FlexiaModel fm : wordCard.getWordsForms()) { + String wordForm = wordCard.getBase() + fm.getSuffix(); + String morph = morphInfo.get(inversIndex.get(fm.getCode())); + MatcherAssert.assertThat(morphology.getMorphInfo(wordForm), hasItem(word + "|" + morph)); + MatcherAssert.assertThat(morphology.getNormalForms(wordForm), hasItem(word)); + wordCount.set(2L + wordCount.get()); } }; @@ -123,17 +121,15 @@ public class TestAllWords { private void testAllWordForLucene(final LuceneMorphology morphology, LetterDecoderEncoder decoderEncoder, String pathToDic) throws IOException { final AtomicLong wordCount = new AtomicLong(0); - Long startTime = System.currentTimeMillis(); + long startTime = System.currentTimeMillis(); - DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet()); - WordProcessor wordProcessor = new WordProcessor() { - public void process(WordCard wordCard) throws IOException { - String word = wordCard.getBase() + wordCard.getCanonicalSuffix(); - for (FlexiaModel fm : wordCard.getWordsForms()) { - String wordForm = wordCard.getBase() + fm.getSuffix(); - assertThat(morphology.getNormalForms(wordForm), hasItem(word)); - wordCount.set(1L + wordCount.get()); - } + DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet<>()); + WordProcessor wordProcessor = wordCard -> { + String word = wordCard.getBase() + wordCard.getCanonicalSuffix(); + for (FlexiaModel fm : wordCard.getWordsForms()) { + String wordForm = wordCard.getBase() + fm.getSuffix(); + MatcherAssert.assertThat(morphology.getNormalForms(wordForm), hasItem(word)); + wordCount.set(1L + wordCount.get()); } }; diff --git a/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalyzersTest.java b/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalyzersTest.java index 02475ae..727b8c9 100644 --- a/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalyzersTest.java +++ b/dictionary-reader/src/test/java/org/apache/lucene/morphology/AnalyzersTest.java @@ -31,9 +31,11 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.morphology.english.EnglishLuceneMorphology; import org.apache.lucene.morphology.russian.RussianAnalyzer; import org.apache.lucene.morphology.russian.RussianLuceneMorphology; +import org.hamcrest.MatcherAssert; import org.junit.Test; import java.io.*; +import java.nio.charset.StandardCharsets; import java.util.*; import static org.hamcrest.Matchers.equalTo; @@ -65,24 +67,24 @@ public class AnalyzersTest extends BaseTokenStreamTestCase { LuceneMorphology englishLuceneMorphology = new EnglishLuceneMorphology(); MorphologyAnalyzer russianAnalyzer = new MorphologyAnalyzer(russianLuceneMorphology); - InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("тест пм тест".getBytes()), "UTF-8"); + InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("тест пм тест".getBytes()), StandardCharsets.UTF_8); TokenStream stream = russianAnalyzer.tokenStream(null, reader); MorphologyFilter englishFilter = new MorphologyFilter(stream, englishLuceneMorphology); englishFilter.reset(); while (englishFilter.incrementToken()) { - System.out.println(englishFilter.toString()); + System.out.println(englishFilter); } } @Test public void shouldProvideCorrectIndentForWordWithMelitaForm() throws IOException { Analyzer morphlogyAnalyzer = new RussianAnalyzer(); - InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год".getBytes()), "UTF-8"); + InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год".getBytes()), StandardCharsets.UTF_8); TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader); tokenStream.reset(); - Set foromsOfWine = new HashSet(); + Set foromsOfWine = new HashSet<>(); foromsOfWine.add("вина"); foromsOfWine.add("винo"); boolean wordSeen = false; @@ -90,7 +92,7 @@ public class AnalyzersTest extends BaseTokenStreamTestCase { CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class); PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class); if(foromsOfWine.contains(charTerm.toString()) && wordSeen){ - assertThat(position.getPositionIncrement(),equalTo(0)); + MatcherAssert.assertThat(position.getPositionIncrement(),equalTo(0)); } if(foromsOfWine.contains(charTerm.toString())){ wordSeen = true; @@ -100,18 +102,18 @@ public class AnalyzersTest extends BaseTokenStreamTestCase { private void testAnalayzer(Analyzer morphlogyAnalyzer, String answerPath, String testPath) throws IOException { InputStream stream = this.getClass().getResourceAsStream(answerPath); - BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); + BufferedReader breader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)); String[] strings = breader.readLine().replaceAll(" +", " ").trim().split(" "); - HashSet answer = new HashSet(Arrays.asList(strings)); + HashSet answer = new HashSet<>(Arrays.asList(strings)); stream.close(); stream = this.getClass().getResourceAsStream(testPath); - InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); + InputStreamReader reader = new InputStreamReader(stream, StandardCharsets.UTF_8); TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader); tokenStream.reset(); - HashSet result = new HashSet(); + HashSet result = new HashSet<>(); while (tokenStream.incrementToken()) { CharTermAttribute attribute1 = tokenStream.getAttribute(CharTermAttribute.class); result.add(attribute1.toString()); @@ -119,7 +121,7 @@ public class AnalyzersTest extends BaseTokenStreamTestCase { stream.close(); - assertThat(result, equalTo(answer)); + MatcherAssert.assertThat(result, equalTo(answer)); } @Test diff --git a/dictionary-reader/src/test/java/org/apache/lucene/morphology/LuceneMorphTest.java b/dictionary-reader/src/test/java/org/apache/lucene/morphology/LuceneMorphTest.java index 838a7e7..c6d5899 100644 --- a/dictionary-reader/src/test/java/org/apache/lucene/morphology/LuceneMorphTest.java +++ b/dictionary-reader/src/test/java/org/apache/lucene/morphology/LuceneMorphTest.java @@ -17,19 +17,20 @@ package org.apache.lucene.morphology; import org.apache.lucene.morphology.russian.RussianLuceneMorphology; import org.apache.lucene.morphology.english.EnglishLuceneMorphology; +import org.hamcrest.MatcherAssert; import org.junit.Test; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Set; import static org.hamcrest.CoreMatchers.equalTo; -import static org.junit.Assert.assertThat; public class LuceneMorphTest { @@ -52,14 +53,13 @@ public class LuceneMorphTest { private void testMorphology(LuceneMorphology luceneMorph, String pathToTestData) throws IOException { InputStream stream = this.getClass().getResourceAsStream(pathToTestData); - BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); + BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)); String s = bufferedReader.readLine(); while (s != null) { String[] qa = s.trim().split(" "); - Set result = new HashSet(); - result.addAll(Arrays.asList(qa).subList(1, qa.length)); - Set stringList = new HashSet(luceneMorph.getNormalForms(qa[0])); - assertThat(stringList, equalTo(result)); + Set result = new HashSet<>(Arrays.asList(qa).subList(1, qa.length)); + Set stringList = new HashSet<>(luceneMorph.getNormalForms(qa[0])); + MatcherAssert.assertThat(stringList, equalTo(result)); s = bufferedReader.readLine(); } } diff --git a/english/pom.xml b/english/pom.xml index 480eedb..64bac77 100644 --- a/english/pom.xml +++ b/english/pom.xml @@ -6,7 +6,6 @@ 1.5 4.0.0 - org.apache.lucene.morphology english english 1.5 @@ -20,4 +19,4 @@ - \ No newline at end of file + diff --git a/english/src/main/java/org/apache/lucene/morphology/english/EnglishLetterDecoderEncoder.java b/english/src/main/java/org/apache/lucene/morphology/english/EnglishLetterDecoderEncoder.java index 46b0f85..9f12a9f 100644 --- a/english/src/main/java/org/apache/lucene/morphology/english/EnglishLetterDecoderEncoder.java +++ b/english/src/main/java/org/apache/lucene/morphology/english/EnglishLetterDecoderEncoder.java @@ -32,7 +32,7 @@ public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder { if (string.length() > 6) throw new SuffixToLongException("Suffix length should not be greater then " + 12); int result = 0; for (int i = 0; i < string.length(); i++) { - int c = 0 + string.charAt(i) - ENGLISH_SMALL_LETTER_OFFSET; + int c = string.charAt(i) - ENGLISH_SMALL_LETTER_OFFSET; if (c == 45 - ENGLISH_SMALL_LETTER_OFFSET) { c = DASH_CODE; } @@ -48,7 +48,7 @@ public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder { public int[] encodeToArray(String s) { - ArrayList integers = new ArrayList(); + ArrayList integers = new ArrayList<>(); while (s.length() > 6) { integers.add(encode(s.substring(0, 6))); s = s.substring(6); @@ -64,16 +64,16 @@ public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder { } public String decodeArray(int[] array) { - String result = ""; + StringBuilder result = new StringBuilder(); for (int i : array) { - result += decode(i); + result.append(decode(i)); } - return result; + return result.toString(); } public String decode(Integer suffixN) { - String result = ""; + StringBuilder result = new StringBuilder(); while (suffixN > 27) { int c = suffixN % 28 + ENGLISH_SMALL_LETTER_OFFSET; if (c == ENGLISH_SMALL_LETTER_OFFSET) { @@ -81,21 +81,20 @@ public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder { continue; } if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR; - result = (char) c + result; + result.insert(0, (char) c); suffixN /= 28; } long c = suffixN + ENGLISH_SMALL_LETTER_OFFSET; if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR; - result = (char) c + result; - return result; + result.insert(0, (char) c); + return result.toString(); } public boolean checkCharacter(char c) { - int code = 0 + c; + int code = c; if (code == 45) return true; code -= ENGLISH_SMALL_LETTER_OFFSET; - if (code > 0 && code < 27) return true; - return false; + return code > 0 && code < 27; } diff --git a/english/src/test/java/org/apache/lucene/morphology/english/EnglishLetterDecoderEncoderTest.java b/english/src/test/java/org/apache/lucene/morphology/english/EnglishLetterDecoderEncoderTest.java index bc6f872..55c0aa4 100644 --- a/english/src/test/java/org/apache/lucene/morphology/english/EnglishLetterDecoderEncoderTest.java +++ b/english/src/test/java/org/apache/lucene/morphology/english/EnglishLetterDecoderEncoderTest.java @@ -16,7 +16,8 @@ package org.apache.lucene.morphology.english; import static org.hamcrest.core.IsEqual.equalTo; -import static org.junit.Assert.assertThat; + +import org.hamcrest.MatcherAssert; import org.junit.Before; @@ -30,11 +31,11 @@ public class EnglishLetterDecoderEncoderTest { @org.junit.Test public void testDecodeEncodeToArray() { - assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("abcdefghijklmnopqrstuvwxyz")), equalTo("abcdefghijklmnopqrstuvwxyz")); - assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("xyz")), equalTo("xyz")); - assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrty")), equalTo("ytrrty")); - assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrtyz")), equalTo("ytrrtyz")); - assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrtyzqwqwe")), equalTo("ytrrtyzqwqwe")); + MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("abcdefghijklmnopqrstuvwxyz")), equalTo("abcdefghijklmnopqrstuvwxyz")); + MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("xyz")), equalTo("xyz")); + MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrty")), equalTo("ytrrty")); + MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrtyz")), equalTo("ytrrtyz")); + MatcherAssert.assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrtyzqwqwe")), equalTo("ytrrtyzqwqwe")); } } diff --git a/english/src/test/java/org/apache/lucene/morphology/english/stemmer/EnglishStemmerTest.java b/english/src/test/java/org/apache/lucene/morphology/english/stemmer/EnglishStemmerTest.java index 7899279..134857c 100644 --- a/english/src/test/java/org/apache/lucene/morphology/english/stemmer/EnglishStemmerTest.java +++ b/english/src/test/java/org/apache/lucene/morphology/english/stemmer/EnglishStemmerTest.java @@ -16,9 +16,9 @@ package org.apache.lucene.morphology.english.stemmer; import org.apache.lucene.morphology.english.EnglishLuceneMorphology; +import org.hamcrest.MatcherAssert; import org.junit.Test; import static org.hamcrest.core.IsEqual.equalTo; -import static org.junit.Assert.assertThat; public class EnglishStemmerTest { @@ -26,24 +26,24 @@ public class EnglishStemmerTest { public void testGetStemmedWord() throws Exception { EnglishLuceneMorphology englishLuceneMorphology = new EnglishLuceneMorphology(); EnglishStemmer englishStemmer = new EnglishStemmer(englishLuceneMorphology); - assertThat(englishStemmer.getStemmedWord("running"),equalTo("run")); - assertThat(englishStemmer.getStemmedWord("run"),equalTo("run")); - assertThat(englishStemmer.getStemmedWord("killed"),equalTo("kill")); - assertThat(englishStemmer.getStemmedWord("kill"),equalTo("kill")); - assertThat(englishStemmer.getStemmedWord("networking"),equalTo("network")); - assertThat(englishStemmer.getStemmedWord("network"),equalTo("network")); - assertThat(englishStemmer.getStemmedWord("statistics"),equalTo("statistic")); - assertThat(englishStemmer.getStemmedWord("statistic"),equalTo("statistic")); - assertThat(englishStemmer.getStemmedWord("stats"),equalTo("stat")); - assertThat(englishStemmer.getStemmedWord("stat"),equalTo("stat")); - assertThat(englishStemmer.getStemmedWord("countries"),equalTo("country")); - assertThat(englishStemmer.getStemmedWord("country"),equalTo("country")); - assertThat(englishStemmer.getStemmedWord("delete"),equalTo("delete")); - assertThat(englishStemmer.getStemmedWord("ended"),equalTo("end")); - assertThat(englishStemmer.getStemmedWord("end"),equalTo("end")); - assertThat(englishStemmer.getStemmedWord("ends"),equalTo("end")); - assertThat(englishStemmer.getStemmedWord("given"),equalTo("give")); - assertThat(englishStemmer.getStemmedWord("give"),equalTo("give")); - assertThat(englishStemmer.getStemmedWord("log4j"),equalTo("log4j")); + MatcherAssert.assertThat(englishStemmer.getStemmedWord("running"),equalTo("run")); + MatcherAssert.assertThat(englishStemmer.getStemmedWord("run"),equalTo("run")); + MatcherAssert.assertThat(englishStemmer.getStemmedWord("killed"),equalTo("kill")); + MatcherAssert.assertThat(englishStemmer.getStemmedWord("kill"),equalTo("kill")); + MatcherAssert.assertThat(englishStemmer.getStemmedWord("networking"),equalTo("network")); + MatcherAssert.assertThat(englishStemmer.getStemmedWord("network"),equalTo("network")); + MatcherAssert.assertThat(englishStemmer.getStemmedWord("statistics"),equalTo("statistic")); + MatcherAssert.assertThat(englishStemmer.getStemmedWord("statistic"),equalTo("statistic")); + MatcherAssert.assertThat(englishStemmer.getStemmedWord("stats"),equalTo("stat")); + MatcherAssert.assertThat(englishStemmer.getStemmedWord("stat"),equalTo("stat")); + MatcherAssert.assertThat(englishStemmer.getStemmedWord("countries"),equalTo("country")); + MatcherAssert.assertThat(englishStemmer.getStemmedWord("country"),equalTo("country")); + MatcherAssert.assertThat(englishStemmer.getStemmedWord("delete"),equalTo("delete")); + MatcherAssert.assertThat(englishStemmer.getStemmedWord("ended"),equalTo("end")); + MatcherAssert.assertThat(englishStemmer.getStemmedWord("end"),equalTo("end")); + MatcherAssert.assertThat(englishStemmer.getStemmedWord("ends"),equalTo("end")); + MatcherAssert.assertThat(englishStemmer.getStemmedWord("given"),equalTo("give")); + MatcherAssert.assertThat(englishStemmer.getStemmedWord("give"),equalTo("give")); + MatcherAssert.assertThat(englishStemmer.getStemmedWord("log4j"),equalTo("log4j")); } } diff --git a/morph/pom.xml b/morph/pom.xml index b3b0576..9e40c10 100644 --- a/morph/pom.xml +++ b/morph/pom.xml @@ -6,7 +6,6 @@ 1.5 4.0.0 - org.apache.lucene.morphology morph morph 1.5 diff --git a/morph/src/main/java/org/apache/lucene/morphology/BaseLetterDecoderEncoder.java b/morph/src/main/java/org/apache/lucene/morphology/BaseLetterDecoderEncoder.java index 0598d92..3bfd60a 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/BaseLetterDecoderEncoder.java +++ b/morph/src/main/java/org/apache/lucene/morphology/BaseLetterDecoderEncoder.java @@ -21,7 +21,7 @@ import java.util.ArrayList; public abstract class BaseLetterDecoderEncoder implements LetterDecoderEncoder { public int[] encodeToArray(String s) { - ArrayList integers = new ArrayList(); + ArrayList integers = new ArrayList<>(); while (s.length() > 6) { integers.add(encode(s.substring(0, 6))); s = s.substring(6); @@ -37,11 +37,11 @@ public abstract class BaseLetterDecoderEncoder implements LetterDecoderEncoder { } public String decodeArray(int[] array) { - String result = ""; + StringBuilder result = new StringBuilder(); for (int i : array) { - result += decode(i); + result.append(decode(i)); } - return result; + return result.toString(); } public boolean checkString(String word) { diff --git a/morph/src/main/java/org/apache/lucene/morphology/Heuristic.java b/morph/src/main/java/org/apache/lucene/morphology/Heuristic.java index be8ea82..8fe5d62 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/Heuristic.java +++ b/morph/src/main/java/org/apache/lucene/morphology/Heuristic.java @@ -16,6 +16,7 @@ package org.apache.lucene.morphology; import java.io.Serializable; +import java.util.Objects; public class Heuristic implements Serializable { @@ -26,10 +27,10 @@ public class Heuristic implements Serializable { public Heuristic(String s) { String[] strings = s.split("\\|"); - actualSuffixLength = Byte.valueOf(strings[0]); + actualSuffixLength = Byte.parseByte(strings[0]); actualNormalSuffix = strings[1]; - formMorphInfo = Short.valueOf(strings[2]); - normalFormMorphInfo = Short.valueOf(strings[3]); + formMorphInfo = Short.parseShort(strings[2]); + normalFormMorphInfo = Short.parseShort(strings[3]); } public Heuristic(byte actualSuffixLength, String actualNormalSuffix, short formMorphInfo, short normalFormMorphInfo) { @@ -70,15 +71,12 @@ public class Heuristic implements Serializable { if (actualSuffixLength != heuristic.actualSuffixLength) return false; if (formMorphInfo != heuristic.formMorphInfo) return false; if (normalFormMorphInfo != heuristic.normalFormMorphInfo) return false; - if (actualNormalSuffix != null ? !actualNormalSuffix.equals(heuristic.actualNormalSuffix) : heuristic.actualNormalSuffix != null) - return false; - - return true; + return Objects.equals(actualNormalSuffix, heuristic.actualNormalSuffix); } @Override public int hashCode() { - int result = (int) actualSuffixLength; + int result = actualSuffixLength; result = 31 * result + (actualNormalSuffix != null ? actualNormalSuffix.hashCode() : 0); result = 31 * result + (int) formMorphInfo; result = 31 * result + (int) normalFormMorphInfo; diff --git a/morph/src/main/java/org/apache/lucene/morphology/LetterDecoderEncoder.java b/morph/src/main/java/org/apache/lucene/morphology/LetterDecoderEncoder.java index 7e186e1..91ff9ed 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/LetterDecoderEncoder.java +++ b/morph/src/main/java/org/apache/lucene/morphology/LetterDecoderEncoder.java @@ -17,17 +17,17 @@ package org.apache.lucene.morphology; public interface LetterDecoderEncoder { - public Integer encode(String string); + Integer encode(String string); - public int[] encodeToArray(String s); + int[] encodeToArray(String s); - public String decodeArray(int[] array); + String decodeArray(int[] array); - public String decode(Integer suffixN); + String decode(Integer suffixN); - public boolean checkCharacter(char c); + boolean checkCharacter(char c); - public boolean checkString(String word); + boolean checkString(String word); - public String cleanString(String s); + String cleanString(String s); } diff --git a/morph/src/main/java/org/apache/lucene/morphology/LuceneMorphology.java b/morph/src/main/java/org/apache/lucene/morphology/LuceneMorphology.java index 7e97b6b..96b4cc4 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/LuceneMorphology.java +++ b/morph/src/main/java/org/apache/lucene/morphology/LuceneMorphology.java @@ -34,13 +34,13 @@ public class LuceneMorphology extends MorphologyImpl { protected void readRules(BufferedReader bufferedReader) throws IOException { String s; - Integer amount; + int amount; s = bufferedReader.readLine(); - amount = Integer.valueOf(s); + amount = Integer.parseInt(s); rules = new Heuristic[amount][]; for (int i = 0; i < amount; i++) { String s1 = bufferedReader.readLine(); - Integer ruleLenght = Integer.valueOf(s1); + int ruleLenght = Integer.parseInt(s1); Heuristic[] heuristics = new Heuristic[ruleLenght]; for (int j = 0; j < ruleLenght; j++) { heuristics[j] = new Heuristic(bufferedReader.readLine()); @@ -51,7 +51,7 @@ public class LuceneMorphology extends MorphologyImpl { private Heuristic[] modeifyHeuristic(Heuristic[] heuristics) { - ArrayList result = new ArrayList(); + ArrayList result = new ArrayList<>(); for (Heuristic heuristic : heuristics) { boolean isAdded = true; for (Heuristic ch : result) { @@ -61,7 +61,7 @@ public class LuceneMorphology extends MorphologyImpl { result.add(heuristic); } } - return result.toArray(new Heuristic[result.size()]); + return result.toArray(new Heuristic[0]); } public boolean checkString(String s) { diff --git a/morph/src/main/java/org/apache/lucene/morphology/MorphologyImpl.java b/morph/src/main/java/org/apache/lucene/morphology/MorphologyImpl.java index 9a12d2b..7ed7a0c 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/MorphologyImpl.java +++ b/morph/src/main/java/org/apache/lucene/morphology/MorphologyImpl.java @@ -17,6 +17,7 @@ package org.apache.lucene.morphology; import java.io.*; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; @@ -47,7 +48,7 @@ public class MorphologyImpl implements Morphology { } public List getNormalForms(String s) { - ArrayList result = new ArrayList(); + ArrayList result = new ArrayList<>(); int[] ints = decoderEncoder.encodeToArray(revertWord(s)); int ruleId = findRuleId(ints); boolean notSeenEmptyString = true; @@ -64,7 +65,7 @@ public class MorphologyImpl implements Morphology { } public List getMorphInfo(String s) { - ArrayList result = new ArrayList(); + ArrayList result = new ArrayList<>(); int[] ints = decoderEncoder.encodeToArray(revertWord(s)); int ruleId = findRuleId(ints); for (Heuristic h : rules[rulesId[ruleId]]) { @@ -100,14 +101,14 @@ public class MorphologyImpl implements Morphology { private int compareToInts(int[] i1, int[] i2) { int minLength = Math.min(i1.length, i2.length); for (int i = 0; i < minLength; i++) { - int i3 = i1[i] < i2[i] ? -1 : (i1[i] == i2[i] ? 0 : 1); + int i3 = Integer.compare(i1[i], i2[i]); if (i3 != 0) return i3; } return i1.length - i2.length; } public void writeToFile(String fileName) throws IOException { - OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8"); + OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8); writer.write(separators.length + "\n"); for (int[] i : separators) { writer.write(i.length + "\n"); @@ -138,7 +139,7 @@ public class MorphologyImpl implements Morphology { } private void readFromInputStream(InputStream inputStream) throws IOException { - BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8")); + BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8)); String s = bufferedReader.readLine(); Integer amount = Integer.valueOf(s); @@ -153,9 +154,9 @@ public class MorphologyImpl implements Morphology { private void readGrammaInfo(BufferedReader bufferedReader) throws IOException { String s; - Integer amount; + int amount; s = bufferedReader.readLine(); - amount = Integer.valueOf(s); + amount = Integer.parseInt(s); grammarInfo = new String[amount]; for (int i = 0; i < amount; i++) { grammarInfo[i] = bufferedReader.readLine(); @@ -164,13 +165,13 @@ public class MorphologyImpl implements Morphology { protected void readRules(BufferedReader bufferedReader) throws IOException { String s; - Integer amount; + int amount; s = bufferedReader.readLine(); - amount = Integer.valueOf(s); + amount = Integer.parseInt(s); rules = new Heuristic[amount][]; for (int i = 0; i < amount; i++) { String s1 = bufferedReader.readLine(); - Integer ruleLength = Integer.valueOf(s1); + int ruleLength = Integer.parseInt(s1); rules[i] = new Heuristic[ruleLength]; for (int j = 0; j < ruleLength; j++) { rules[i][j] = new Heuristic(bufferedReader.readLine()); @@ -182,7 +183,7 @@ public class MorphologyImpl implements Morphology { rulesId = new short[amount]; for (int i = 0; i < amount; i++) { String s1 = bufferedReader.readLine(); - rulesId[i] = Short.valueOf(s1); + rulesId[i] = Short.parseShort(s1); } } @@ -190,10 +191,10 @@ public class MorphologyImpl implements Morphology { separators = new int[amount][]; for (int i = 0; i < amount; i++) { String s1 = bufferedReader.readLine(); - Integer wordLenght = Integer.valueOf(s1); + int wordLenght = Integer.parseInt(s1); separators[i] = new int[wordLenght]; for (int j = 0; j < wordLenght; j++) { - separators[i][j] = Integer.valueOf(bufferedReader.readLine()); + separators[i][j] = Integer.parseInt(bufferedReader.readLine()); } } } diff --git a/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyAnalyzer.java b/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyAnalyzer.java index 26f2dcb..53401c4 100644 --- a/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyAnalyzer.java +++ b/morph/src/main/java/org/apache/lucene/morphology/analyzer/MorphologyAnalyzer.java @@ -73,6 +73,6 @@ public class MorphologyAnalyzer extends Analyzer { TokenFilter filter = new LowerCaseFilter(src); filter = new MorphologyFilter(filter, luceneMorph); - return new TokenStreamComponents(r -> src.setReader(r), filter); + return new TokenStreamComponents(src::setReader, filter); } } diff --git a/pom.xml b/pom.xml index e539154..ee2163c 100644 --- a/pom.xml +++ b/pom.xml @@ -16,7 +16,7 @@ - 8.7.0 + 8.11.0 1.5 4.13 @@ -46,7 +46,7 @@ org.hamcrest hamcrest-all - 1.1 + 1.3 test @@ -109,8 +109,8 @@ maven-compiler-plugin 3.8.1 - 1.8 - 1.8 + 11 + 11 @@ -147,7 +147,7 @@ maven-source-plugin - 3.0.1 + 3.2.1 attach-sources @@ -159,7 +159,7 @@ maven-javadoc-plugin - 2.10.4 + 3.3.1 attach-javadocs diff --git a/russian/pom.xml b/russian/pom.xml index c29cb67..d7d7ec2 100644 --- a/russian/pom.xml +++ b/russian/pom.xml @@ -6,7 +6,6 @@ 1.5 4.0.0 - org.apache.lucene.morphology russian russian 1.5 @@ -23,7 +22,7 @@ junit junit - 4.13.1 + ${junit.version} test diff --git a/russian/src/main/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoder.java b/russian/src/main/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoder.java index 802be45..649a3a8 100644 --- a/russian/src/main/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoder.java +++ b/russian/src/main/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoder.java @@ -20,7 +20,6 @@ import org.apache.lucene.morphology.LetterDecoderEncoder; import org.apache.lucene.morphology.SuffixToLongException; import org.apache.lucene.morphology.WrongCharaterException; -import java.util.ArrayList; import java.util.LinkedList; /** @@ -42,7 +41,7 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder { throw new SuffixToLongException("Suffix length should not be greater then " + WORD_PART_LENGHT + " " + string); int result = 0; for (int i = 0; i < string.length(); i++) { - int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET; + int c = string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET; if (c == 45 - RUSSIAN_SMALL_LETTER_OFFSET) { c = DASH_CODE; } @@ -58,7 +57,7 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder { } public int[] encodeToArray(String s) { - LinkedList integers = new LinkedList(); + LinkedList integers = new LinkedList<>(); while (s.length() > WORD_PART_LENGHT) { integers.add(encode(s.substring(0, WORD_PART_LENGHT))); s = s.substring(WORD_PART_LENGHT); @@ -74,16 +73,16 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder { } public String decodeArray(int[] array) { - String result = ""; + StringBuilder result = new StringBuilder(); for (int i : array) { - result += decode(i); + result.append(decode(i)); } - return result; + return result.toString(); } public String decode(Integer suffixN) { - String result = ""; + StringBuilder result = new StringBuilder(); while (suffixN > 33) { int c = suffixN % 34 + RUSSIAN_SMALL_LETTER_OFFSET; if (c == RUSSIAN_SMALL_LETTER_OFFSET) { @@ -91,21 +90,20 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder { continue; } if (c == DASH_CODE + RUSSIAN_SMALL_LETTER_OFFSET) c = DASH_CHAR; - result = (char) c + result; + result.insert(0, (char) c); suffixN /= 34; } long c = suffixN + RUSSIAN_SMALL_LETTER_OFFSET; if (c == DASH_CODE + RUSSIAN_SMALL_LETTER_OFFSET) c = DASH_CHAR; - result = (char) c + result; - return result; + result.insert(0, (char) c); + return result.toString(); } public boolean checkCharacter(char c) { - int code = 0 + c; + int code = c; if (code == 45) return true; code -= RUSSIAN_SMALL_LETTER_OFFSET; - if (code > 0 && code < 33) return true; - return false; + return code > 0 && code < 33; } public boolean checkString(String word) { diff --git a/russian/src/test/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoderTest.java b/russian/src/test/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoderTest.java index 73a2d38..d089c48 100644 --- a/russian/src/test/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoderTest.java +++ b/russian/src/test/java/org/apache/lucene/morphology/russian/RussianLetterDecoderEncoderTest.java @@ -17,6 +17,7 @@ package org.apache.lucene.morphology.russian; import org.apache.lucene.morphology.SuffixToLongException; import org.apache.lucene.morphology.WrongCharaterException; +import org.hamcrest.MatcherAssert; import org.junit.Before; import org.junit.Test; @@ -24,9 +25,9 @@ import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; import static org.hamcrest.core.IsEqual.equalTo; -import static org.junit.Assert.assertThat; public class RussianLetterDecoderEncoderTest { private RussianLetterDecoderEncoder decoderEncoder; @@ -40,12 +41,12 @@ public class RussianLetterDecoderEncoderTest { @Test public void testShouldPreserverStringComporision() throws IOException { InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-monotonic.txt"); - BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); + BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)); String s = bufferedReader.readLine(); while (s != null) { String[] qa = s.trim().split(" "); if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT && qa[1].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) { - assertThat(decoderEncoder.encode(qa[1]) > decoderEncoder.encode(qa[0]), equalTo(true)); + MatcherAssert.assertThat(decoderEncoder.encode(qa[1]) > decoderEncoder.encode(qa[0]), equalTo(true)); } s = bufferedReader.readLine(); } @@ -55,13 +56,13 @@ public class RussianLetterDecoderEncoderTest { @Test public void testShouldCorrectDecodeEncode() throws IOException { InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data.txt"); - BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); + BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)); String s = bufferedReader.readLine(); while (s != null) { String[] qa = s.trim().split(" "); if (qa[0].length() <= RussianLetterDecoderEncoder.WORD_PART_LENGHT) { Integer encodedSuffix = decoderEncoder.encode(qa[0]); - assertThat(decoderEncoder.decode(encodedSuffix), equalTo(qa[1])); + MatcherAssert.assertThat(decoderEncoder.decode(encodedSuffix), equalTo(qa[1])); } s = bufferedReader.readLine(); } @@ -70,12 +71,12 @@ public class RussianLetterDecoderEncoderTest { @Test public void testShouldCorrectDecodeEncodeStringToArray() throws IOException { InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/decoder-test-data-for-array.txt"); - BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); + BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)); String s = bufferedReader.readLine(); while (s != null) { String[] qa = s.trim().split(" "); int[] ecodedSuffix = decoderEncoder.encodeToArray(qa[0]); - assertThat(decoderEncoder.decodeArray(ecodedSuffix), equalTo(qa[1])); + MatcherAssert.assertThat(decoderEncoder.decodeArray(ecodedSuffix), equalTo(qa[1])); s = bufferedReader.readLine(); } } diff --git a/solr-morphology-analysis/src/test/java/org/apache/lucene/analysis/morphology/MorphologyFilterFactoryTest.java b/solr-morphology-analysis/src/test/java/org/apache/lucene/analysis/morphology/MorphologyFilterFactoryTest.java index 7cb225a..b6c814a 100644 --- a/solr-morphology-analysis/src/test/java/org/apache/lucene/analysis/morphology/MorphologyFilterFactoryTest.java +++ b/solr-morphology-analysis/src/test/java/org/apache/lucene/analysis/morphology/MorphologyFilterFactoryTest.java @@ -30,7 +30,7 @@ import java.util.Map; public class MorphologyFilterFactoryTest { private static final String LANGUAGE_KEY = "language"; - private ResourceLoader loader = new ClasspathResourceLoader(); + private ResourceLoader loader = new ClasspathResourceLoader(MorphologyFilterFactoryTest.class); private Map args; @Before From d4d083ed977a72afa8eb2252f7d8aac8abc1c92f Mon Sep 17 00:00:00 2001 From: Konstantin Perikov Date: Sat, 11 Dec 2021 11:58:01 +0000 Subject: [PATCH 2/2] make ci to run on PR as well --- .github/workflows/main.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 71b2e6c..7bf0260 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -1,6 +1,6 @@ name: Java CI -on: [push] +on: [push, pull_request] jobs: tests: @@ -32,4 +32,4 @@ jobs: - uses: actions/upload-artifact@v2 with: name: artifacts - path: ${{ github.workspace }}/*/target/*.jar \ No newline at end of file + path: ${{ github.workspace }}/*/target/*.jar