Update to JDK 11: fix migration issues, some typos and deprecated APIs; bump Lucene up to 8.11
This commit is contained in:
@ -6,7 +6,6 @@
|
||||
<version>1.5</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<artifactId>dictionary-reader</artifactId>
|
||||
<name>dictionary-reader</name>
|
||||
<version>1.5</version>
|
||||
|
@ -22,20 +22,19 @@ import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
|
||||
/**
|
||||
* This class contains the logic to read a
|
||||
* dictonary and produce word with it all forms.
|
||||
* dictionary and produce each word with all of its forms.
|
||||
*/
|
||||
public class DictionaryReader {
|
||||
private String fileName;
|
||||
private String fileEncoding = "windows-1251";
|
||||
private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
|
||||
private Set<String> ignoredForm = new HashSet<String>();
|
||||
private List<List<FlexiaModel>> wordsFlexias = new ArrayList<>();
|
||||
private Set<String> ignoredForm;
|
||||
|
||||
public DictionaryReader(String fileName, Set<String> ignoredForm) {
|
||||
this.fileName = fileName;
|
||||
@ -55,7 +54,7 @@ public class DictionaryReader {
|
||||
|
||||
private void readWords(BufferedReader reader, WordProcessor wordProcessor) throws IOException {
|
||||
String s = reader.readLine();
|
||||
int count = Integer.valueOf(s);
|
||||
int count = Integer.parseInt(s);
|
||||
int actual = 0;
|
||||
for (int i = 0; i < count; i++) {
|
||||
s = reader.readLine();
|
||||
@ -79,7 +78,7 @@ public class DictionaryReader {
|
||||
String wordBase = wd[0].toLowerCase();
|
||||
if (wordBase.startsWith("-")) return null;
|
||||
wordBase = "#".equals(wordBase) ? "" : wordBase;
|
||||
List<FlexiaModel> models = wordsFlexias.get(Integer.valueOf(wd[1]));
|
||||
List<FlexiaModel> models = wordsFlexias.get(Integer.parseInt(wd[1]));
|
||||
FlexiaModel flexiaModel = models.get(0);
|
||||
if (models.size() == 0 || ignoredForm.contains(flexiaModel.getCode())) {
|
||||
return null;
|
||||
@ -96,7 +95,7 @@ public class DictionaryReader {
|
||||
|
||||
private void skipBlock(BufferedReader reader) throws IOException {
|
||||
String s = reader.readLine();
|
||||
int count = Integer.valueOf(s);
|
||||
int count = Integer.parseInt(s);
|
||||
for (int i = 0; i < count; i++) {
|
||||
reader.readLine();
|
||||
}
|
||||
@ -105,7 +104,7 @@ public class DictionaryReader {
|
||||
|
||||
private void readPrefix(BufferedReader reader) throws IOException {
|
||||
String s = reader.readLine();
|
||||
int count = Integer.valueOf(s);
|
||||
int count = Integer.parseInt(s);
|
||||
for (int i = 0; i < count; i++) {
|
||||
reader.readLine();
|
||||
}
|
||||
@ -113,10 +112,10 @@ public class DictionaryReader {
|
||||
|
||||
private void readFlexias(BufferedReader reader) throws IOException {
|
||||
String s = reader.readLine();
|
||||
int count = Integer.valueOf(s);
|
||||
int count = Integer.parseInt(s);
|
||||
for (int i = 0; i < count; i++) {
|
||||
s = reader.readLine();
|
||||
ArrayList<FlexiaModel> flexiaModelArrayList = new ArrayList<FlexiaModel>();
|
||||
ArrayList<FlexiaModel> flexiaModelArrayList = new ArrayList<>();
|
||||
wordsFlexias.add(flexiaModelArrayList);
|
||||
for (String line : s.split("%")) {
|
||||
addFlexia(flexiaModelArrayList, line);
|
||||
|
@ -16,6 +16,8 @@
|
||||
|
||||
package org.apache.lucene.morphology.dictionary;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Represents information about how a word form is created from its immutable part.
|
||||
*/
|
||||
@ -74,11 +76,9 @@ public class FlexiaModel {
|
||||
|
||||
FlexiaModel that = (FlexiaModel) o;
|
||||
|
||||
if (code != null ? !code.equals(that.code) : that.code != null) return false;
|
||||
if (prefix != null ? !prefix.equals(that.prefix) : that.prefix != null) return false;
|
||||
if (suffix != null ? !suffix.equals(that.suffix) : that.suffix != null) return false;
|
||||
|
||||
return true;
|
||||
if (!Objects.equals(code, that.code)) return false;
|
||||
if (!Objects.equals(prefix, that.prefix)) return false;
|
||||
return Objects.equals(suffix, that.suffix);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -29,8 +29,8 @@ import java.util.Map;
|
||||
public class GrammarReader {
|
||||
private String fileName;
|
||||
private String fileEncoding = "windows-1251";
|
||||
private List<String> grammarInfo = new ArrayList<String>();
|
||||
private Map<String, Integer> inverseIndex = new HashMap<String, Integer>();
|
||||
private List<String> grammarInfo = new ArrayList<>();
|
||||
private Map<String, Integer> inverseIndex = new HashMap<>();
|
||||
|
||||
public GrammarReader(String fileName) throws IOException {
|
||||
this.fileName = fileName;
|
||||
@ -50,7 +50,7 @@ public class GrammarReader {
|
||||
line = line.trim();
|
||||
if (!line.startsWith("//") && line.length() > 0) {
|
||||
String[] strings = line.split(" ", 2);
|
||||
Integer i = grammarInfo.size();
|
||||
int i = grammarInfo.size();
|
||||
inverseIndex.put(strings[0], i);
|
||||
grammarInfo.add(i, strings[1]);
|
||||
}
|
||||
@ -63,7 +63,7 @@ public class GrammarReader {
|
||||
}
|
||||
|
||||
public String[] getGrammarInfoAsArray() {
|
||||
return grammarInfo.toArray(new String[grammarInfo.size()]);
|
||||
return grammarInfo.toArray(new String[0]);
|
||||
}
|
||||
|
||||
public Map<String, Integer> getGrammarInverseIndex() {
|
||||
|
@ -15,7 +15,7 @@
|
||||
*/
|
||||
package org.apache.lucene.morphology.dictionary;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
@ -29,7 +29,7 @@ public class RemoveFlexiaWithPrefixes extends WordFilter {
|
||||
@Override
|
||||
public List<WordCard> transform(WordCard wordCard) {
|
||||
|
||||
List<FlexiaModel> flexiaModelsToRemove = new LinkedList<FlexiaModel>();
|
||||
List<FlexiaModel> flexiaModelsToRemove = new LinkedList<>();
|
||||
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
||||
if (fm.getPrefix().length() > 0) {
|
||||
flexiaModelsToRemove.add(fm);
|
||||
@ -39,6 +39,6 @@ public class RemoveFlexiaWithPrefixes extends WordFilter {
|
||||
wordCard.removeFlexia(fm);
|
||||
}
|
||||
|
||||
return new LinkedList<WordCard>(Arrays.asList(wordCard));
|
||||
return new LinkedList<>(Collections.singletonList(wordCard));
|
||||
}
|
||||
}
|
||||
|
@ -32,13 +32,13 @@ public class RussianAdvSplitterFilter extends WordFilter {
|
||||
|
||||
@Override
|
||||
public List<WordCard> transform(WordCard wordCard) {
|
||||
LinkedList<WordCard> result = new LinkedList<WordCard>();
|
||||
LinkedList<WordCard> result = new LinkedList<>();
|
||||
result.add(wordCard);
|
||||
|
||||
String baseWord = "";
|
||||
String canonicalForm = "";
|
||||
String canonicalSuffix = "";
|
||||
List<FlexiaModel> flexiaModels = new LinkedList<FlexiaModel>();
|
||||
List<FlexiaModel> flexiaModels = new LinkedList<>();
|
||||
for (FlexiaModel flexiaModel : wordCard.getWordsForms()) {
|
||||
if (flexiaModel.getPrefix().length() > 0) {
|
||||
flexiaModels.add(new FlexiaModel(flexiaModel.getCode(), flexiaModel.getSuffix(), ""));
|
||||
|
@ -27,9 +27,9 @@ import java.util.*;
|
||||
|
||||
//todo refactor this class
|
||||
public class StatisticsCollector implements WordProcessor {
|
||||
private TreeMap<String, Set<Heuristic>> inverseIndex = new TreeMap<String, Set<Heuristic>>();
|
||||
private Map<Set<Heuristic>, Integer> ruleInverseIndex = new HashMap<Set<Heuristic>, Integer>();
|
||||
private List<Set<Heuristic>> rules = new ArrayList<Set<Heuristic>>();
|
||||
private TreeMap<String, Set<Heuristic>> inverseIndex = new TreeMap<>();
|
||||
private Map<Set<Heuristic>, Integer> ruleInverseIndex = new HashMap<>();
|
||||
private List<Set<Heuristic>> rules = new ArrayList<>();
|
||||
private GrammarReader grammarReader;
|
||||
private LetterDecoderEncoder decoderEncoder;
|
||||
|
||||
@ -39,18 +39,14 @@ public class StatisticsCollector implements WordProcessor {
|
||||
this.decoderEncoder = decoderEncoder;
|
||||
}
|
||||
|
||||
public void process(WordCard wordCard) throws IOException {
|
||||
public void process(WordCard wordCard) {
|
||||
cleanWordCard(wordCard);
|
||||
String normalStringMorph = wordCard.getWordsForms().get(0).getCode();
|
||||
|
||||
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
||||
Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph);
|
||||
String form = revertWord(fm.create(wordCard.getBase()));
|
||||
Set<Heuristic> suffixHeuristics = inverseIndex.get(form);
|
||||
if (suffixHeuristics == null) {
|
||||
suffixHeuristics = new HashSet<Heuristic>();
|
||||
inverseIndex.put(form, suffixHeuristics);
|
||||
}
|
||||
Set<Heuristic> suffixHeuristics = inverseIndex.computeIfAbsent(form, k -> new HashSet<>());
|
||||
suffixHeuristics.add(heuristic);
|
||||
}
|
||||
}
|
||||
@ -69,7 +65,7 @@ public class StatisticsCollector implements WordProcessor {
|
||||
|
||||
public void saveHeuristic(String fileName) throws IOException {
|
||||
|
||||
Map<Integer, Integer> dist = new TreeMap<Integer, Integer>();
|
||||
Map<Integer, Integer> dist = new TreeMap<>();
|
||||
Set<Heuristic> prevSet = null;
|
||||
int count = 0;
|
||||
for (String key : inverseIndex.keySet()) {
|
||||
@ -120,11 +116,11 @@ public class StatisticsCollector implements WordProcessor {
|
||||
}
|
||||
|
||||
private String revertWord(String s) {
|
||||
String result = "";
|
||||
StringBuilder result = new StringBuilder();
|
||||
for (int i = 1; i <= s.length(); i++) {
|
||||
result += s.charAt(s.length() - i);
|
||||
result.append(s.charAt(s.length() - i));
|
||||
}
|
||||
return result;
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
|
||||
@ -132,15 +128,15 @@ public class StatisticsCollector implements WordProcessor {
|
||||
String form = fm.create(wordBase);
|
||||
String normalForm = wordBase + canonicalSuffix;
|
||||
Integer length = getCommonLength(form, normalForm);
|
||||
Integer actualSuffixLengh = form.length() - length;
|
||||
int actualSuffixLengh = form.length() - length;
|
||||
String actualNormalSuffix = normalForm.substring(length);
|
||||
Integer integer = grammarReader.getGrammarInverseIndex().get(fm.getCode());
|
||||
Integer nf = grammarReader.getGrammarInverseIndex().get(normalSuffixForm);
|
||||
return new Heuristic((byte) actualSuffixLengh.intValue(), actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue());
|
||||
return new Heuristic((byte) actualSuffixLengh, actualNormalSuffix, (short) integer.intValue(), (short) nf.intValue());
|
||||
}
|
||||
|
||||
public static Integer getCommonLength(String s1, String s2) {
|
||||
Integer length = Math.min(s1.length(), s2.length());
|
||||
int length = Math.min(s1.length(), s2.length());
|
||||
for (int i = 0; i < length; i++) {
|
||||
if (s1.charAt(i) != s2.charAt(i)) return i;
|
||||
}
|
||||
|
@ -26,7 +26,7 @@ public class WordCard {
|
||||
private String canonicalForm;
|
||||
private String base;
|
||||
private String canonicalSuffix;
|
||||
private List<FlexiaModel> wordsForms = new ArrayList<FlexiaModel>();
|
||||
private List<FlexiaModel> wordsForms = new ArrayList<>();
|
||||
|
||||
public WordCard(String canonicalForm, String base, String canonicalSuffix) {
|
||||
this.canonicalForm = canonicalForm;
|
||||
|
@ -17,7 +17,6 @@ package org.apache.lucene.morphology.dictionary;
|
||||
|
||||
import org.apache.lucene.morphology.LetterDecoderEncoder;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
@ -38,7 +37,7 @@ public class WordCleaner extends WordFilter {
|
||||
if (word.contains("-")) return Collections.emptyList();
|
||||
if (!decoderEncoder.checkString(word)) return Collections.emptyList();
|
||||
|
||||
List<FlexiaModel> flexiaModelsToRemove = new LinkedList<FlexiaModel>();
|
||||
List<FlexiaModel> flexiaModelsToRemove = new LinkedList<>();
|
||||
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
||||
if (!decoderEncoder.checkString(fm.create(wordCard.getBase())) || fm.create(wordCard.getBase()).contains("-")) {
|
||||
flexiaModelsToRemove.add(fm);
|
||||
@ -48,6 +47,6 @@ public class WordCleaner extends WordFilter {
|
||||
wordCard.removeFlexia(fm);
|
||||
}
|
||||
|
||||
return new LinkedList<WordCard>(Arrays.asList(wordCard));
|
||||
return new LinkedList<>(Collections.singletonList(wordCard));
|
||||
}
|
||||
}
|
||||
|
@ -23,5 +23,5 @@ import java.io.IOException;
|
||||
*/
|
||||
public interface WordProcessor {
|
||||
|
||||
public void process(WordCard wordCard) throws IOException;
|
||||
void process(WordCard wordCard) throws IOException;
|
||||
}
|
||||
|
@ -17,7 +17,7 @@ package org.apache.lucene.morphology.dictionary;
|
||||
|
||||
import org.apache.lucene.morphology.LetterDecoderEncoder;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
@ -42,7 +42,7 @@ public class WordStringCleaner extends WordFilter {
|
||||
//made correct code
|
||||
m.setCode(m.getCode().substring(0, 2));
|
||||
}
|
||||
return new LinkedList<WordCard>(Arrays.asList(wordCard));
|
||||
return new LinkedList<>(Collections.singletonList(wordCard));
|
||||
}
|
||||
|
||||
|
||||
|
@ -29,7 +29,7 @@ public class EnglishHeuristicBuilder {
|
||||
GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/egramtab.tab");
|
||||
EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
|
||||
|
||||
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>());
|
||||
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<>());
|
||||
|
||||
StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder);
|
||||
WordCleaner wordCleaner = new WordCleaner(decoderEncoder, statisticsCollector);
|
||||
@ -39,4 +39,4 @@ public class EnglishHeuristicBuilder {
|
||||
statisticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info");
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -28,7 +28,7 @@ public class RussianHeuristicBuilder {
|
||||
GrammarReader grammarInfo = new GrammarReader("dictonary/Dicts/Morph/rgramtab.tab");
|
||||
RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();
|
||||
|
||||
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());
|
||||
DictionaryReader dictionaryReader = new DictionaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<>());
|
||||
|
||||
StatisticsCollector statisticsCollector = new StatisticsCollector(grammarInfo, decoderEncoder);
|
||||
WordCleaner wordCleaner = new WordCleaner(decoderEncoder, statisticsCollector);
|
||||
|
@ -23,6 +23,7 @@ import org.apache.lucene.morphology.english.EnglishMorphology;
|
||||
import org.apache.lucene.morphology.russian.RussianLetterDecoderEncoder;
|
||||
import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
|
||||
import org.apache.lucene.morphology.russian.RussianMorphology;
|
||||
import org.hamcrest.MatcherAssert;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
@ -33,7 +34,6 @@ import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
import static org.hamcrest.Matchers.hasItem;
|
||||
import static org.junit.Assert.assertThat;
|
||||
|
||||
|
||||
public class TestAllWords {
|
||||
@ -73,21 +73,19 @@ public class TestAllWords {
|
||||
final List<String> morphInfo = grammarInfo.getGrammarInfo();
|
||||
final Map<String, Integer> inversIndex = grammarInfo.getGrammarInverseIndex();
|
||||
|
||||
DictionaryReader dictionaryReader = new DictionaryReader(pathToDict, new HashSet<String>());
|
||||
DictionaryReader dictionaryReader = new DictionaryReader(pathToDict, new HashSet<>());
|
||||
|
||||
final AtomicLong wordCount = new AtomicLong(0);
|
||||
Long startTime = System.currentTimeMillis();
|
||||
long startTime = System.currentTimeMillis();
|
||||
|
||||
WordProcessor wordProcessor = new WordProcessor() {
|
||||
public void process(WordCard wordCard) throws IOException {
|
||||
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
|
||||
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
||||
String wordForm = wordCard.getBase() + fm.getSuffix();
|
||||
String morph = morphInfo.get(inversIndex.get(fm.getCode()));
|
||||
assertThat(morphology.getMorphInfo(wordForm), hasItem(word + "|" + morph));
|
||||
assertThat(morphology.getNormalForms(wordForm), hasItem(word));
|
||||
wordCount.set(2L + wordCount.get());
|
||||
}
|
||||
WordProcessor wordProcessor = wordCard -> {
|
||||
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
|
||||
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
||||
String wordForm = wordCard.getBase() + fm.getSuffix();
|
||||
String morph = morphInfo.get(inversIndex.get(fm.getCode()));
|
||||
MatcherAssert.assertThat(morphology.getMorphInfo(wordForm), hasItem(word + "|" + morph));
|
||||
MatcherAssert.assertThat(morphology.getNormalForms(wordForm), hasItem(word));
|
||||
wordCount.set(2L + wordCount.get());
|
||||
}
|
||||
};
|
||||
|
||||
@ -123,17 +121,15 @@ public class TestAllWords {
|
||||
|
||||
private void testAllWordForLucene(final LuceneMorphology morphology, LetterDecoderEncoder decoderEncoder, String pathToDic) throws IOException {
|
||||
final AtomicLong wordCount = new AtomicLong(0);
|
||||
Long startTime = System.currentTimeMillis();
|
||||
long startTime = System.currentTimeMillis();
|
||||
|
||||
DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet<String>());
|
||||
WordProcessor wordProcessor = new WordProcessor() {
|
||||
public void process(WordCard wordCard) throws IOException {
|
||||
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
|
||||
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
||||
String wordForm = wordCard.getBase() + fm.getSuffix();
|
||||
assertThat(morphology.getNormalForms(wordForm), hasItem(word));
|
||||
wordCount.set(1L + wordCount.get());
|
||||
}
|
||||
DictionaryReader dictionaryReader = new DictionaryReader(pathToDic, new HashSet<>());
|
||||
WordProcessor wordProcessor = wordCard -> {
|
||||
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
|
||||
for (FlexiaModel fm : wordCard.getWordsForms()) {
|
||||
String wordForm = wordCard.getBase() + fm.getSuffix();
|
||||
MatcherAssert.assertThat(morphology.getNormalForms(wordForm), hasItem(word));
|
||||
wordCount.set(1L + wordCount.get());
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -31,9 +31,11 @@ import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.morphology.english.EnglishLuceneMorphology;
|
||||
import org.apache.lucene.morphology.russian.RussianAnalyzer;
|
||||
import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
|
||||
import org.hamcrest.MatcherAssert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.*;
|
||||
|
||||
import static org.hamcrest.Matchers.equalTo;
|
||||
@ -65,24 +67,24 @@ public class AnalyzersTest extends BaseTokenStreamTestCase {
|
||||
LuceneMorphology englishLuceneMorphology = new EnglishLuceneMorphology();
|
||||
|
||||
MorphologyAnalyzer russianAnalyzer = new MorphologyAnalyzer(russianLuceneMorphology);
|
||||
InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("тест пм тест".getBytes()), "UTF-8");
|
||||
InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("тест пм тест".getBytes()), StandardCharsets.UTF_8);
|
||||
TokenStream stream = russianAnalyzer.tokenStream(null, reader);
|
||||
MorphologyFilter englishFilter = new MorphologyFilter(stream, englishLuceneMorphology);
|
||||
|
||||
englishFilter.reset();
|
||||
while (englishFilter.incrementToken()) {
|
||||
System.out.println(englishFilter.toString());
|
||||
System.out.println(englishFilter);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldProvideCorrectIndentForWordWithMelitaForm() throws IOException {
|
||||
Analyzer morphlogyAnalyzer = new RussianAnalyzer();
|
||||
InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год".getBytes()), "UTF-8");
|
||||
InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год".getBytes()), StandardCharsets.UTF_8);
|
||||
|
||||
TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader);
|
||||
tokenStream.reset();
|
||||
Set<String> foromsOfWine = new HashSet<String>();
|
||||
Set<String> foromsOfWine = new HashSet<>();
|
||||
foromsOfWine.add("вина");
|
||||
foromsOfWine.add("винo");
|
||||
boolean wordSeen = false;
|
||||
@ -90,7 +92,7 @@ public class AnalyzersTest extends BaseTokenStreamTestCase {
|
||||
CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
|
||||
PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class);
|
||||
if(foromsOfWine.contains(charTerm.toString()) && wordSeen){
|
||||
assertThat(position.getPositionIncrement(),equalTo(0));
|
||||
MatcherAssert.assertThat(position.getPositionIncrement(),equalTo(0));
|
||||
}
|
||||
if(foromsOfWine.contains(charTerm.toString())){
|
||||
wordSeen = true;
|
||||
@ -100,18 +102,18 @@ public class AnalyzersTest extends BaseTokenStreamTestCase {
|
||||
|
||||
private void testAnalayzer(Analyzer morphlogyAnalyzer, String answerPath, String testPath) throws IOException {
|
||||
InputStream stream = this.getClass().getResourceAsStream(answerPath);
|
||||
BufferedReader breader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||
BufferedReader breader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
|
||||
String[] strings = breader.readLine().replaceAll(" +", " ").trim().split(" ");
|
||||
HashSet<String> answer = new HashSet<String>(Arrays.asList(strings));
|
||||
HashSet<String> answer = new HashSet<>(Arrays.asList(strings));
|
||||
stream.close();
|
||||
|
||||
stream = this.getClass().getResourceAsStream(testPath);
|
||||
|
||||
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
|
||||
InputStreamReader reader = new InputStreamReader(stream, StandardCharsets.UTF_8);
|
||||
|
||||
TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader);
|
||||
tokenStream.reset();
|
||||
HashSet<String> result = new HashSet<String>();
|
||||
HashSet<String> result = new HashSet<>();
|
||||
while (tokenStream.incrementToken()) {
|
||||
CharTermAttribute attribute1 = tokenStream.getAttribute(CharTermAttribute.class);
|
||||
result.add(attribute1.toString());
|
||||
@ -119,7 +121,7 @@ public class AnalyzersTest extends BaseTokenStreamTestCase {
|
||||
|
||||
stream.close();
|
||||
|
||||
assertThat(result, equalTo(answer));
|
||||
MatcherAssert.assertThat(result, equalTo(answer));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -17,19 +17,20 @@ package org.apache.lucene.morphology;
|
||||
|
||||
import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
|
||||
import org.apache.lucene.morphology.english.EnglishLuceneMorphology;
|
||||
import org.hamcrest.MatcherAssert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import static org.hamcrest.CoreMatchers.equalTo;
|
||||
import static org.junit.Assert.assertThat;
|
||||
|
||||
|
||||
public class LuceneMorphTest {
|
||||
@ -52,14 +53,13 @@ public class LuceneMorphTest {
|
||||
|
||||
private void testMorphology(LuceneMorphology luceneMorph, String pathToTestData) throws IOException {
|
||||
InputStream stream = this.getClass().getResourceAsStream(pathToTestData);
|
||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
|
||||
String s = bufferedReader.readLine();
|
||||
while (s != null) {
|
||||
String[] qa = s.trim().split(" ");
|
||||
Set<String> result = new HashSet<String>();
|
||||
result.addAll(Arrays.asList(qa).subList(1, qa.length));
|
||||
Set<String> stringList = new HashSet<String>(luceneMorph.getNormalForms(qa[0]));
|
||||
assertThat(stringList, equalTo(result));
|
||||
Set<String> result = new HashSet<>(Arrays.asList(qa).subList(1, qa.length));
|
||||
Set<String> stringList = new HashSet<>(luceneMorph.getNormalForms(qa[0]));
|
||||
MatcherAssert.assertThat(stringList, equalTo(result));
|
||||
s = bufferedReader.readLine();
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user