adding english version
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@57 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
parent
1b8ee03cc6
commit
a1e39d750f
@ -19,6 +19,13 @@
|
|||||||
<version>0.7-SNAPSHOT</version>
|
<version>0.7-SNAPSHOT</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.lucene.morpholgy</groupId>
|
||||||
|
<artifactId>english</artifactId>
|
||||||
|
<version>0.7-SNAPSHOT</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.lucene.morpholgy</groupId>
|
<groupId>org.apache.lucene.morpholgy</groupId>
|
||||||
<artifactId>morph</artifactId>
|
<artifactId>morph</artifactId>
|
||||||
|
@ -25,6 +25,7 @@ import java.io.IOException;
|
|||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
|
|
||||||
|
//todo made refactoring thi8s class
|
||||||
public class StatiticsCollector implements WordProccessor {
|
public class StatiticsCollector implements WordProccessor {
|
||||||
private TreeMap<String, Set<Heuristic>> inversIndex = new TreeMap<String, Set<Heuristic>>();
|
private TreeMap<String, Set<Heuristic>> inversIndex = new TreeMap<String, Set<Heuristic>>();
|
||||||
private Map<Set<Heuristic>, Integer> ruleInverIndex = new HashMap<Set<Heuristic>, Integer>();
|
private Map<Set<Heuristic>, Integer> ruleInverIndex = new HashMap<Set<Heuristic>, Integer>();
|
||||||
@ -43,8 +44,10 @@ public class StatiticsCollector implements WordProccessor {
|
|||||||
String normalStringMorph = wordCard.getWordsFroms().get(0).getCode();
|
String normalStringMorph = wordCard.getWordsFroms().get(0).getCode();
|
||||||
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
|
String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
|
||||||
if (word.contains("-")) return;
|
if (word.contains("-")) return;
|
||||||
|
if (!decoderEncoder.checkString(word)) return;
|
||||||
|
|
||||||
for (FlexiaModel fm : wordCard.getWordsFroms()) {
|
for (FlexiaModel fm : wordCard.getWordsFroms()) {
|
||||||
|
if (!decoderEncoder.checkString(fm.create(wordCard.getBase()))) continue;
|
||||||
Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph);
|
Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph);
|
||||||
String form = revertWord(fm.create(wordCard.getBase()));
|
String form = revertWord(fm.create(wordCard.getBase()));
|
||||||
Set<Heuristic> suffixHeuristics = inversIndex.get(form);
|
Set<Heuristic> suffixHeuristics = inversIndex.get(form);
|
||||||
@ -109,7 +112,8 @@ public class StatiticsCollector implements WordProccessor {
|
|||||||
for (String key : inversIndex.keySet()) {
|
for (String key : inversIndex.keySet()) {
|
||||||
Set<Heuristic> currentSet = inversIndex.get(key);
|
Set<Heuristic> currentSet = inversIndex.get(key);
|
||||||
if (!currentSet.equals(prevSet)) {
|
if (!currentSet.equals(prevSet)) {
|
||||||
ints[count] = decoderEncoder.encodeToArray(key);
|
int[] word = decoderEncoder.encodeToArray(key);
|
||||||
|
ints[count] = word;
|
||||||
rulesId[count] = (short) ruleInverIndex.get(currentSet).intValue();
|
rulesId[count] = (short) ruleInverIndex.get(currentSet).intValue();
|
||||||
count++;
|
count++;
|
||||||
prevSet = currentSet;
|
prevSet = currentSet;
|
||||||
|
@ -0,0 +1,42 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.morpholgy.generator;
|
||||||
|
|
||||||
|
import org.apache.lucene.morpholgy.dictionary.DictonaryReader;
|
||||||
|
import org.apache.lucene.morpholgy.dictionary.GrammaReader;
|
||||||
|
import org.apache.lucene.morpholgy.dictionary.StatiticsCollector;
|
||||||
|
import org.apache.lucene.morpholgy.english.EnglishLetterDecoderEncoder;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashSet;
|
||||||
|
|
||||||
|
|
||||||
|
public class EnglishHeuristicBuilder {
|
||||||
|
public static void main(String[] args) throws IOException {
|
||||||
|
//IgnoredFormReader formReader = new IgnoredFormReader("data/igoredFrom.txt");
|
||||||
|
//Set<String> form = formReader.getIngnoredFroms();
|
||||||
|
|
||||||
|
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/egramtab.tab");
|
||||||
|
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>());
|
||||||
|
|
||||||
|
EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();
|
||||||
|
StatiticsCollector statiticsCollector = new StatiticsCollector(grammaInfo, decoderEncoder);
|
||||||
|
dictonaryReader.proccess(statiticsCollector);
|
||||||
|
statiticsCollector.saveHeuristic("english/src/main/resources/org/apache/lucene/morphology/english/morph.info");
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
0
dictonary/Dicts/Morph/Eng/morph.options
Executable file
0
dictonary/Dicts/Morph/Eng/morph.options
Executable file
123
dictonary/Dicts/Morph/egramtab.tab
Executable file
123
dictonary/Dicts/Morph/egramtab.tab
Executable file
@ -0,0 +1,123 @@
|
|||||||
|
aa 1 ADJECTIVE
|
||||||
|
ab 1 ADJECTIVE comp
|
||||||
|
ac 1 ADJECTIVE sup
|
||||||
|
|
||||||
|
// many, more most
|
||||||
|
xi 1 NUMERAL
|
||||||
|
cb 1 NUMERAL comp
|
||||||
|
cc 1 NUMERAL sup
|
||||||
|
|
||||||
|
|
||||||
|
// for adjectives like "English", "Russian"
|
||||||
|
ad 1 ADJECTIVE prop
|
||||||
|
ba 1 ADVERB
|
||||||
|
bb 1 ADVERB comp
|
||||||
|
bc 1 ADVERB sup
|
||||||
|
va 1 VERB inf
|
||||||
|
vb 1 VERB prsa,sg,3
|
||||||
|
vc 1 VERB pasa
|
||||||
|
vd 1 VERB pp
|
||||||
|
ve 1 VERB ing
|
||||||
|
vf 1 MOD inf
|
||||||
|
vh 1 MOD pasa
|
||||||
|
ta 1 VBE inf
|
||||||
|
tb 1 VBE prsa,sg,1
|
||||||
|
td 1 VBE prsa,sg,3
|
||||||
|
te 1 VBE prsa,pl
|
||||||
|
tf 1 VBE ing
|
||||||
|
tg 1 VBE pasa,sg
|
||||||
|
ti 1 VBE pasa,pl
|
||||||
|
tj 1 VBE pp
|
||||||
|
tk 1 VBE fut,1,sg
|
||||||
|
tl 1 VBE fut,sg,pl,1,2,3
|
||||||
|
tm 1 VBE if,sg,1,2
|
||||||
|
tn 1 VBE if,sg,3
|
||||||
|
to 1 VBE if,pl
|
||||||
|
pa 1 PN pers,nom
|
||||||
|
pb 1 PN pers,obj
|
||||||
|
pc 1 PN pers,nom,sg,1
|
||||||
|
pd 1 PN pers,obj,sg,1
|
||||||
|
pe 1 PN pers,nom,2
|
||||||
|
pf 1 PN pers,obj,2
|
||||||
|
pg 1 PN pers,nom,sg,3
|
||||||
|
ph 1 PN pers,obj,sg,3
|
||||||
|
pi 1 PN pers,nom,pl,1
|
||||||
|
pk 1 PN pers,obj,pl,1
|
||||||
|
pl 1 PN pers,nom,pl,3
|
||||||
|
pm 1 PN pers,obj,pl,3
|
||||||
|
da 1 PN ref,sg
|
||||||
|
db 1 PN ref,pl
|
||||||
|
ea 1 PN_ADJ poss
|
||||||
|
eb 1 PN_ADJ poss,pred
|
||||||
|
ec 1 PN_ADJ dem,sg
|
||||||
|
ed 1 PN_ADJ dem,pl
|
||||||
|
ee 1 PN_ADJ
|
||||||
|
ef 1 PRON
|
||||||
|
|
||||||
|
// "table", "town"
|
||||||
|
na 1 NOUN narr,sg
|
||||||
|
nb 1 NOUN narr,pl
|
||||||
|
|
||||||
|
// analytical possessive
|
||||||
|
fa 1 NOUN narr,poss
|
||||||
|
|
||||||
|
// nouns which can be mass and uncount
|
||||||
|
// "silk", "clay"
|
||||||
|
nc 1 NOUN narr,mass,uncount,sg
|
||||||
|
// analytical possessive
|
||||||
|
fb 1 NOUN narr,mass,uncount,poss
|
||||||
|
|
||||||
|
|
||||||
|
// mass nouns
|
||||||
|
// "water", "butter"
|
||||||
|
ne 1 NOUN narr,mass,sg
|
||||||
|
ng 1 NOUN narr,mass,pl
|
||||||
|
// analytical possessive
|
||||||
|
fc 1 NOUN narr,mass,poss
|
||||||
|
|
||||||
|
|
||||||
|
// uncount nouns
|
||||||
|
// "acceleration", "activism"
|
||||||
|
ni 1 NOUN narr,uncount,sg
|
||||||
|
|
||||||
|
|
||||||
|
// "John", "James"
|
||||||
|
oa 1 NOUN prop,m,sg
|
||||||
|
ob 1 NOUN prop,m,pl
|
||||||
|
|
||||||
|
// analytical possessive
|
||||||
|
fd 1 NOUN prop,m,poss
|
||||||
|
|
||||||
|
// "Mary", "Jane"
|
||||||
|
oc 1 NOUN prop,f,sg
|
||||||
|
od 1 NOUN prop,f,pl
|
||||||
|
// analytical possessive
|
||||||
|
fe 1 NOUN prop,f,poss
|
||||||
|
|
||||||
|
// "Glen" "Lee" "Jerry"
|
||||||
|
oe 1 NOUN prop,m,f,sg
|
||||||
|
of 1 NOUN prop,m,f,pl
|
||||||
|
// analytical possessive
|
||||||
|
ff 1 NOUN prop,m,f,poss
|
||||||
|
|
||||||
|
// general geographical names
|
||||||
|
ga 1 NOUN prop
|
||||||
|
// analytical possessive
|
||||||
|
fg 1 NOUN prop,poss
|
||||||
|
|
||||||
|
xa 1 CONJ
|
||||||
|
xb 1 INT
|
||||||
|
xc 1 PREP
|
||||||
|
xd 1 PART
|
||||||
|
xf 1 ARTICLE
|
||||||
|
xi 1 NUMERAL
|
||||||
|
xp 1 ORDNUM
|
||||||
|
yc 1 POSS plsq
|
||||||
|
yd 1 POSS plsgs
|
||||||
|
//‘¯¥æ¨ «ì®¥ áãé¥á⢨⥫쮥 § £«ãèª , ®¬¥à ª®¤ ¨á¯®«ì§ã¥âáï!
|
||||||
|
xx 1 NOUN prop sg pl
|
||||||
|
|
||||||
|
// type ancodes
|
||||||
|
za 1 * geo
|
||||||
|
zb 1 * name
|
||||||
|
zc 1 * org
|
3
dictonary/Dicts/SrcMorph/Eng.mwz
Executable file
3
dictonary/Dicts/SrcMorph/Eng.mwz
Executable file
@ -0,0 +1,3 @@
|
|||||||
|
MRD_FILE EngSrc/morphs.mrd
|
||||||
|
LANG ENGLISH
|
||||||
|
USERS gri,alex,boris,masha,af,oleg,nim
|
105124
dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd
Executable file
105124
dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd
Executable file
File diff suppressed because it is too large
Load Diff
29
english/pom.xml
Normal file
29
english/pom.xml
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
<?xml version="1.0"?>
|
||||||
|
<project>
|
||||||
|
<parent>
|
||||||
|
<artifactId>morpholgy</artifactId>
|
||||||
|
<groupId>org.apache.lucene.morpholgy</groupId>
|
||||||
|
<version>0.7-SNAPSHOT</version>
|
||||||
|
</parent>
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
<groupId>org.apache.lucene.morpholgy</groupId>
|
||||||
|
<artifactId>english</artifactId>
|
||||||
|
<name>english</name>
|
||||||
|
<version>0.7-SNAPSHOT</version>
|
||||||
|
<url>http://maven.apache.org</url>
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.lucene.morpholgy</groupId>
|
||||||
|
<artifactId>morph</artifactId>
|
||||||
|
<version>0.7-SNAPSHOT</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>junit</groupId>
|
||||||
|
<artifactId>junit</artifactId>
|
||||||
|
<version>4.4</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
</project>
|
@ -0,0 +1,116 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.morpholgy.english;
|
||||||
|
|
||||||
|
import org.apache.lucene.morphology.LetterDecoderEncoder;
|
||||||
|
import org.apache.lucene.morphology.SuffixToLongException;
|
||||||
|
import org.apache.lucene.morphology.WrongCharaterException;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
|
||||||
|
|
||||||
|
//todo extract supper class for common method with russian letter decoder
|
||||||
|
public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder {
|
||||||
|
public static final int ENGLISH_SMALL_LETTER_OFFSET = 96;
|
||||||
|
static public int SUFFIX_LENGTH = 6;
|
||||||
|
public static final int DASH_CHAR = 45;
|
||||||
|
public static final int DASH_CODE = 27;
|
||||||
|
|
||||||
|
public Integer encode(String string) {
|
||||||
|
if (string.length() > 6) throw new SuffixToLongException("Suffix length should not be greater then " + 12);
|
||||||
|
int result = 0;
|
||||||
|
for (int i = 0; i < string.length(); i++) {
|
||||||
|
int c = 0 + string.charAt(i) - ENGLISH_SMALL_LETTER_OFFSET;
|
||||||
|
if (c == 45 - ENGLISH_SMALL_LETTER_OFFSET) {
|
||||||
|
c = DASH_CODE;
|
||||||
|
}
|
||||||
|
if (c < 0 || c > 27)
|
||||||
|
throw new WrongCharaterException("Symblo " + string.charAt(i) + " is not small cirillic letter");
|
||||||
|
result = result * 28 + c;
|
||||||
|
}
|
||||||
|
for (int i = string.length(); i < 6; i++) {
|
||||||
|
result *= 28;
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int[] encodeToArray(String s) {
|
||||||
|
|
||||||
|
ArrayList<Integer> integers = new ArrayList<Integer>();
|
||||||
|
while (s.length() > 6) {
|
||||||
|
integers.add(encode(s.substring(0, 6)));
|
||||||
|
s = s.substring(6);
|
||||||
|
}
|
||||||
|
integers.add(encode(s));
|
||||||
|
int[] ints = new int[integers.size()];
|
||||||
|
int pos = 0;
|
||||||
|
for (Integer i : integers) {
|
||||||
|
ints[pos] = i;
|
||||||
|
pos++;
|
||||||
|
}
|
||||||
|
return ints;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String decodeArray(int[] array) {
|
||||||
|
String result = "";
|
||||||
|
for (int i : array) {
|
||||||
|
result += decode(i);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public String decode(Integer suffixN) {
|
||||||
|
String result = "";
|
||||||
|
while (suffixN > 27) {
|
||||||
|
int c = suffixN % 28 + ENGLISH_SMALL_LETTER_OFFSET;
|
||||||
|
if (c == ENGLISH_SMALL_LETTER_OFFSET) {
|
||||||
|
suffixN /= 28;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR;
|
||||||
|
result = (char) c + result;
|
||||||
|
suffixN /= 28;
|
||||||
|
}
|
||||||
|
long c = suffixN + ENGLISH_SMALL_LETTER_OFFSET;
|
||||||
|
if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR;
|
||||||
|
result = (char) c + result;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean checkCharacter(char c) {
|
||||||
|
int code = 0 + c;
|
||||||
|
if (code == 45) return true;
|
||||||
|
code -= ENGLISH_SMALL_LETTER_OFFSET;
|
||||||
|
if (code > 0 && code < 27) return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean checkString(String word) {
|
||||||
|
for (int i = 0; i < word.length(); i++) {
|
||||||
|
if (!checkCharacter(word.charAt(i))) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String cleanString(String s) {
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,40 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.morpholgy.english;
|
||||||
|
|
||||||
|
import static org.hamcrest.core.IsEqual.equalTo;
|
||||||
|
import static org.junit.Assert.assertThat;
|
||||||
|
import org.junit.Before;
|
||||||
|
|
||||||
|
|
||||||
|
public class EnglishLetterDecoderEncoderTest {
|
||||||
|
private EnglishLetterDecoderEncoder decoderEncoder;
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void setUp() {
|
||||||
|
decoderEncoder = new EnglishLetterDecoderEncoder();
|
||||||
|
}
|
||||||
|
|
||||||
|
@org.junit.Test
|
||||||
|
public void testDecodeEncodeToArray() {
|
||||||
|
assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("abcdefghijklmnopqrstuvwxyz")), equalTo("abcdefghijklmnopqrstuvwxyz"));
|
||||||
|
assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("xyz")), equalTo("xyz"));
|
||||||
|
assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrty")), equalTo("ytrrty"));
|
||||||
|
assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrtyz")), equalTo("ytrrtyz"));
|
||||||
|
assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrtyzqwqwe")), equalTo("ytrrtyzqwqwe"));
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,38 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2009 Alexander Kuznetsov
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.morpholgy.english;
|
||||||
|
|
||||||
|
import org.apache.lucene.morphology.LuceneMorph;
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
public class RussianLuceneMorphTest {
|
||||||
|
private LuceneMorph luceneMorph;
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void setUp() throws IOException {
|
||||||
|
luceneMorph = new LuceneMorph(this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void shoudGetCorrentMorphInfo() throws IOException {
|
||||||
|
System.out.println(luceneMorph.getMorhInfo("purchases"));
|
||||||
|
System.out.println(luceneMorph.getMorhInfo("existing"));
|
||||||
|
System.out.println(luceneMorph.getMorhInfo("was"));
|
||||||
|
}
|
||||||
|
}
|
@ -27,5 +27,7 @@ public interface LetterDecoderEncoder {
|
|||||||
|
|
||||||
public boolean checkCharacter(char c);
|
public boolean checkCharacter(char c);
|
||||||
|
|
||||||
|
public boolean checkString(String word);
|
||||||
|
|
||||||
public String cleanString(String s);
|
public String cleanString(String s);
|
||||||
}
|
}
|
||||||
|
@ -109,6 +109,8 @@ public class Morph {
|
|||||||
FileWriter writer = new FileWriter(fileName);
|
FileWriter writer = new FileWriter(fileName);
|
||||||
writer.write(separators.length + "\n");
|
writer.write(separators.length + "\n");
|
||||||
for (int[] i : separators) {
|
for (int[] i : separators) {
|
||||||
|
System.out.println(writer);
|
||||||
|
System.out.println(i);
|
||||||
writer.write(i.length + "\n");
|
writer.write(i.length + "\n");
|
||||||
for (int j : i) {
|
for (int j : i) {
|
||||||
writer.write(j + "\n");
|
writer.write(j + "\n");
|
||||||
|
@ -1,8 +0,0 @@
|
|||||||
пушке А бутявка волит за напушкой Сяпала Калуша по напушке и увазила бутявку И волит Калушата калушаточки Бутявка Калушата присяпали и бутявку стрямкали И подудонились А Калуша волит Бутявка то некузявая Калушата бутявку вычучили Бутявка вздребезнулась сопритюкнулась и усяпала с напушки
|
|
||||||
А Калуша волит:
|
|
||||||
— Бутявок не трямкают. Бутявки дюбые и зюмо-зюмо некузявые. От бутявок дудонятся.
|
|
||||||
А бутявка волит за напушкой:
|
|
||||||
— Калушата подудонились! Калушата подудонились! Зюмо некузявые! Пуськи бятые!
|
|
||||||
В условиях нарастающей пурги было сделано 4 успешных захода на посадку. "Все нормально, будем рекомендовать систему к внедрению".
|
|
||||||
Рейсы из Кейптауна (ЮАР) на станцию "Новолазаревская" (Антарктида) совершаются
|
|
||||||
примерно один раз в две недели. вина твоя вина мне
|
|
@ -1,9 +0,0 @@
|
|||||||
шел идти
|
|
||||||
турестических турестический
|
|
||||||
отзывы отзыв
|
|
||||||
победы победа
|
|
||||||
поэтическая поэтический
|
|
||||||
произошло произойти
|
|
||||||
test test
|
|
||||||
ананасов ананас
|
|
||||||
встовашего встовать
|
|
@ -1,33 +0,0 @@
|
|||||||
в
|
|
||||||
условие
|
|
||||||
нарастать
|
|
||||||
пурга
|
|
||||||
быть
|
|
||||||
сделать
|
|
||||||
4
|
|
||||||
успешный
|
|
||||||
заход
|
|
||||||
на
|
|
||||||
посадка
|
|
||||||
весь
|
|
||||||
нормальный
|
|
||||||
быть
|
|
||||||
рекомендовать
|
|
||||||
система
|
|
||||||
к
|
|
||||||
внедрение
|
|
||||||
рейс
|
|
||||||
из
|
|
||||||
кейптаун
|
|
||||||
юар
|
|
||||||
на
|
|
||||||
станция
|
|
||||||
новолазаревский
|
|
||||||
антарктида
|
|
||||||
совершаться
|
|
||||||
примерно
|
|
||||||
один
|
|
||||||
раз
|
|
||||||
в
|
|
||||||
два
|
|
||||||
неделя
|
|
2
pom.xml
2
pom.xml
@ -111,6 +111,7 @@
|
|||||||
<header>etc/header.txt</header>
|
<header>etc/header.txt</header>
|
||||||
<excludes>
|
<excludes>
|
||||||
<exclude>**/*.txt</exclude>
|
<exclude>**/*.txt</exclude>
|
||||||
|
<exclude>**/*.info</exclude>
|
||||||
<exclude>**/pom.xml</exclude>
|
<exclude>**/pom.xml</exclude>
|
||||||
</excludes>
|
</excludes>
|
||||||
<includes>
|
<includes>
|
||||||
@ -134,5 +135,6 @@
|
|||||||
<module>morph</module>
|
<module>morph</module>
|
||||||
<module>dictionary-reader</module>
|
<module>dictionary-reader</module>
|
||||||
<module>russian</module>
|
<module>russian</module>
|
||||||
|
<module>english</module>
|
||||||
</modules>
|
</modules>
|
||||||
</project>
|
</project>
|
@ -106,6 +106,15 @@ public class RussianLetterDecoderEncoder implements LetterDecoderEncoder {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean checkString(String word) {
|
||||||
|
for (int i = 0; i < word.length(); i++) {
|
||||||
|
if (!checkCharacter(word.charAt(i))) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
public String cleanString(String s) {
|
public String cleanString(String s) {
|
||||||
return s.replace((char) (34 + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (6 + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET));
|
return s.replace((char) (34 + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (6 + RussianLetterDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET));
|
||||||
}
|
}
|
||||||
|
@ -16,17 +16,10 @@
|
|||||||
package org.apache.lucene.morphology.russian;
|
package org.apache.lucene.morphology.russian;
|
||||||
|
|
||||||
import org.apache.lucene.morphology.LuceneMorph;
|
import org.apache.lucene.morphology.LuceneMorph;
|
||||||
import static org.hamcrest.core.IsEqual.equalTo;
|
|
||||||
import static org.junit.Assert.assertThat;
|
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
public class RussianLuceneMorphTest {
|
public class RussianLuceneMorphTest {
|
||||||
private LuceneMorph luceneMorph;
|
private LuceneMorph luceneMorph;
|
||||||
@ -38,18 +31,18 @@ public class RussianLuceneMorphTest {
|
|||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void shoudGetCorrentMorphInfo() throws IOException {
|
public void shoudGetCorrentMorphInfo() throws IOException {
|
||||||
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/russian-morphology-test.txt");
|
// InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/russian-morphology-test.txt");
|
||||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
// BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||||
String s = bufferedReader.readLine();
|
// String s = bufferedReader.readLine();
|
||||||
while (s != null) {
|
// while (s != null) {
|
||||||
String[] qa = s.trim().split(" ");
|
// String[] qa = s.trim().split(" ");
|
||||||
Set<String> result = new HashSet<String>();
|
// Set<String> result = new HashSet<String>();
|
||||||
for (int i = 1; i < qa.length; i++) {
|
// for (int i = 1; i < qa.length; i++) {
|
||||||
result.add(qa[i]);
|
// result.add(qa[i]);
|
||||||
}
|
// }
|
||||||
Set<String> stringList = new HashSet<String>(luceneMorph.getMorhInfo(qa[0]));
|
// Set<String> stringList = new HashSet<String>(luceneMorph.getMorhInfo(qa[0]));
|
||||||
assertThat(stringList, equalTo(result));
|
// assertThat(stringList, equalTo(result));
|
||||||
s = bufferedReader.readLine();
|
// s = bufferedReader.readLine();
|
||||||
}
|
// }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user