adding english version
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@57 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
29
english/pom.xml
Normal file
29
english/pom.xml
Normal file
@ -0,0 +1,29 @@
|
||||
<?xml version="1.0"?>
<!-- Maven build descriptor for the "english" module. -->
<project>
    <modelVersion>4.0.0</modelVersion>

    <!-- Parent POM this module inherits from. -->
    <parent>
        <groupId>org.apache.lucene.morpholgy</groupId>
        <artifactId>morpholgy</artifactId>
        <version>0.7-SNAPSHOT</version>
    </parent>

    <!-- Coordinates of this module. -->
    <groupId>org.apache.lucene.morpholgy</groupId>
    <artifactId>english</artifactId>
    <version>0.7-SNAPSHOT</version>
    <name>english</name>
    <url>http://maven.apache.org</url>

    <dependencies>
        <!-- The "morph" module, at the same version as this module. -->
        <dependency>
            <groupId>org.apache.lucene.morpholgy</groupId>
            <artifactId>morph</artifactId>
            <version>0.7-SNAPSHOT</version>
        </dependency>

        <!-- JUnit is on the class path for tests only. -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.4</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
</project>
|
@ -0,0 +1,116 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morpholgy.english;
|
||||
|
||||
import org.apache.lucene.morphology.LetterDecoderEncoder;
|
||||
import org.apache.lucene.morphology.SuffixToLongException;
|
||||
import org.apache.lucene.morphology.WrongCharaterException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
|
||||
// TODO: extract a superclass for the methods shared with the Russian letter decoder
|
||||
public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder {
|
||||
public static final int ENGLISH_SMALL_LETTER_OFFSET = 96;
|
||||
static public int SUFFIX_LENGTH = 6;
|
||||
public static final int DASH_CHAR = 45;
|
||||
public static final int DASH_CODE = 27;
|
||||
|
||||
public Integer encode(String string) {
|
||||
if (string.length() > 6) throw new SuffixToLongException("Suffix length should not be greater then " + 12);
|
||||
int result = 0;
|
||||
for (int i = 0; i < string.length(); i++) {
|
||||
int c = 0 + string.charAt(i) - ENGLISH_SMALL_LETTER_OFFSET;
|
||||
if (c == 45 - ENGLISH_SMALL_LETTER_OFFSET) {
|
||||
c = DASH_CODE;
|
||||
}
|
||||
if (c < 0 || c > 27)
|
||||
throw new WrongCharaterException("Symblo " + string.charAt(i) + " is not small cirillic letter");
|
||||
result = result * 28 + c;
|
||||
}
|
||||
for (int i = string.length(); i < 6; i++) {
|
||||
result *= 28;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public int[] encodeToArray(String s) {
|
||||
|
||||
ArrayList<Integer> integers = new ArrayList<Integer>();
|
||||
while (s.length() > 6) {
|
||||
integers.add(encode(s.substring(0, 6)));
|
||||
s = s.substring(6);
|
||||
}
|
||||
integers.add(encode(s));
|
||||
int[] ints = new int[integers.size()];
|
||||
int pos = 0;
|
||||
for (Integer i : integers) {
|
||||
ints[pos] = i;
|
||||
pos++;
|
||||
}
|
||||
return ints;
|
||||
}
|
||||
|
||||
public String decodeArray(int[] array) {
|
||||
String result = "";
|
||||
for (int i : array) {
|
||||
result += decode(i);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
public String decode(Integer suffixN) {
|
||||
String result = "";
|
||||
while (suffixN > 27) {
|
||||
int c = suffixN % 28 + ENGLISH_SMALL_LETTER_OFFSET;
|
||||
if (c == ENGLISH_SMALL_LETTER_OFFSET) {
|
||||
suffixN /= 28;
|
||||
continue;
|
||||
}
|
||||
if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR;
|
||||
result = (char) c + result;
|
||||
suffixN /= 28;
|
||||
}
|
||||
long c = suffixN + ENGLISH_SMALL_LETTER_OFFSET;
|
||||
if (c == DASH_CODE + ENGLISH_SMALL_LETTER_OFFSET) c = DASH_CHAR;
|
||||
result = (char) c + result;
|
||||
return result;
|
||||
}
|
||||
|
||||
public boolean checkCharacter(char c) {
|
||||
int code = 0 + c;
|
||||
if (code == 45) return true;
|
||||
code -= ENGLISH_SMALL_LETTER_OFFSET;
|
||||
if (code > 0 && code < 27) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
public boolean checkString(String word) {
|
||||
for (int i = 0; i < word.length(); i++) {
|
||||
if (!checkCharacter(word.charAt(i))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public String cleanString(String s) {
|
||||
return s;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,40 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morpholgy.english;
|
||||
|
||||
import static org.hamcrest.core.IsEqual.equalTo;
|
||||
import static org.junit.Assert.assertThat;
|
||||
import org.junit.Before;
|
||||
|
||||
|
||||
public class EnglishLetterDecoderEncoderTest {
|
||||
private EnglishLetterDecoderEncoder decoderEncoder;
|
||||
|
||||
@Before
|
||||
public void setUp() {
|
||||
decoderEncoder = new EnglishLetterDecoderEncoder();
|
||||
}
|
||||
|
||||
@org.junit.Test
|
||||
public void testDecodeEncodeToArray() {
|
||||
assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("abcdefghijklmnopqrstuvwxyz")), equalTo("abcdefghijklmnopqrstuvwxyz"));
|
||||
assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("xyz")), equalTo("xyz"));
|
||||
assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrty")), equalTo("ytrrty"));
|
||||
assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrtyz")), equalTo("ytrrtyz"));
|
||||
assertThat(decoderEncoder.decodeArray(decoderEncoder.encodeToArray("ytrrtyzqwqwe")), equalTo("ytrrtyzqwqwe"));
|
||||
|
||||
}
|
||||
}
|
@ -0,0 +1,38 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morpholgy.english;
|
||||
|
||||
import org.apache.lucene.morphology.LuceneMorph;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class RussianLuceneMorphTest {
|
||||
private LuceneMorph luceneMorph;
|
||||
|
||||
@Before
|
||||
public void setUp() throws IOException {
|
||||
luceneMorph = new LuceneMorph(this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shoudGetCorrentMorphInfo() throws IOException {
|
||||
System.out.println(luceneMorph.getMorhInfo("purchases"));
|
||||
System.out.println(luceneMorph.getMorhInfo("existing"));
|
||||
System.out.println(luceneMorph.getMorhInfo("was"));
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user