adding modules
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@49 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
@ -0,0 +1,39 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.morphology.russian;
|
||||
|
||||
import org.apache.lucene.morpholgy.dictionary.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Set;
|
||||
|
||||
|
||||
public class HeuristicBuilder {
|
||||
public static void main(String[] args) throws IOException {
|
||||
IgnoredFormReader formReader = new IgnoredFormReader("data/igoredFrom.txt");
|
||||
Set<String> form = formReader.getIngnoredFroms();
|
||||
|
||||
FrequentyReader frequentyReader = new FrequentyReader("data/lemma.num");
|
||||
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab");
|
||||
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form);
|
||||
|
||||
StatiticsCollector statiticsCollector = new StatiticsCollector(grammaInfo);
|
||||
dictonaryReader.proccess(statiticsCollector);
|
||||
statiticsCollector.saveHeuristic();
|
||||
|
||||
}
|
||||
}
|
@ -0,0 +1,112 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.morphology.russian;
|
||||
|
||||
import org.apache.lucene.morphology.LetterDecoderEncoder;
|
||||
import org.apache.lucene.morphology.SuffixToLongException;
|
||||
import org.apache.lucene.morphology.WrongCharaterException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* This helper class allow encode suffix of russian word
|
||||
* to long value and decode from it.
|
||||
* Assumed that suffix contains only small russian letters and dash.
|
||||
* Also assumed that letter <20> and <20> coinsed.
|
||||
*/
|
||||
public class RussianSuffixDecoderEncoder implements LetterDecoderEncoder {
|
||||
public static final int RUSSIAN_SMALL_LETTER_OFFSET = 1071;
|
||||
static public int SUFFIX_LENGTH = 6;
|
||||
public static final int EE_CHAR = 34;
|
||||
public static final int E_CHAR = 6;
|
||||
public static final int DASH_CHAR = 45;
|
||||
public static final int DASH_CODE = 33;
|
||||
|
||||
public Integer encode(String string) {
|
||||
if (string.length() > 6) throw new SuffixToLongException("Suffix length should not be greater then " + 12);
|
||||
int result = 0;
|
||||
for (int i = 0; i < string.length(); i++) {
|
||||
int c = 0 + string.charAt(i) - RUSSIAN_SMALL_LETTER_OFFSET;
|
||||
if (c == 45 - RUSSIAN_SMALL_LETTER_OFFSET) {
|
||||
c = DASH_CODE;
|
||||
}
|
||||
if (c == EE_CHAR) c = E_CHAR;
|
||||
if (c < 0 || c > 33)
|
||||
throw new WrongCharaterException("Symblo " + string.charAt(i) + " is not small cirillic letter");
|
||||
result = result * 34 + c;
|
||||
}
|
||||
for (int i = string.length(); i < 6; i++) {
|
||||
result *= 34;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public int[] encodeToArray(String s) {
|
||||
ArrayList<Integer> integers = new ArrayList<Integer>();
|
||||
while (s.length() > 6) {
|
||||
integers.add(encode(s.substring(0, 6)));
|
||||
s = s.substring(6);
|
||||
}
|
||||
integers.add(encode(s));
|
||||
int[] ints = new int[integers.size()];
|
||||
int pos = 0;
|
||||
for (Integer i : integers) {
|
||||
ints[pos] = i;
|
||||
pos++;
|
||||
}
|
||||
return ints;
|
||||
}
|
||||
|
||||
public String decodeArray(int[] array) {
|
||||
String result = "";
|
||||
for (int i : array) {
|
||||
result += decode(i);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
public String decode(Integer suffixN) {
|
||||
String result = "";
|
||||
while (suffixN > 33) {
|
||||
int c = suffixN % 34 + RUSSIAN_SMALL_LETTER_OFFSET;
|
||||
if (c == RUSSIAN_SMALL_LETTER_OFFSET) {
|
||||
suffixN /= 34;
|
||||
continue;
|
||||
}
|
||||
if (c == DASH_CODE + RUSSIAN_SMALL_LETTER_OFFSET) c = DASH_CHAR;
|
||||
result = (char) c + result;
|
||||
suffixN /= 34;
|
||||
}
|
||||
long c = suffixN + RUSSIAN_SMALL_LETTER_OFFSET;
|
||||
if (c == DASH_CODE + RUSSIAN_SMALL_LETTER_OFFSET) c = DASH_CHAR;
|
||||
result = (char) c + result;
|
||||
return result;
|
||||
}
|
||||
|
||||
public boolean checkCharacter(char c) {
|
||||
int code = 0 + c;
|
||||
if (code == 45) return true;
|
||||
code -= RUSSIAN_SMALL_LETTER_OFFSET;
|
||||
if (code > 0 && code < 33) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
public String cleanString(String s) {
|
||||
return s;
|
||||
}
|
||||
}
|
@ -0,0 +1,63 @@
|
||||
/**
|
||||
* Copyright 2009 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morphology.russian;
|
||||
|
||||
|
||||
import org.apache.lucene.morphology.Heuristic;
|
||||
import org.apache.lucene.morphology.Morph;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: akuznetsov
|
||||
* Date: 15.08.2009
|
||||
* Time: 16:52:24
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class Test {
|
||||
|
||||
public static void main(String[] args) throws IOException, ClassNotFoundException {
|
||||
//
|
||||
Morph splitter = new Morph("sep.txt");
|
||||
TreeSet<Short> shorts = new TreeSet<Short>();
|
||||
int count = 0;
|
||||
TreeMap<Integer, Integer> rulesStat = new TreeMap<Integer, Integer>();
|
||||
for (Heuristic[] heuristics : splitter.getRules()) {
|
||||
Integer d = rulesStat.get(heuristics.length);
|
||||
rulesStat.put(heuristics.length, 1 + (d == null ? 0 : d));
|
||||
boolean flag = true;
|
||||
short actualSuffixLenght = heuristics[0].getActualSuffixLengh();
|
||||
String normalSuffix = heuristics[0].getActualNormalSuffix();
|
||||
for (Heuristic heuristic : heuristics) {
|
||||
flag = flag && (heuristic.getActualSuffixLengh() == actualSuffixLenght)
|
||||
&& normalSuffix.equals(heuristic.getActualNormalSuffix());
|
||||
}
|
||||
if (!flag) {
|
||||
System.out.println(Arrays.asList(heuristics));
|
||||
count++;
|
||||
}
|
||||
}
|
||||
System.out.println(count);
|
||||
System.out.println(rulesStat);
|
||||
System.gc();
|
||||
System.out.println("Ready");
|
||||
System.in.read();
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user