adding modules

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@49 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
alexander.a.kuznetsov
2009-10-02 16:25:08 +00:00
parent 786ce92ae0
commit 710384987c
36 changed files with 221 additions and 695427 deletions

22
dictionary-reader/pom.xml Normal file
View File

@ -0,0 +1,22 @@
<?xml version="1.0"?>
<!-- Maven build descriptor of the dictionary-reader module. -->
<project>
<!-- Parent project this module inherits its build settings from. -->
<parent>
<artifactId>morpholgy</artifactId>
<groupId>org.apache.lucene.morpholgy</groupId>
<version>0.7-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<!-- Coordinates of this module; group id and version repeat the parent's values. -->
<groupId>org.apache.lucene.morpholgy</groupId>
<artifactId>dictionary-reader</artifactId>
<name>dictionary-reader</name>
<version>0.7-SNAPSHOT</version>
<url>http://maven.apache.org</url>
<dependencies>
<!-- The only declared dependency: the sibling "morph" artifact at the same version. -->
<dependency>
<groupId>org.apache.lucene.morpholgy</groupId>
<artifactId>morph</artifactId>
<version>0.7-SNAPSHOT</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,125 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morpholgy.dictionary;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.*;
/**
 * Reads a morphology dictionary file and reports every word, together with all
 * of its forms, to a {@link WordProccessor}.
 * <p>
 * As parsed by this class the file is a sequence of blocks. Every block starts
 * with a line holding the number of lines that follow in that block:
 * <ol>
 * <li>flexia models - each line is a '%'-separated list of
 * <code>suffix*code</code> entries;</li>
 * <li>two blocks that are skipped;</li>
 * <li>prefix sets - each line is a comma-separated list;</li>
 * <li>words - each line is space-separated: the word base followed by the
 * index of the flexia model line it uses.</li>
 * </ol>
 */
public class DictonaryReader {
    private String fileName;
    private String fileEncoding = "windows-1251";
    /** Flexia models in file order; a word line refers to them by index. */
    private List<List<FlexiaModel>> wordsFlexias = new ArrayList<List<FlexiaModel>>();
    /** Prefix sets in file order (collected, but not used by this class). */
    private List<List<String>> wordPrefixes = new ArrayList<List<String>>();
    /** Codes of canonical forms whose words must not be reported. */
    private Set<String> ingnoredForm = new HashSet<String>();

    /**
     * Creates a reader that uses the default "windows-1251" encoding.
     *
     * @param fileName     path of the dictionary file
     * @param ingnoredForm codes of canonical forms whose words are skipped
     */
    public DictonaryReader(String fileName, Set<String> ingnoredForm) {
        this.fileName = fileName;
        this.ingnoredForm = ingnoredForm;
    }

    /**
     * Creates a reader with an explicit file encoding.
     *
     * @param fileName     path of the dictionary file
     * @param fileEncoding charset name used to decode the file
     * @param ingnoredForm codes of canonical forms whose words are skipped
     */
    public DictonaryReader(String fileName, String fileEncoding, Set<String> ingnoredForm) {
        this.fileName = fileName;
        this.fileEncoding = fileEncoding;
        this.ingnoredForm = ingnoredForm;
    }

    /**
     * Parses the whole dictionary and passes every accepted word card to the
     * given processor. The file is always closed, even when parsing fails.
     *
     * @param wordProccessor receiver of the word cards
     * @throws IOException if the file cannot be read, ends prematurely, or the
     *                     processor fails
     */
    public void proccess(WordProccessor wordProccessor) throws IOException {
        FileInputStream stream = new FileInputStream(fileName);
        try {
            BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, fileEncoding));
            readFlexias(bufferedReader);
            sckipBlock(bufferedReader);
            sckipBlock(bufferedReader);
            readPrefix(bufferedReader);
            readWords(bufferedReader, wordProccessor);
        } finally {
            // Closing the underlying stream releases the only OS resource held by the readers.
            stream.close();
        }
    }

    /** Reads the words block and reports one {@link WordCard} per accepted word. */
    private void readWords(BufferedReader reader, WordProccessor wordProccessor) throws IOException {
        int count = readCount(reader);
        for (int i = 0; i < count; i++) {
            String line = readRequiredLine(reader);
            if (i % 10000 == 0) {
                System.out.println("Proccess " + i + " wordBase of " + count);
            }
            String[] wd = line.split(" ");
            String wordBase = wd[0].toLowerCase();
            if (wordBase.startsWith("-")) {
                continue;
            }
            // "#" marks a word whose base is empty.
            if ("#".equals(wordBase)) {
                wordBase = "";
            }
            List<FlexiaModel> models = wordsFlexias.get(Integer.parseInt(wd[1]));
            // A model list is empty when all of its entries were dropped by addFlexia;
            // check this BEFORE touching the first element.
            if (models.isEmpty()) {
                continue;
            }
            // The first model of the list describes the canonical form.
            FlexiaModel flexiaModel = models.get(0);
            if (ingnoredForm.contains(flexiaModel.getCode())) {
                continue;
            }
            WordCard card = new WordCard(flexiaModel.create(wordBase), wordBase, flexiaModel.getSuffix());
            for (FlexiaModel fm : models) {
                card.addFlexia(fm);
            }
            wordProccessor.proccess(card);
        }
    }

    /** Consumes one block without interpreting its lines. */
    private void sckipBlock(BufferedReader reader) throws IOException {
        int count = readCount(reader);
        for (int i = 0; i < count; i++) {
            readRequiredLine(reader);
        }
    }

    /** Reads the prefix block; every line becomes one lower-cased list of prefixes. */
    private void readPrefix(BufferedReader reader) throws IOException {
        int count = readCount(reader);
        for (int i = 0; i < count; i++) {
            String line = readRequiredLine(reader);
            wordPrefixes.add(Arrays.asList(line.toLowerCase().split(",")));
        }
    }

    /** Reads the flexia block; every line becomes one list of models. */
    private void readFlexias(BufferedReader reader) throws IOException {
        int count = readCount(reader);
        for (int i = 0; i < count; i++) {
            String line = readRequiredLine(reader);
            List<FlexiaModel> models = new ArrayList<FlexiaModel>();
            wordsFlexias.add(models);
            for (String entry : line.split("%")) {
                addFlexia(models, entry);
            }
        }
    }

    /**
     * Parses a single '*'-separated flexia entry. Only two-part entries
     * (suffix, code) are stored; three-part entries are printed and ignored.
     */
    private void addFlexia(List<FlexiaModel> models, String line) {
        String[] fl = line.split("\\*");
        if (fl.length == 3) {
            // Three-part entries are deliberately not turned into models.
            System.out.println(line);
        }
        if (fl.length == 2) {
            models.add(new FlexiaModel(fl[1], fl[0].toLowerCase(), ""));
        }
    }

    /** Reads the line that holds the size of the next block. */
    private int readCount(BufferedReader reader) throws IOException {
        return Integer.parseInt(readRequiredLine(reader));
    }

    /** Reads one line and reports a truncated file as an IOException instead of a later NPE. */
    private String readRequiredLine(BufferedReader reader) throws IOException {
        String line = reader.readLine();
        if (line == null) {
            throw new IOException("Unexpected end of dictionary file " + fileName);
        }
        return line;
    }
}

View File

@ -0,0 +1,65 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morpholgy.dictionary;
/**
 * Describes how one word form is produced from the immutable part (base) of a
 * word: a prefix is put in front of the base and a suffix behind it. The code
 * identifies the form.
 */
public class FlexiaModel {
    private String code;
    private String suffix;
    private String prefix;

    /**
     * @param code   identifier of the form
     * @param suffix text appended after the base
     * @param prefix text placed before the base
     */
    public FlexiaModel(String code, String suffix, String prefix) {
        this.prefix = prefix;
        this.suffix = suffix;
        this.code = code;
    }

    /**
     * Builds the word form for the given base.
     *
     * @param s the base of the word
     * @return prefix, base and suffix joined in that order
     */
    public String create(String s) {
        StringBuilder form = new StringBuilder();
        form.append(prefix).append(s).append(suffix);
        return form.toString();
    }

    /** @return the prefix and the suffix separated by a single space */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append(prefix).append(" ").append(suffix);
        return text.toString();
    }

    public String getCode() {
        return this.code;
    }

    public String getSuffix() {
        return this.suffix;
    }

    public String getPrefix() {
        return this.prefix;
    }

    public void setCode(String code) {
        this.code = code;
    }

    public void setSuffix(String suffix) {
        this.suffix = suffix;
    }

    public void setPrefix(String prefix) {
        this.prefix = prefix;
    }
}

View File

@ -0,0 +1,56 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morpholgy.dictionary;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
/**
 * Reads a word frequency file into a map.
 * <p>
 * Every line is split on single spaces; the second token is parsed as the
 * numeric value and the third token is used as the key. The first token is
 * not used.
 */
public class FrequentyReader {
    private String fileName;
    private String fileEncoding = "windows-1251";

    /**
     * Creates a reader that uses the default "windows-1251" encoding.
     *
     * @param fileName path of the frequency file
     */
    public FrequentyReader(String fileName) {
        this.fileName = fileName;
    }

    /**
     * @param fileName     path of the frequency file
     * @param fileEncoding charset name used to decode the file
     */
    public FrequentyReader(String fileName, String fileEncoding) {
        this.fileName = fileName;
        this.fileEncoding = fileEncoding;
    }

    /**
     * Reads the whole file. The file is always closed, even when a line
     * cannot be parsed.
     *
     * @return map from the third token of each line to the value parsed from
     *         its second token; a later line overwrites an earlier one with
     *         the same key
     * @throws IOException if the file cannot be opened or read
     */
    public Map<String, Double> read() throws IOException {
        Map<String, Double> result = new HashMap<String, Double>();
        FileInputStream stream = new FileInputStream(fileName);
        try {
            BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, fileEncoding));
            for (String line = bufferedReader.readLine(); line != null; line = bufferedReader.readLine()) {
                String[] tokens = line.split(" ");
                result.put(tokens[2], Double.valueOf(tokens[1]));
            }
        } finally {
            // Closing the underlying stream releases the only OS resource held by the readers.
            stream.close();
        }
        return result;
    }
}

View File

@ -0,0 +1,76 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morpholgy.dictionary;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
// TODO: split this class in two.
/**
 * Loads a grammar description file at construction time.
 * <p>
 * Lines that are empty after trimming or start with "//" are skipped. Every
 * other line is split at its first space: the text before it is the code, the
 * text after it is the grammar info. Infos are numbered in file order and the
 * inverse index maps each code to the number of its info.
 */
public class GrammaReader {
    private String fileName;
    private String fileEncoding = "windows-1251";
    /** Grammar infos in file order. */
    private List<String> grammaInfo = new ArrayList<String>();
    /** Maps a code to the position of its info in {@link #grammaInfo}. */
    private Map<String, Integer> inversIndex = new HashMap<String, Integer>();

    /**
     * Loads the file using the default "windows-1251" encoding.
     *
     * @param fileName path of the grammar file
     * @throws IOException if the file cannot be opened or read
     */
    public GrammaReader(String fileName) throws IOException {
        this.fileName = fileName;
        setUp();
    }

    /**
     * @param fileName     path of the grammar file
     * @param fileEncoding charset name used to decode the file
     * @throws IOException if the file cannot be opened or read
     */
    public GrammaReader(String fileName, String fileEncoding) throws IOException {
        this.fileName = fileName;
        this.fileEncoding = fileEncoding;
        setUp();
    }

    /** Parses the file into the info list and the inverse index; always closes the file. */
    private void setUp() throws IOException {
        FileInputStream stream = new FileInputStream(fileName);
        try {
            BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, fileEncoding));
            for (String raw = bufferedReader.readLine(); raw != null; raw = bufferedReader.readLine()) {
                String line = raw.trim();
                if (line.length() == 0 || line.startsWith("//")) {
                    continue;
                }
                String[] parts = line.split(" ", 2);
                // The next free position becomes the number of this code.
                inversIndex.put(parts[0], grammaInfo.size());
                grammaInfo.add(parts[1]);
            }
        } finally {
            // Closing the underlying stream releases the only OS resource held by the readers.
            stream.close();
        }
    }

    /** @return the live list of grammar infos in file order */
    public List<String> getGrammaInfo() {
        return grammaInfo;
    }

    /** @return a fresh array holding the grammar infos in file order */
    public String[] getGrammaInfoAsArray() {
        return grammaInfo.toArray(new String[grammaInfo.size()]);
    }

    /** @return the live map from code to info position */
    public Map<String, Integer> getGrammInversIndex() {
        return inversIndex;
    }

    /** Replaces the map from code to info position. */
    public void setInversIndex(Map<String, Integer> inversIndex) {
        this.inversIndex = inversIndex;
    }
}

View File

@ -0,0 +1,54 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morpholgy.dictionary;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Set;
/**
 * Reads the list of form codes that have to be ignored.
 * <p>
 * Each line is trimmed first; blank lines and lines starting with "//" are
 * skipped. From every remaining line the text before the first space is taken
 * as the code.
 */
public class IgnoredFormReader {
    private String fileName;
    private String fileEncoding = "windows-1251";

    /**
     * Creates a reader that uses the default "windows-1251" encoding.
     *
     * @param fileName path of the file listing the ignored forms
     */
    public IgnoredFormReader(String fileName) {
        this.fileName = fileName;
    }

    /**
     * @param fileName     path of the file listing the ignored forms
     * @param fileEncoding charset name used to decode the file
     */
    public IgnoredFormReader(String fileName, String fileEncoding) {
        this.fileName = fileName;
        this.fileEncoding = fileEncoding;
    }

    /**
     * Reads the whole file. The file is always closed, even when reading fails.
     *
     * @return the set of ignored form codes; never contains the empty string
     *         or a comment marker
     * @throws IOException if the file cannot be opened or read
     */
    public Set<String> getIngnoredFroms() throws IOException {
        Set<String> result = new HashSet<String>();
        FileInputStream stream = new FileInputStream(fileName);
        try {
            BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, fileEncoding));
            for (String raw = bufferedReader.readLine(); raw != null; raw = bufferedReader.readLine()) {
                // Trim before testing so that indented comments and blank lines
                // do not end up in the result as "//" or "".
                String line = raw.trim();
                if (line.length() == 0 || line.startsWith("//")) {
                    continue;
                }
                result.add(line.split(" ")[0]);
            }
        } finally {
            // Closing the underlying stream releases the only OS resource held by the readers.
            stream.close();
        }
        return result;
    }
}

View File

@ -0,0 +1,147 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morpholgy.dictionary;
import org.apache.lucene.morphology.Heuristic;
import org.apache.lucene.morphology.LetterDecoderEncoder;
import org.apache.lucene.morphology.Morph;
import java.io.IOException;
import java.util.*;
/**
 * Collects, for every word form it is given, the heuristic that leads from the
 * form back to the normal form of the word, and finally writes the collected
 * data out as a {@link Morph}.
 * <p>
 * Forms are stored reversed (last character first) as keys of a sorted map.
 * NOTE(review): presumably this keeps forms with a common ending next to each
 * other so that equal rule sets collapse in {@link #saveHeuristic()} - confirm.
 */
public class StatiticsCollector implements WordProccessor {
    /** Reversed word form mapped to the heuristics collected for it, sorted by key. */
    private TreeMap<String, Set<Heuristic>> inversIndex = new TreeMap<String, Set<Heuristic>>();
    /** Distinct heuristic set mapped to its position in {@link #rules}. */
    private Map<Set<Heuristic>, Integer> ruleInverIndex = new HashMap<Set<Heuristic>, Integer>();
    /** Distinct heuristic sets in order of first appearance. */
    private List<Set<Heuristic>> rules = new ArrayList<Set<Heuristic>>();
    private GrammaReader grammaReader;
    private LetterDecoderEncoder decoderEncoder;

    /**
     * Creates a collector without a letter encoder. Such a collector can
     * gather words, but {@link #saveHeuristic()} needs an encoder as soon as
     * at least one form has been collected.
     *
     * @param grammaReader source of the grammar codes and infos
     */
    public StatiticsCollector(GrammaReader grammaReader) {
        this.grammaReader = grammaReader;
    }

    /**
     * Creates a fully configured collector.
     *
     * @param grammaReader   source of the grammar codes and infos
     * @param decoderEncoder encoder used to turn the collected forms into int arrays
     */
    public StatiticsCollector(GrammaReader grammaReader, LetterDecoderEncoder decoderEncoder) {
        this.grammaReader = grammaReader;
        this.decoderEncoder = decoderEncoder;
    }

    /**
     * Registers the heuristics of all forms of the given word. Words without
     * forms and words whose normal form contains '-' are ignored.
     *
     * @param wordCard the word and its forms
     * @throws IOException declared by the interface; not thrown by this implementation itself
     */
    public void proccess(WordCard wordCard) throws IOException {
        wordCard = cleanWordCard(wordCard);
        List<FlexiaModel> forms = wordCard.getWordsFroms();
        if (forms.isEmpty()) {
            // Nothing to learn from; also avoids get(0) on an empty list.
            return;
        }
        // The first form is the normal (canonical) one.
        String normalStringMorph = forms.get(0).getCode();
        String word = wordCard.getBase() + wordCard.getCanonicalSuffix();
        if (word.contains("-")) {
            return;
        }
        for (FlexiaModel fm : forms) {
            Heuristic heuristic = createEvristic(wordCard.getBase(), wordCard.getCanonicalSuffix(), fm, normalStringMorph);
            String form = revertWord(fm.create(wordCard.getBase()));
            Set<Heuristic> suffixHeuristics = inversIndex.get(form);
            if (suffixHeuristics == null) {
                suffixHeuristics = new HashSet<Heuristic>();
                inversIndex.put(form, suffixHeuristics);
            }
            suffixHeuristics.add(heuristic);
        }
    }

    /** Placeholder for word card normalisation; currently returns its argument unchanged. */
    private WordCard cleanWordCard(WordCard wordCard) {
        return wordCard;
    }

    /**
     * Compacts the collected data and writes it to the file "sep.txt".
     * <p>
     * Walking the forms in sorted order, a form is kept only when its
     * heuristic set differs from the set of the previous form. Distinct sets
     * are numbered and stored once.
     *
     * @throws IllegalStateException if forms were collected but no letter encoder was supplied
     * @throws IOException           if writing the result fails
     */
    public void saveHeuristic() throws IOException {
        if (decoderEncoder == null && !inversIndex.isEmpty()) {
            throw new IllegalStateException(
                    "A LetterDecoderEncoder is required to save heuristics; use the two-argument constructor");
        }

        // Number of kept forms per key length, printed as statistics only.
        Map<Integer, Integer> dist = new TreeMap<Integer, Integer>();
        Set<Heuristic> prevSet = null;
        int count = 0;
        for (Map.Entry<String, Set<Heuristic>> entry : inversIndex.entrySet()) {
            Set<Heuristic> currentSet = entry.getValue();
            if (currentSet.equals(prevSet)) {
                continue;
            }
            int keyLength = entry.getKey().length();
            Integer d = dist.get(keyLength);
            dist.put(keyLength, 1 + (d == null ? 0 : d));
            prevSet = currentSet;
            count++;
            if (!ruleInverIndex.containsKey(currentSet)) {
                ruleInverIndex.put(currentSet, rules.size());
                rules.add(currentSet);
            }
        }
        System.out.println("Word with diffirent rules " + count);
        System.out.println("All ivers words " + inversIndex.size());
        System.out.println(dist);
        System.out.println("diffirent rule count " + ruleInverIndex.size());

        Heuristic[][] heuristics = new Heuristic[rules.size()][];
        for (int i = 0; i < rules.size(); i++) {
            Set<Heuristic> rule = rules.get(i);
            heuristics[i] = rule.toArray(new Heuristic[rule.size()]);
        }

        // Second pass over the same sorted order: encode every kept form and
        // remember the number of its rule set.
        int[][] ints = new int[count][];
        short[] rulesId = new short[count];
        int position = 0;
        prevSet = null;
        for (Map.Entry<String, Set<Heuristic>> entry : inversIndex.entrySet()) {
            Set<Heuristic> currentSet = entry.getValue();
            if (currentSet.equals(prevSet)) {
                continue;
            }
            ints[position] = decoderEncoder.encodeToArray(entry.getKey());
            rulesId[position] = (short) ruleInverIndex.get(currentSet).intValue();
            position++;
            prevSet = currentSet;
        }
        Morph morph = new Morph(ints, rulesId, heuristics, grammaReader.getGrammaInfoAsArray());
        morph.writeToFile("sep.txt");
    }

    /** Returns the characters of the given string in reverse order. */
    private String revertWord(String s) {
        StringBuilder reversed = new StringBuilder(s.length());
        for (int i = s.length() - 1; i >= 0; i--) {
            reversed.append(s.charAt(i));
        }
        return reversed.toString();
    }

    /**
     * Builds the heuristic that turns one form into the normal form: how many
     * trailing characters of the form differ from the normal form, the ending
     * of the normal form that replaces them, and the grammar numbers looked up
     * by the first two characters of the form code and of the normal form code.
     */
    private Heuristic createEvristic(String wordBase, String canonicalSuffix, FlexiaModel fm, String normalSuffixForm) {
        String form = fm.create(wordBase);
        String normalForm = wordBase + canonicalSuffix;
        int common = getCommonLength(form, normalForm);
        int actualSuffixLengh = form.length() - common;
        String actualNormalSuffix = normalForm.substring(common);
        Map<String, Integer> grammaIndex = grammaReader.getGrammInversIndex();
        Integer formGramma = grammaIndex.get(fm.getCode().substring(0, 2));
        Integer normalGramma = grammaIndex.get(normalSuffixForm.substring(0, 2));
        return new Heuristic((byte) actualSuffixLengh, actualNormalSuffix, (short) formGramma.intValue(), (short) normalGramma.intValue());
    }

    /**
     * Computes the length of the longest common prefix of two strings.
     *
     * @param s1 first string
     * @param s2 second string
     * @return number of leading characters that are equal in both strings
     */
    public static Integer getCommonLength(String s1, String s2) {
        int limit = Math.min(s1.length(), s2.length());
        int i = 0;
        while (i < limit && s1.charAt(i) == s2.charAt(i)) {
            i++;
        }
        return i;
    }

    /** Delegates string clean-up to the letter encoder. */
    private String cleanString(String s) {
        return decoderEncoder.cleanString(s);
    }
}

View File

@ -0,0 +1,72 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morpholgy.dictionary;
import java.util.ArrayList;
import java.util.List;
/**
 * Holds one word: its canonical form, its base, the suffix of the canonical
 * form, and the models of all of its forms.
 */
public class WordCard {
    private String canonicalFrom;
    private String base;
    private String canonicalSuffix;
    /** Models of the word forms, in the order they were added. */
    private List<FlexiaModel> wordsFroms = new ArrayList<FlexiaModel>();

    /**
     * @param canonicalFrom   the canonical form of the word
     * @param base            the base of the word
     * @param canonicalSuffix the suffix of the canonical form
     */
    public WordCard(String canonicalFrom, String base, String canonicalSuffix) {
        this.base = base;
        this.canonicalSuffix = canonicalSuffix;
        this.canonicalFrom = canonicalFrom;
    }

    /** Appends one more form model to this card. */
    public void addFlexia(FlexiaModel flexiaModel) {
        this.wordsFroms.add(flexiaModel);
    }

    public String getCanonicalFrom() {
        return this.canonicalFrom;
    }

    public void setCanonicalFrom(String canonicalFrom) {
        this.canonicalFrom = canonicalFrom;
    }

    public String getBase() {
        return this.base;
    }

    public void setBase(String base) {
        this.base = base;
    }

    public String getCanonicalSuffix() {
        return this.canonicalSuffix;
    }

    public void setCanonicalSuffix(String canonicalSuffix) {
        this.canonicalSuffix = canonicalSuffix;
    }

    /** @return the live list of form models */
    public List<FlexiaModel> getWordsFroms() {
        return this.wordsFroms;
    }

    /** Replaces the list of form models. */
    public void setWordsFroms(List<FlexiaModel> wordsFroms) {
        this.wordsFroms = wordsFroms;
    }
}

View File

@ -0,0 +1,28 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morpholgy.dictionary;
import java.io.IOException;
/**
* Callback through which {@link DictonaryReader} hands over
* every word card it has read from the dictionary.
*/
public interface WordProccessor {
/**
* Handles a single word together with its forms.
*
* @param wordCard the word card to handle
* @throws IOException if the implementation fails to handle the card
*/
public void proccess(WordCard wordCard) throws IOException;
}