Some spell-checking fixes
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@24 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
parent
65670d4c9b
commit
c68fbb0827
@ -18,9 +18,9 @@ package org.apache.lucene.russian.morphology;
|
|||||||
|
|
||||||
import org.apache.lucene.russian.morphology.dictonary.DictonaryReader;
|
import org.apache.lucene.russian.morphology.dictonary.DictonaryReader;
|
||||||
import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
|
import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader;
|
||||||
import org.apache.lucene.russian.morphology.evristics.Evristic;
|
import org.apache.lucene.russian.morphology.heuristic.Heuristic;
|
||||||
import org.apache.lucene.russian.morphology.evristics.StatiticsCollectors;
|
import org.apache.lucene.russian.morphology.heuristic.StatiticsCollectors;
|
||||||
import org.apache.lucene.russian.morphology.evristics.SuffixCounter;
|
import org.apache.lucene.russian.morphology.heuristic.SuffixCounter;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
@ -28,7 +28,7 @@ import java.util.Collection;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
|
|
||||||
public class EvristicBuilder {
|
public class HeuristicBuilder {
|
||||||
public static void main(String[] args) throws IOException {
|
public static void main(String[] args) throws IOException {
|
||||||
IgnoredFormReader formReader = new IgnoredFormReader("data/igoredFrom.txt");
|
IgnoredFormReader formReader = new IgnoredFormReader("data/igoredFrom.txt");
|
||||||
Set<String> form = formReader.getIngnoredFroms();
|
Set<String> form = formReader.getIngnoredFroms();
|
||||||
@ -44,11 +44,11 @@ public class EvristicBuilder {
|
|||||||
System.out.println(objects[i]);
|
System.out.println(objects[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
final Evristic evristic = new Evristic();
|
final Heuristic heuristic = new Heuristic();
|
||||||
for (int i = 0; i < objects.length; i++) {
|
for (int i = 0; i < objects.length; i++) {
|
||||||
evristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic());
|
heuristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic());
|
||||||
}
|
}
|
||||||
|
|
||||||
evristic.writeToFile("src/main/resources/org/apache/lucene/russian/morpholgy/russianSuffixesEvristics.txt");
|
heuristic.writeToFile("src/main/resources/org/apache/lucene/russian/morpholgy/russianSuffixesEvristics.txt");
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -26,16 +26,16 @@ import java.io.IOException;
|
|||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
|
||||||
public class RussianMorphlogyAnalayzer extends Analyzer {
|
public class RussianMorphlogyAnalayzer extends Analyzer {
|
||||||
private SuffixEvristics suffixEvristics;
|
private SuffixHeuristic suffixHeuristic;
|
||||||
|
|
||||||
public RussianMorphlogyAnalayzer() throws IOException {
|
public RussianMorphlogyAnalayzer() throws IOException {
|
||||||
suffixEvristics = new SuffixEvristics();
|
suffixHeuristic = new SuffixHeuristic();
|
||||||
}
|
}
|
||||||
|
|
||||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||||
TokenStream result = new StandardTokenizer(reader);
|
TokenStream result = new StandardTokenizer(reader);
|
||||||
result = new StandardFilter(result);
|
result = new StandardFilter(result);
|
||||||
result = new LowerCaseFilter(result);
|
result = new LowerCaseFilter(result);
|
||||||
return new RussianMorphlogyFilter(result, suffixEvristics);
|
return new RussianMorphlogyFilter(result, suffixHeuristic);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -24,11 +24,11 @@ import java.io.IOException;
|
|||||||
|
|
||||||
|
|
||||||
public class RussianMorphlogyFilter extends TokenFilter {
|
public class RussianMorphlogyFilter extends TokenFilter {
|
||||||
private SuffixEvristics suffixEvristics;
|
private SuffixHeuristic suffixHeuristic;
|
||||||
|
|
||||||
public RussianMorphlogyFilter(TokenStream tokenStream, SuffixEvristics suffixEvristics) {
|
public RussianMorphlogyFilter(TokenStream tokenStream, SuffixHeuristic suffixHeuristic) {
|
||||||
super(tokenStream);
|
super(tokenStream);
|
||||||
this.suffixEvristics = suffixEvristics;
|
this.suffixHeuristic = suffixHeuristic;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Token next(final Token reusableToken) throws IOException {
|
public Token next(final Token reusableToken) throws IOException {
|
||||||
@ -40,7 +40,7 @@ public class RussianMorphlogyFilter extends TokenFilter {
|
|||||||
return nextToken;
|
return nextToken;
|
||||||
}
|
}
|
||||||
Token current = (Token) nextToken.clone();
|
Token current = (Token) nextToken.clone();
|
||||||
return createToken(suffixEvristics.getCanonicalForm(word), current, reusableToken);
|
return createToken(suffixHeuristic.getCanonicalForm(word), current, reusableToken);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Token createToken(String synonym, Token current, final Token reusableToken) {
|
protected Token createToken(String synonym, Token current, final Token reusableToken) {
|
||||||
|
@ -22,7 +22,7 @@ import java.io.*;
|
|||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
|
||||||
public class SuffixEvristics {
|
public class SuffixHeuristic {
|
||||||
private long[] keys;
|
private long[] keys;
|
||||||
private long[] values;
|
private long[] values;
|
||||||
|
|
||||||
@ -32,11 +32,11 @@ public class SuffixEvristics {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public SuffixEvristics() throws IOException {
|
public SuffixHeuristic() throws IOException {
|
||||||
readFromResource();
|
readFromResource();
|
||||||
}
|
}
|
||||||
|
|
||||||
public SuffixEvristics(String fileName) throws IOException {
|
public SuffixHeuristic(String fileName) throws IOException {
|
||||||
readFromFile(fileName);
|
readFromFile(fileName);
|
||||||
}
|
}
|
||||||
|
|
@ -14,7 +14,7 @@
|
|||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.russian.morphology.evristics;
|
package org.apache.lucene.russian.morphology.heuristic;
|
||||||
|
|
||||||
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
||||||
|
|
||||||
@ -25,14 +25,14 @@ import java.io.IOException;
|
|||||||
import java.util.TreeMap;
|
import java.util.TreeMap;
|
||||||
|
|
||||||
|
|
||||||
public class Evristic {
|
public class Heuristic {
|
||||||
private TreeMap<Long, Long> encodedSuffixesPairs = new TreeMap<Long, Long>();
|
private TreeMap<Long, Long> encodedSuffixesPairs = new TreeMap<Long, Long>();
|
||||||
|
|
||||||
public void addEvristic(SuffixEvristic suffixEvristic) {
|
public void addEvristic(SuffixHeuristic suffixHeuristic) {
|
||||||
Long suffix = RussianSuffixDecoderEncoder.encode(suffixEvristic.getFormSuffix());
|
Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix());
|
||||||
Long longs = encodedSuffixesPairs.get(suffix);
|
Long longs = encodedSuffixesPairs.get(suffix);
|
||||||
if (longs == null) {
|
if (longs == null) {
|
||||||
encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encode(suffixEvristic.getNormalSuffix()));
|
encodedSuffixesPairs.put(suffix, RussianSuffixDecoderEncoder.encode(suffixHeuristic.getNormalSuffix()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -14,7 +14,7 @@
|
|||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.russian.morphology.evristics;
|
package org.apache.lucene.russian.morphology.heuristic;
|
||||||
|
|
||||||
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
import org.apache.lucene.russian.morphology.RussianSuffixDecoderEncoder;
|
||||||
import org.apache.lucene.russian.morphology.dictonary.WordCard;
|
import org.apache.lucene.russian.morphology.dictonary.WordCard;
|
||||||
@ -25,28 +25,28 @@ import java.util.Map;
|
|||||||
|
|
||||||
|
|
||||||
public class StatiticsCollectors implements WordProccessor {
|
public class StatiticsCollectors implements WordProccessor {
|
||||||
Map<SuffixEvristic, SuffixCounter> statititics = new HashMap<SuffixEvristic, SuffixCounter>();
|
Map<SuffixHeuristic, SuffixCounter> statititics = new HashMap<SuffixHeuristic, SuffixCounter>();
|
||||||
|
|
||||||
private Integer ignoredCount = 0;
|
private Integer ignoredCount = 0;
|
||||||
|
|
||||||
public void proccess(WordCard wordCard) {
|
public void proccess(WordCard wordCard) {
|
||||||
for (String form : wordCard.getWordsFroms()) {
|
for (String form : wordCard.getWordsFroms()) {
|
||||||
SuffixEvristic suffixEvristic = createEvristic(wordCard.getCanonicalFrom(), form);
|
SuffixHeuristic suffixHeuristic = createEvristic(wordCard.getCanonicalFrom(), form);
|
||||||
if (suffixEvristic == null) continue;
|
if (suffixHeuristic == null) continue;
|
||||||
SuffixCounter suffixCounter = statititics.get(suffixEvristic);
|
SuffixCounter suffixCounter = statititics.get(suffixHeuristic);
|
||||||
if (suffixCounter == null) {
|
if (suffixCounter == null) {
|
||||||
suffixCounter = new SuffixCounter(suffixEvristic);
|
suffixCounter = new SuffixCounter(suffixHeuristic);
|
||||||
statititics.put(suffixEvristic, suffixCounter);
|
statititics.put(suffixHeuristic, suffixCounter);
|
||||||
}
|
}
|
||||||
suffixCounter.incrementAmount();
|
suffixCounter.incrementAmount();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public Map<SuffixEvristic, SuffixCounter> getStatititics() {
|
public Map<SuffixHeuristic, SuffixCounter> getStatititics() {
|
||||||
return statititics;
|
return statititics;
|
||||||
}
|
}
|
||||||
|
|
||||||
private SuffixEvristic createEvristic(String word, String form) {
|
private SuffixHeuristic createEvristic(String word, String form) {
|
||||||
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
|
int startSymbol = form.length() > RussianSuffixDecoderEncoder.SUFFIX_LENGTH ? form.length() - RussianSuffixDecoderEncoder.SUFFIX_LENGTH : 0;
|
||||||
String formSuffix = form.substring(startSymbol);
|
String formSuffix = form.substring(startSymbol);
|
||||||
if (word.length() < startSymbol) {
|
if (word.length() < startSymbol) {
|
||||||
@ -58,7 +58,7 @@ public class StatiticsCollectors implements WordProccessor {
|
|||||||
System.out.println(word + " " + form);
|
System.out.println(word + " " + form);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
return new SuffixEvristic(formSuffix, wordSuffix);
|
return new SuffixHeuristic(formSuffix, wordSuffix);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -14,30 +14,30 @@
|
|||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.russian.morphology.evristics;
|
package org.apache.lucene.russian.morphology.heuristic;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Conains information of freqency of suffix evristic
|
* Conains information of freqency of suffix evristic
|
||||||
* in dictionary.
|
* in dictionary.
|
||||||
*/
|
*/
|
||||||
public class SuffixCounter implements Comparable {
|
public class SuffixCounter implements Comparable {
|
||||||
private SuffixEvristic suffixEvristic;
|
private SuffixHeuristic suffixHeuristic;
|
||||||
private Double amnout = 0.0;
|
private Double amnout = 0.0;
|
||||||
|
|
||||||
public SuffixCounter(SuffixEvristic suffixEvristic) {
|
public SuffixCounter(SuffixHeuristic suffixHeuristic) {
|
||||||
this.suffixEvristic = suffixEvristic;
|
this.suffixHeuristic = suffixHeuristic;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void incrementAmount() {
|
public void incrementAmount() {
|
||||||
amnout++;
|
amnout++;
|
||||||
}
|
}
|
||||||
|
|
||||||
public SuffixEvristic getSuffixEvristic() {
|
public SuffixHeuristic getSuffixEvristic() {
|
||||||
return suffixEvristic;
|
return suffixHeuristic;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setSuffixEvristic(SuffixEvristic suffixEvristic) {
|
public void setSuffixEvristic(SuffixHeuristic suffixHeuristic) {
|
||||||
this.suffixEvristic = suffixEvristic;
|
this.suffixHeuristic = suffixHeuristic;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Double getAmnout() {
|
public Double getAmnout() {
|
||||||
@ -55,6 +55,6 @@ public class SuffixCounter implements Comparable {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "" + amnout + " " + suffixEvristic.toString();
|
return "" + amnout + " " + suffixHeuristic.toString();
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -14,7 +14,7 @@
|
|||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.russian.morphology.evristics;
|
package org.apache.lucene.russian.morphology.heuristic;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Represent evristic that assume that
|
* Represent evristic that assume that
|
||||||
@ -22,11 +22,11 @@ package org.apache.lucene.russian.morphology.evristics;
|
|||||||
* It contains to suffixes from given position of
|
* It contains to suffixes from given position of
|
||||||
* canonical word form and for form.
|
* canonical word form and for form.
|
||||||
*/
|
*/
|
||||||
public class SuffixEvristic {
|
public class SuffixHeuristic {
|
||||||
private String formSuffix;
|
private String formSuffix;
|
||||||
private String normalSuffix;
|
private String normalSuffix;
|
||||||
|
|
||||||
public SuffixEvristic(String formSuffix, String normalSuffix) {
|
public SuffixHeuristic(String formSuffix, String normalSuffix) {
|
||||||
this.formSuffix = formSuffix;
|
this.formSuffix = formSuffix;
|
||||||
this.normalSuffix = normalSuffix;
|
this.normalSuffix = normalSuffix;
|
||||||
}
|
}
|
||||||
@ -52,7 +52,7 @@ public class SuffixEvristic {
|
|||||||
if (this == o) return true;
|
if (this == o) return true;
|
||||||
if (o == null || getClass() != o.getClass()) return false;
|
if (o == null || getClass() != o.getClass()) return false;
|
||||||
|
|
||||||
SuffixEvristic that = (SuffixEvristic) o;
|
SuffixHeuristic that = (SuffixHeuristic) o;
|
||||||
|
|
||||||
if (!formSuffix.equals(that.formSuffix)) return false;
|
if (!formSuffix.equals(that.formSuffix)) return false;
|
||||||
if (!normalSuffix.equals(that.normalSuffix)) return false;
|
if (!normalSuffix.equals(that.normalSuffix)) return false;
|
||||||
@ -69,7 +69,7 @@ public class SuffixEvristic {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "SuffixEvristic{" +
|
return "SuffixHeuristic{" +
|
||||||
"formSuffix='" + formSuffix + '\'' +
|
"formSuffix='" + formSuffix + '\'' +
|
||||||
", normalSuffix='" + normalSuffix + '\'' +
|
", normalSuffix='" + normalSuffix + '\'' +
|
||||||
'}';
|
'}';
|
@ -30,13 +30,13 @@ public class SuffixEvristicsTest {
|
|||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testShouldDefineCorretCononicalWordForm() throws IOException {
|
public void testShouldDefineCorretCononicalWordForm() throws IOException {
|
||||||
SuffixEvristics suffixEvristics = new SuffixEvristics();
|
SuffixHeuristic suffixHeuristic = new SuffixHeuristic();
|
||||||
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt");
|
InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt");
|
||||||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
|
||||||
String s = bufferedReader.readLine();
|
String s = bufferedReader.readLine();
|
||||||
while (s != null) {
|
while (s != null) {
|
||||||
String[] qa = s.trim().split(" ");
|
String[] qa = s.trim().split(" ");
|
||||||
assertThat(suffixEvristics.getCanonicalForm(qa[0]), equalTo(qa[1]));
|
assertThat(suffixHeuristic.getCanonicalForm(qa[0]), equalTo(qa[1]));
|
||||||
s = bufferedReader.readLine();
|
s = bufferedReader.readLine();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user