Adding built Russian suffix heuristics

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@4 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
alexander.a.kuznetsov
2009-04-11 20:54:21 +00:00
parent b334960f5d
commit 5214a0b7f4
3 changed files with 694938 additions and 24 deletions

121
data/igoredFrom.txt Normal file
View File

@ -0,0 +1,121 @@
// <20><><EFBFBD><EFBFBD><EFBFBD>
<EFBFBD><EFBFBD> C <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> C <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> C <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> C <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> C <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> C <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> C <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,<2C><><EFBFBD><EFBFBD>
<EFBFBD><EFBFBD> C <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> C <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> C <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> C <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> C <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> C <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> I <20> <20><>,<2C><><EFBFBD>,0
<EFBFBD><EFBFBD> E <20> <20><>-<2D><>,<2C><><EFBFBD>,0
<EFBFBD><EFBFBD> E <20> <20><>-<2D><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> E <20> <20><>-<2D><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> E <20> <20><>-<2D><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> E <20> <20><>-<2D><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> E <20> <20><>-<2D><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> E <20> <20><>-<2D><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> E <20> <20><>-<2D><>,<2C><><EFBFBD>,<2C><>,<2C><>,<2C><><EFBFBD><EFBFBD>
<EFBFBD><EFBFBD> E <20> <20><>-<2D><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> E <20> <20><>-<2D><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> E <20> <20><>-<2D><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> E <20> <20><>-<2D><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> E <20> <20><>-<2D><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> E <20> <20><>-<2D><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> I <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> I <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> I <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> I <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> I <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> I <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> I <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,<2C><><EFBFBD><EFBFBD>
<EFBFBD><EFBFBD> I <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> I <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> I <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> I <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> I <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> I <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>
<EFBFBD><EFBFBD> I <20> <20><>,<2C><><EFBFBD>,0
// <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
// <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
// <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
// <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><>,<2C><>,
// <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> (<28><><EFBFBD><EFBFBD>.)
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><><EFBFBD><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><><EFBFBD><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><><EFBFBD><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><><EFBFBD><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><><EFBFBD><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><><EFBFBD><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><><EFBFBD><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><><EFBFBD><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><><EFBFBD><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><><EFBFBD><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><><EFBFBD><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><><EFBFBD><EFBFBD>,<2C><>,<2C><>,
// <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> (<28><><EFBFBD><EFBFBD>.)
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><><EFBFBD><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><><EFBFBD><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><><EFBFBD><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><><EFBFBD><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><><EFBFBD><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><><EFBFBD><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><><EFBFBD><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><><EFBFBD><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><><EFBFBD><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><><EFBFBD><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><><EFBFBD><EFBFBD>,<2C><>,<2C><>,
<EFBFBD><EFBFBD> Q <20> <20><>,<2C><><EFBFBD>,<2C><><EFBFBD><EFBFBD>,<2C><>,<2C><>,

View File

@ -13,12 +13,12 @@ import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
public class SuffixResearcher {
public class EvristicBuilder {
public static void main(String[] args) throws IOException {
IgnoredFormReader formReader = new IgnoredFormReader("igoredFrom.txt");
IgnoredFormReader formReader = new IgnoredFormReader("data/igoredFrom.txt");
Set<String> form = formReader.getIngnoredFroms();
DictonaryReader dictonaryReader = new DictonaryReader("morphs.mrd", form);
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form);
StatiticsCollectors statiticsCollectors = new StatiticsCollectors();
dictonaryReader.proccess(statiticsCollectors);
Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values();
@ -34,26 +34,6 @@ public class SuffixResearcher {
evristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic());
}
final AtomicInteger good = new AtomicInteger(0);
final AtomicInteger bad = new AtomicInteger(0);
final FileWriter writer = new FileWriter("incorret.txt");
dictonaryReader.proccess(new WordProccessor(){
public void proccess(WordCard wordCard) throws IOException {
for(String wordForm:wordCard.getWordsFroms()){
String cf = wordCard.getCanonicalFrom();
if (evristic.getNormalForm(wordForm).equals(cf)){
good.incrementAndGet();
} else{
writer.write(wordForm + " c " + cf + " f " + evristic.getNormalForm(wordForm) + "\n");
bad.incrementAndGet();
}
}
}
});
writer.close();
System.out.println("Good " + good + " Bad " + bad);
evristic.writeToFile("evriticsb");
evristic.writeToFile("src/main/resources/russianSuffixesEvristics.txt");
}
}

File diff suppressed because it is too large Load Diff