From ca1a9be6b77bf214293e5362f7aa9b1567cec55e Mon Sep 17 00:00:00 2001 From: "alexander.a.kuznetsov" Date: Fri, 2 Oct 2009 17:02:52 +0000 Subject: [PATCH] adding clean for strings git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@50 d817d54c-26ab-11de-abc9-2f7d1455ff7a --- .../dictionary/StatiticsCollector.java | 17 ++++++++++++----- .../morphology/russian/HeuristicBuilder.java | 3 ++- .../russian/RussianSuffixDecoderEncoder.java | 2 +- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/dictionary-reader/src/main/java/org/apache/lucene/morpholgy/dictionary/StatiticsCollector.java b/dictionary-reader/src/main/java/org/apache/lucene/morpholgy/dictionary/StatiticsCollector.java index b6ef5f8..e913a11 100644 --- a/dictionary-reader/src/main/java/org/apache/lucene/morpholgy/dictionary/StatiticsCollector.java +++ b/dictionary-reader/src/main/java/org/apache/lucene/morpholgy/dictionary/StatiticsCollector.java @@ -33,12 +33,13 @@ public class StatiticsCollector implements WordProccessor { private LetterDecoderEncoder decoderEncoder; - public StatiticsCollector(GrammaReader grammaReader) { + public StatiticsCollector(GrammaReader grammaReader, LetterDecoderEncoder decoderEncoder) { this.grammaReader = grammaReader; + this.decoderEncoder = decoderEncoder; } public void proccess(WordCard wordCard) throws IOException { - wordCard = cleanWordCard(wordCard); + cleanWordCard(wordCard); String normalStringMorph = wordCard.getWordsFroms().get(0).getCode(); String word = wordCard.getBase() + wordCard.getCanonicalSuffix(); if (word.contains("-")) return; @@ -55,8 +56,15 @@ public class StatiticsCollector implements WordProccessor { } } - private WordCard cleanWordCard(WordCard wordCard) { - return wordCard; + private void cleanWordCard(WordCard wordCard) { + wordCard.setBase(cleanString(wordCard.getBase())); + wordCard.setCanonicalFrom(cleanString(wordCard.getCanonicalFrom())); + wordCard.setCanonicalSuffix(cleanString(wordCard.getCanonicalSuffix())); + List 
<FlexiaModel> models = wordCard.getWordsFroms(); + for (FlexiaModel m : models) { + m.setSuffix(cleanString(m.getSuffix())); + m.setPrefix(cleanString(m.getPrefix())); + } } @@ -141,7 +149,6 @@ public class StatiticsCollector implements WordProccessor { private String cleanString(String s) { return decoderEncoder.cleanString(s); - //return s.replace((char) (34 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (6 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET)); } } diff --git a/russian/src/main/java/org/apache/lucene/morphology/russian/HeuristicBuilder.java b/russian/src/main/java/org/apache/lucene/morphology/russian/HeuristicBuilder.java index f38a335..9ac57d2 100644 --- a/russian/src/main/java/org/apache/lucene/morphology/russian/HeuristicBuilder.java +++ b/russian/src/main/java/org/apache/lucene/morphology/russian/HeuristicBuilder.java @@ -31,7 +31,8 @@ public class HeuristicBuilder { GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab"); DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form); - StatiticsCollector statiticsCollector = new StatiticsCollector(grammaInfo); + RussianSuffixDecoderEncoder decoderEncoder = new RussianSuffixDecoderEncoder(); + StatiticsCollector statiticsCollector = new StatiticsCollector(grammaInfo, decoderEncoder); dictonaryReader.proccess(statiticsCollector); statiticsCollector.saveHeuristic(); diff --git a/russian/src/main/java/org/apache/lucene/morphology/russian/RussianSuffixDecoderEncoder.java b/russian/src/main/java/org/apache/lucene/morphology/russian/RussianSuffixDecoderEncoder.java index 6c0240d..d2f3947 100644 --- a/russian/src/main/java/org/apache/lucene/morphology/russian/RussianSuffixDecoderEncoder.java +++ b/russian/src/main/java/org/apache/lucene/morphology/russian/RussianSuffixDecoderEncoder.java @@ -107,6 +107,6 @@ public class RussianSuffixDecoderEncoder implements LetterDecoderEncoder { } public String cleanString(String s) { 
- return s; + return s.replace((char) (34 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET), (char) (6 + RussianSuffixDecoderEncoder.RUSSIAN_SMALL_LETTER_OFFSET)); } }