adding morph classes for language

git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@64 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
alexander.a.kuznetsov 2009-10-19 14:46:47 +00:00
parent c858d2560a
commit 9ee47e2f99
14 changed files with 135 additions and 26 deletions

View File

@ -19,7 +19,7 @@ package org.apache.lucene.morphology.dictionary;
import org.apache.lucene.morphology.Heuristic; import org.apache.lucene.morphology.Heuristic;
import org.apache.lucene.morphology.LetterDecoderEncoder; import org.apache.lucene.morphology.LetterDecoderEncoder;
import org.apache.lucene.morphology.Morph; import org.apache.lucene.morphology.Morphology;
import java.io.IOException; import java.io.IOException;
import java.util.*; import java.util.*;
@ -119,8 +119,8 @@ public class StatiticsCollector implements WordProccessor {
prevSet = currentSet; prevSet = currentSet;
} }
} }
Morph morph = new Morph(ints, rulesId, heuristics, grammaReader.getGrammaInfoAsArray()); Morphology morphology = new Morphology(ints, rulesId, heuristics, grammaReader.getGrammaInfoAsArray());
morph.writeToFile(fileName); morphology.writeToFile(fileName);
} }
private String revertWord(String s) { private String revertWord(String s) {

View File

@ -28,7 +28,7 @@ import java.util.HashSet;
public class EnglishHeuristicBuilder { public class EnglishHeuristicBuilder {
public static void main(String[] args) throws IOException { public static void main(String[] args) throws IOException {
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/egramtab.tab"); GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morphology/egramtab.tab");
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>()); DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/EngSrc/morphs.mrd", new HashSet<String>());
EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder(); EnglishLetterDecoderEncoder decoderEncoder = new EnglishLetterDecoderEncoder();

View File

@ -27,7 +27,7 @@ import java.util.HashSet;
public class RussianHeuristicBuilder { public class RussianHeuristicBuilder {
public static void main(String[] args) throws IOException { public static void main(String[] args) throws IOException {
GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morph/rgramtab.tab"); GrammaReader grammaInfo = new GrammaReader("dictonary/Dicts/Morphology/rgramtab.tab");
DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>()); DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", new HashSet<String>());
RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder(); RussianLetterDecoderEncoder decoderEncoder = new RussianLetterDecoderEncoder();

View File

@ -22,7 +22,6 @@ import org.apache.lucene.morphology.WrongCharaterException;
import java.util.ArrayList; import java.util.ArrayList;
//todo extract supper class for common method with russian letter decoder
public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder { public class EnglishLetterDecoderEncoder implements LetterDecoderEncoder {
public static final int ENGLISH_SMALL_LETTER_OFFSET = 96; public static final int ENGLISH_SMALL_LETTER_OFFSET = 96;
static public int SUFFIX_LENGTH = 6; static public int SUFFIX_LENGTH = 6;

View File

@ -0,0 +1,28 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.english;
import org.apache.lucene.morphology.LuceneMorphology;
import java.io.IOException;
/**
 * {@link LuceneMorphology} preconfigured for English.
 *
 * <p>The instance is built from the {@code morph.info} classpath resource of the
 * English package and an {@link EnglishLetterDecoderEncoder}, so callers do not
 * have to supply either one.
 */
public class EnglishLuceneMorphology extends LuceneMorphology {
    /** Absolute classpath location of the English morphology data. */
    private static final String MORPH_INFO_RESOURCE = "/org/apache/lucene/morphology/english/morph.info";

    /**
     * Opens the English {@code morph.info} resource and passes it, together with a
     * new English letter codec, to the stream-based superclass constructor.
     *
     * @throws IOException declared by the superclass constructor
     */
    public EnglishLuceneMorphology() throws IOException {
        super(
                EnglishLuceneMorphology.class.getResourceAsStream(MORPH_INFO_RESOURCE),
                new EnglishLetterDecoderEncoder());
    }
}

View File

@ -0,0 +1,28 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.english;
import org.apache.lucene.morphology.Morphology;
import java.io.IOException;
/**
 * {@link Morphology} preconfigured for English.
 *
 * <p>The instance is built from the {@code morph.info} classpath resource of the
 * English package and an {@link EnglishLetterDecoderEncoder}.
 */
public class EnglishMorphology extends Morphology {
    /**
     * Opens the English {@code morph.info} resource and passes it, together with a
     * new English letter codec, to the stream-based superclass constructor.
     *
     * @throws IOException declared by the superclass constructor
     */
    public EnglishMorphology() throws IOException {
        // Resolve the resource through this class itself rather than through the
        // unrelated sibling EnglishLuceneMorphology, so the two classes stay independent.
        super(EnglishMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder());
    }
}

View File

@ -15,7 +15,7 @@
*/ */
package org.apache.lucene.morphology.english; package org.apache.lucene.morphology.english;
import org.apache.lucene.morphology.LuceneMorph; import org.apache.lucene.morphology.LuceneMorphology;
import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.equalTo;
import static org.junit.Assert.assertThat; import static org.junit.Assert.assertThat;
import org.junit.Before; import org.junit.Before;
@ -29,11 +29,11 @@ import java.util.HashSet;
import java.util.Set; import java.util.Set;
public class EnglishLuceneMorphTest { public class EnglishLuceneMorphTest {
private LuceneMorph luceneMorph; private LuceneMorphology luceneMorph;
@Before @Before
public void setUp() throws IOException { public void setUp() throws IOException {
luceneMorph = new LuceneMorph(this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder()); luceneMorph = new LuceneMorphology(this.getClass().getResourceAsStream("/org/apache/lucene/morphology/english/morph.info"), new EnglishLetterDecoderEncoder());
} }
@Test @Test

View File

@ -23,13 +23,13 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
public class LuceneMorph extends Morph { public class LuceneMorphology extends Morphology {
public LuceneMorph(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException { public LuceneMorphology(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException {
super(fileName, decoderEncoder); super(fileName, decoderEncoder);
} }
public LuceneMorph(InputStream inputStream, LetterDecoderEncoder decoderEncoder) throws IOException { public LuceneMorphology(InputStream inputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
super(inputStream, decoderEncoder); super(inputStream, decoderEncoder);
} }

View File

@ -21,7 +21,7 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
public class Morph { public class Morphology {
protected int[][] separators; protected int[][] separators;
protected short[] rulesId; protected short[] rulesId;
protected Heuristic[][] rules; protected Heuristic[][] rules;
@ -29,17 +29,17 @@ public class Morph {
protected LetterDecoderEncoder decoderEncoder; protected LetterDecoderEncoder decoderEncoder;
public Morph(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException { public Morphology(String fileName, LetterDecoderEncoder decoderEncoder) throws IOException {
readFromFile(fileName); readFromFile(fileName);
this.decoderEncoder = decoderEncoder; this.decoderEncoder = decoderEncoder;
} }
public Morph(InputStream inputStream, LetterDecoderEncoder decoderEncoder) throws IOException { public Morphology(InputStream inputStream, LetterDecoderEncoder decoderEncoder) throws IOException {
readFromInputStream(inputStream); readFromInputStream(inputStream);
this.decoderEncoder = decoderEncoder; this.decoderEncoder = decoderEncoder;
} }
public Morph(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) { public Morphology(int[][] separators, short[] rulesId, Heuristic[][] rules, String[] grammaInfo) {
this.separators = separators; this.separators = separators;
this.rulesId = rulesId; this.rulesId = rulesId;
this.rules = rules; this.rules = rules;

View File

@ -22,21 +22,21 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.morphology.LetterDecoderEncoder; import org.apache.lucene.morphology.LetterDecoderEncoder;
import org.apache.lucene.morphology.LuceneMorph; import org.apache.lucene.morphology.LuceneMorphology;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.Reader; import java.io.Reader;
public class MorphlogyAnalayzer extends Analyzer { public class MorphlogyAnalayzer extends Analyzer {
private LuceneMorph luceneMorph; private LuceneMorphology luceneMorph;
public MorphlogyAnalayzer(String pathToMorph, LetterDecoderEncoder letterDecoderEncoder) throws IOException { public MorphlogyAnalayzer(String pathToMorph, LetterDecoderEncoder letterDecoderEncoder) throws IOException {
luceneMorph = new LuceneMorph(pathToMorph, letterDecoderEncoder); luceneMorph = new LuceneMorphology(pathToMorph, letterDecoderEncoder);
} }
public MorphlogyAnalayzer(InputStream inputStream, LetterDecoderEncoder letterDecoderEncoder) throws IOException { public MorphlogyAnalayzer(InputStream inputStream, LetterDecoderEncoder letterDecoderEncoder) throws IOException {
luceneMorph = new LuceneMorph(inputStream, letterDecoderEncoder); luceneMorph = new LuceneMorphology(inputStream, letterDecoderEncoder);
} }
public TokenStream tokenStream(String fieldName, Reader reader) { public TokenStream tokenStream(String fieldName, Reader reader) {

View File

@ -19,7 +19,7 @@ package org.apache.lucene.morphology.analayzer;
import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.morphology.LuceneMorph; import org.apache.lucene.morphology.LuceneMorphology;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
@ -27,9 +27,9 @@ import java.util.List;
public class MorphlogyFilter extends TokenFilter { public class MorphlogyFilter extends TokenFilter {
private LuceneMorph luceneMorph; private LuceneMorphology luceneMorph;
public MorphlogyFilter(TokenStream tokenStream, LuceneMorph luceneMorph) { public MorphlogyFilter(TokenStream tokenStream, LuceneMorphology luceneMorph) {
super(tokenStream); super(tokenStream);
this.luceneMorph = luceneMorph; this.luceneMorph = luceneMorph;
} }

View File

@ -0,0 +1,27 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.russian;
import org.apache.lucene.morphology.LuceneMorphology;
import java.io.IOException;
/**
 * {@link LuceneMorphology} preconfigured for Russian.
 *
 * <p>The instance is built from the {@code morph.info} classpath resource of the
 * Russian package and a {@link RussianLetterDecoderEncoder}.
 */
public class RussianLuceneMorphology extends LuceneMorphology {
    /**
     * Opens the Russian {@code morph.info} resource and passes it, together with a
     * new Russian letter codec, to the stream-based superclass constructor.
     *
     * @throws IOException declared by the superclass constructor
     */
    public RussianLuceneMorphology() throws IOException {
        // The data must come from the russian package: the English morph.info
        // does not belong with RussianLetterDecoderEncoder.
        super(RussianLuceneMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder());
    }
}

View File

@ -0,0 +1,27 @@
/**
* Copyright 2009 Alexander Kuznetsov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.morphology.russian;
import org.apache.lucene.morphology.Morphology;
import java.io.IOException;
/**
 * {@link Morphology} preconfigured for Russian.
 *
 * <p>The instance is built from the {@code morph.info} classpath resource of the
 * Russian package and a {@link RussianLetterDecoderEncoder}.
 */
public class RussianMorphology extends Morphology {
    /**
     * Opens the Russian {@code morph.info} resource and passes it, together with a
     * new Russian letter codec, to the stream-based superclass constructor.
     *
     * @throws IOException declared by the superclass constructor
     */
    public RussianMorphology() throws IOException {
        // The data must come from the russian package: the English morph.info
        // does not belong with RussianLetterDecoderEncoder.
        super(RussianMorphology.class.getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder());
    }
}

View File

@ -15,7 +15,7 @@
*/ */
package org.apache.lucene.morphology.russian; package org.apache.lucene.morphology.russian;
import org.apache.lucene.morphology.LuceneMorph; import org.apache.lucene.morphology.LuceneMorphology;
import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.equalTo;
import static org.junit.Assert.assertThat; import static org.junit.Assert.assertThat;
import org.junit.Before; import org.junit.Before;
@ -29,11 +29,11 @@ import java.util.HashSet;
import java.util.Set; import java.util.Set;
public class RussianLuceneMorphTest { public class RussianLuceneMorphTest {
private LuceneMorph luceneMorph; private LuceneMorphology luceneMorph;
@Before @Before
public void setUp() throws IOException { public void setUp() throws IOException {
luceneMorph = new LuceneMorph(this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder()); luceneMorph = new LuceneMorphology(this.getClass().getResourceAsStream("/org/apache/lucene/morphology/russian/morph.info"), new RussianLetterDecoderEncoder());
} }
@Test @Test