taking into account word text statistics
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@25 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
		
							
								
								
									
										32459
									
								
								data/lemma.num
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										32459
									
								
								data/lemma.num
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -17,6 +17,7 @@ | ||||
| package org.apache.lucene.russian.morphology; | ||||
|  | ||||
| import org.apache.lucene.russian.morphology.dictonary.DictonaryReader; | ||||
| import org.apache.lucene.russian.morphology.dictonary.FrequentyReader; | ||||
| import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader; | ||||
| import org.apache.lucene.russian.morphology.heuristic.Heuristic; | ||||
| import org.apache.lucene.russian.morphology.heuristic.StatiticsCollectors; | ||||
| @@ -33,8 +34,11 @@ public class HeuristicBuilder { | ||||
|         IgnoredFormReader formReader = new IgnoredFormReader("data/igoredFrom.txt"); | ||||
|         Set<String> form = formReader.getIngnoredFroms(); | ||||
|  | ||||
|         FrequentyReader frequentyReader = new FrequentyReader("data/lemma.num"); | ||||
|  | ||||
|         DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form); | ||||
|         StatiticsCollectors statiticsCollectors = new StatiticsCollectors(); | ||||
|  | ||||
|         StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read()); | ||||
|         dictonaryReader.proccess(statiticsCollectors); | ||||
|         Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values(); | ||||
|         Object[] objects = counterCollection.toArray(); | ||||
| @@ -46,9 +50,9 @@ public class HeuristicBuilder { | ||||
|  | ||||
|         final Heuristic heuristic = new Heuristic(); | ||||
|         for (int i = 0; i < objects.length; i++) { | ||||
|             heuristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic()); | ||||
|             heuristic.addHeuristic(((SuffixCounter) objects[i]).getSuffixHeuristic()); | ||||
|         } | ||||
|  | ||||
|         heuristic.writeToFile("src/main/resources/org/apache/lucene/russian/morpholgy/russianSuffixesEvristics.txt"); | ||||
|         heuristic.writeToFile("russianSuffixesHeuristic.txt"); | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -41,7 +41,7 @@ public class SuffixHeuristic { | ||||
|     } | ||||
|  | ||||
|     public void readFromResource() throws IOException { | ||||
|         InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/russianSuffixesEvristics.txt"); | ||||
|         InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/russianSuffixesHeuristic.txt"); | ||||
|         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream)); | ||||
|         readFromBufferedRreader(bufferedReader); | ||||
|     } | ||||
|   | ||||
| @@ -0,0 +1,56 @@ | ||||
| /** | ||||
|  * Copyright 2009 Alexander Kuznetsov | ||||
|  * | ||||
|  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
|  * you may not use this file except in compliance with the License. | ||||
|  * You may obtain a copy of the License at | ||||
|  * | ||||
|  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an "AS IS" BASIS, | ||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|  * See the License for the specific language governing permissions and | ||||
|  * limitations under the License. | ||||
|  */ | ||||
|  | ||||
| package org.apache.lucene.russian.morphology.dictonary; | ||||
|  | ||||
| import java.io.BufferedReader; | ||||
| import java.io.FileInputStream; | ||||
| import java.io.IOException; | ||||
| import java.io.InputStreamReader; | ||||
| import java.util.HashMap; | ||||
| import java.util.Map; | ||||
|  | ||||
|  | ||||
/**
 * Reads a space-separated frequency list from a text file into a map.
 *
 * <p>Each non-blank line is split on single spaces. The second field
 * (index 1) is parsed as the numeric frequency and the third field
 * (index 2) is used as the map key; the first field and any fields after
 * the third are ignored.
 */
public class FrequentyReader {
    /** Encoding used when the caller does not specify one. */
    private static final String DEFAULT_ENCODING = "windows-1251";

    private final String fileName;
    private final String fileEncoding;

    /**
     * Creates a reader for the given file using the default
     * {@code windows-1251} encoding.
     *
     * @param fileName path of the frequency file to read
     */
    public FrequentyReader(String fileName) {
        this(fileName, DEFAULT_ENCODING);
    }

    /**
     * Creates a reader for the given file with an explicit encoding.
     *
     * @param fileName     path of the frequency file to read
     * @param fileEncoding charset name used to decode the file
     */
    public FrequentyReader(String fileName, String fileEncoding) {
        this.fileName = fileName;
        this.fileEncoding = fileEncoding;
    }

    /**
     * Reads the whole file and returns the key-to-frequency mapping.
     *
     * <p>Blank lines are skipped. If the same key occurs more than once the
     * last occurrence wins.
     *
     * @return a new mutable map from the third field of each line to the
     *         value of its second field
     * @throws IOException if the file cannot be opened or read, the encoding
     *                     is unsupported, or a non-blank line has fewer than
     *                     three fields or a non-numeric second field
     */
    public Map<String, Double> read() throws IOException {
        Map<String, Double> result = new HashMap<String, Double>();

        // Open the raw stream first so it is closed even when the reader
        // cannot be constructed (e.g. unsupported encoding).
        FileInputStream stream = new FileInputStream(fileName);
        try {
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(stream, fileEncoding));
            int lineNumber = 0;
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                lineNumber++;
                if (line.trim().length() == 0) {
                    // Tolerate empty lines (typically a trailing one at end of file).
                    continue;
                }
                String[] fields = line.split(" ");
                if (fields.length < 3) {
                    throw new IOException("Malformed line " + lineNumber + " in " + fileName
                            + ": expected at least 3 space-separated fields but got "
                            + fields.length);
                }
                try {
                    result.put(fields[2], Double.valueOf(fields[1]));
                } catch (NumberFormatException e) {
                    throw new IOException("Malformed frequency '" + fields[1] + "' on line "
                            + lineNumber + " in " + fileName, e);
                }
            }
        } finally {
            stream.close();
        }
        return result;
    }
}
| @@ -28,7 +28,7 @@ import java.util.TreeMap; | ||||
| public class Heuristic { | ||||
|     private TreeMap<Long, Long> encodedSuffixesPairs = new TreeMap<Long, Long>(); | ||||
|  | ||||
|     public void addEvristic(SuffixHeuristic suffixHeuristic) { | ||||
|     public void addHeuristic(SuffixHeuristic suffixHeuristic) { | ||||
|         Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix()); | ||||
|         Long longs = encodedSuffixesPairs.get(suffix); | ||||
|         if (longs == null) { | ||||
|   | ||||
| @@ -26,6 +26,11 @@ import java.util.Map; | ||||
|  | ||||
| public class StatiticsCollectors implements WordProccessor { | ||||
|     Map<SuffixHeuristic, SuffixCounter> statititics = new HashMap<SuffixHeuristic, SuffixCounter>(); | ||||
|     private Map<String, Double> wordsFreq; | ||||
|  | ||||
|     public StatiticsCollectors(Map<String, Double> wordsFreq) { | ||||
|         this.wordsFreq = wordsFreq; | ||||
|     } | ||||
|  | ||||
|     private Integer ignoredCount = 0; | ||||
|  | ||||
| @@ -38,8 +43,14 @@ public class StatiticsCollectors implements WordProccessor { | ||||
|                 suffixCounter = new SuffixCounter(suffixHeuristic); | ||||
|                 statititics.put(suffixHeuristic, suffixCounter); | ||||
|             } | ||||
|             Double freq = wordsFreq.get(wordCard.getCanonicalFrom()); | ||||
|             if (freq != null) { | ||||
|                 suffixCounter.incrementAmount(1 + Math.log(freq)); | ||||
|             } else { | ||||
|                 suffixCounter.incrementAmount(); | ||||
|             } | ||||
|  | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     public Map<SuffixHeuristic, SuffixCounter> getStatititics() { | ||||
|   | ||||
| @@ -32,7 +32,11 @@ public class SuffixCounter implements Comparable { | ||||
|         amnout++; | ||||
|     } | ||||
|  | ||||
|     public SuffixHeuristic getSuffixEvristic() { | ||||
|     public void incrementAmount(Double wordFreq) { | ||||
|         amnout += wordFreq; | ||||
|     } | ||||
|  | ||||
|     public SuffixHeuristic getSuffixHeuristic() { | ||||
|         return suffixHeuristic; | ||||
|     } | ||||
|  | ||||
|   | ||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -26,12 +26,12 @@ import java.io.InputStream; | ||||
| import java.io.InputStreamReader; | ||||
| 
 | ||||
| 
 | ||||
| public class SuffixEvristicsTest { | ||||
| public class SuffixHeuristicTest { | ||||
| 
 | ||||
|     @Test | ||||
|     public void testShouldDefineCorretCononicalWordForm() throws IOException { | ||||
|         SuffixHeuristic suffixHeuristic = new SuffixHeuristic(); | ||||
|         InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt"); | ||||
|         InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-heuristic-test-data.txt"); | ||||
|         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); | ||||
|         String s = bufferedReader.readLine(); | ||||
|         while (s != null) { | ||||
| @@ -5,3 +5,4 @@ | ||||
| поэтическая поэтический | ||||
| произошло произойти | ||||
| test test | ||||
| ананасов ананас | ||||
| @@ -9,7 +9,7 @@ | ||||
| заход | ||||
| на | ||||
| посадка | ||||
| все | ||||
| весь | ||||
| нормальный | ||||
| быть | ||||
| рекомендовать | ||||
| @@ -25,7 +25,7 @@ | ||||
| новолазаревский | ||||
| антарктида | ||||
| совершаться | ||||
| примерный | ||||
| примерно | ||||
| один | ||||
| раз | ||||
| в | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Alexander.A.Kuznetsov
					Alexander.A.Kuznetsov