Take word frequency statistics into account when building suffix heuristics
git-svn-id: https://russianmorphology.googlecode.com/svn/trunk@25 d817d54c-26ab-11de-abc9-2f7d1455ff7a
This commit is contained in:
		
							
								
								
									
										32459
									
								
								data/lemma.num
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										32459
									
								
								data/lemma.num
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -17,6 +17,7 @@ | |||||||
| package org.apache.lucene.russian.morphology; | package org.apache.lucene.russian.morphology; | ||||||
|  |  | ||||||
| import org.apache.lucene.russian.morphology.dictonary.DictonaryReader; | import org.apache.lucene.russian.morphology.dictonary.DictonaryReader; | ||||||
|  | import org.apache.lucene.russian.morphology.dictonary.FrequentyReader; | ||||||
| import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader; | import org.apache.lucene.russian.morphology.dictonary.IgnoredFormReader; | ||||||
| import org.apache.lucene.russian.morphology.heuristic.Heuristic; | import org.apache.lucene.russian.morphology.heuristic.Heuristic; | ||||||
| import org.apache.lucene.russian.morphology.heuristic.StatiticsCollectors; | import org.apache.lucene.russian.morphology.heuristic.StatiticsCollectors; | ||||||
| @@ -33,8 +34,11 @@ public class HeuristicBuilder { | |||||||
|         IgnoredFormReader formReader = new IgnoredFormReader("data/igoredFrom.txt"); |         IgnoredFormReader formReader = new IgnoredFormReader("data/igoredFrom.txt"); | ||||||
|         Set<String> form = formReader.getIngnoredFroms(); |         Set<String> form = formReader.getIngnoredFroms(); | ||||||
|  |  | ||||||
|  |         FrequentyReader frequentyReader = new FrequentyReader("data/lemma.num"); | ||||||
|  |  | ||||||
|         DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form); |         DictonaryReader dictonaryReader = new DictonaryReader("dictonary/Dicts/SrcMorph/RusSrc/morphs.mrd", form); | ||||||
|         StatiticsCollectors statiticsCollectors = new StatiticsCollectors(); |  | ||||||
|  |         StatiticsCollectors statiticsCollectors = new StatiticsCollectors(frequentyReader.read()); | ||||||
|         dictonaryReader.proccess(statiticsCollectors); |         dictonaryReader.proccess(statiticsCollectors); | ||||||
|         Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values(); |         Collection<SuffixCounter> counterCollection = statiticsCollectors.getStatititics().values(); | ||||||
|         Object[] objects = counterCollection.toArray(); |         Object[] objects = counterCollection.toArray(); | ||||||
| @@ -46,9 +50,9 @@ public class HeuristicBuilder { | |||||||
|  |  | ||||||
|         final Heuristic heuristic = new Heuristic(); |         final Heuristic heuristic = new Heuristic(); | ||||||
|         for (int i = 0; i < objects.length; i++) { |         for (int i = 0; i < objects.length; i++) { | ||||||
|             heuristic.addEvristic(((SuffixCounter) objects[i]).getSuffixEvristic()); |             heuristic.addHeuristic(((SuffixCounter) objects[i]).getSuffixHeuristic()); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         heuristic.writeToFile("src/main/resources/org/apache/lucene/russian/morpholgy/russianSuffixesEvristics.txt"); |         heuristic.writeToFile("russianSuffixesHeuristic.txt"); | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -41,7 +41,7 @@ public class SuffixHeuristic { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     public void readFromResource() throws IOException { |     public void readFromResource() throws IOException { | ||||||
|         InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/russianSuffixesEvristics.txt"); |         InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/russianSuffixesHeuristic.txt"); | ||||||
|         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream)); |         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream)); | ||||||
|         readFromBufferedRreader(bufferedReader); |         readFromBufferedRreader(bufferedReader); | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -0,0 +1,56 @@ | |||||||
|  | /** | ||||||
|  |  * Copyright 2009 Alexander Kuznetsov | ||||||
|  |  * | ||||||
|  |  * Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  |  * you may not use this file except in compliance with the License. | ||||||
|  |  * You may obtain a copy of the License at | ||||||
|  |  * | ||||||
|  |  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  |  * | ||||||
|  |  * Unless required by applicable law or agreed to in writing, software | ||||||
|  |  * distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  |  * See the License for the specific language governing permissions and | ||||||
|  |  * limitations under the License. | ||||||
|  |  */ | ||||||
|  |  | ||||||
|  | package org.apache.lucene.russian.morphology.dictonary; | ||||||
|  |  | ||||||
|  | import java.io.BufferedReader; | ||||||
|  | import java.io.FileInputStream; | ||||||
|  | import java.io.IOException; | ||||||
|  | import java.io.InputStreamReader; | ||||||
|  | import java.util.HashMap; | ||||||
|  | import java.util.Map; | ||||||
|  |  | ||||||
|  |  | ||||||
|  | public class FrequentyReader { | ||||||
|  |     private String fileName; | ||||||
|  |     private String fileEncoding = "windows-1251"; | ||||||
|  |  | ||||||
|  |     public FrequentyReader(String fileName) { | ||||||
|  |         this.fileName = fileName; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     public FrequentyReader(String fileName, String fileEncoding) { | ||||||
|  |         this.fileName = fileName; | ||||||
|  |         this.fileEncoding = fileEncoding; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     public Map<String, Double> read() throws IOException { | ||||||
|  |         Map<String, Double> result = new HashMap<String, Double>(); | ||||||
|  |  | ||||||
|  |         BufferedReader bufferedReader = new BufferedReader( | ||||||
|  |                 new InputStreamReader( | ||||||
|  |                         new FileInputStream(fileName), fileEncoding)); | ||||||
|  |         String s = bufferedReader.readLine(); | ||||||
|  |         while (s != null) { | ||||||
|  |             String[] strings = s.split(" "); | ||||||
|  |             Double value = Double.valueOf(strings[1]); | ||||||
|  |             result.put(strings[2], value); | ||||||
|  |             s = bufferedReader.readLine(); | ||||||
|  |         } | ||||||
|  |         return result; | ||||||
|  |     } | ||||||
|  | } | ||||||
| @@ -28,7 +28,7 @@ import java.util.TreeMap; | |||||||
| public class Heuristic { | public class Heuristic { | ||||||
|     private TreeMap<Long, Long> encodedSuffixesPairs = new TreeMap<Long, Long>(); |     private TreeMap<Long, Long> encodedSuffixesPairs = new TreeMap<Long, Long>(); | ||||||
|  |  | ||||||
|     public void addEvristic(SuffixHeuristic suffixHeuristic) { |     public void addHeuristic(SuffixHeuristic suffixHeuristic) { | ||||||
|         Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix()); |         Long suffix = RussianSuffixDecoderEncoder.encode(suffixHeuristic.getFormSuffix()); | ||||||
|         Long longs = encodedSuffixesPairs.get(suffix); |         Long longs = encodedSuffixesPairs.get(suffix); | ||||||
|         if (longs == null) { |         if (longs == null) { | ||||||
|   | |||||||
| @@ -26,6 +26,11 @@ import java.util.Map; | |||||||
|  |  | ||||||
| public class StatiticsCollectors implements WordProccessor { | public class StatiticsCollectors implements WordProccessor { | ||||||
|     Map<SuffixHeuristic, SuffixCounter> statititics = new HashMap<SuffixHeuristic, SuffixCounter>(); |     Map<SuffixHeuristic, SuffixCounter> statititics = new HashMap<SuffixHeuristic, SuffixCounter>(); | ||||||
|  |     private Map<String, Double> wordsFreq; | ||||||
|  |  | ||||||
|  |     public StatiticsCollectors(Map<String, Double> wordsFreq) { | ||||||
|  |         this.wordsFreq = wordsFreq; | ||||||
|  |     } | ||||||
|  |  | ||||||
|     private Integer ignoredCount = 0; |     private Integer ignoredCount = 0; | ||||||
|  |  | ||||||
| @@ -38,7 +43,13 @@ public class StatiticsCollectors implements WordProccessor { | |||||||
|                 suffixCounter = new SuffixCounter(suffixHeuristic); |                 suffixCounter = new SuffixCounter(suffixHeuristic); | ||||||
|                 statititics.put(suffixHeuristic, suffixCounter); |                 statititics.put(suffixHeuristic, suffixCounter); | ||||||
|             } |             } | ||||||
|             suffixCounter.incrementAmount(); |             Double freq = wordsFreq.get(wordCard.getCanonicalFrom()); | ||||||
|  |             if (freq != null) { | ||||||
|  |                 suffixCounter.incrementAmount(1 + Math.log(freq)); | ||||||
|  |             } else { | ||||||
|  |                 suffixCounter.incrementAmount(); | ||||||
|  |             } | ||||||
|  |  | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -32,7 +32,11 @@ public class SuffixCounter implements Comparable { | |||||||
|         amnout++; |         amnout++; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     public SuffixHeuristic getSuffixEvristic() { |     public void incrementAmount(Double wordFreq) { | ||||||
|  |         amnout += wordFreq; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     public SuffixHeuristic getSuffixHeuristic() { | ||||||
|         return suffixHeuristic; |         return suffixHeuristic; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|   | |||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -26,12 +26,12 @@ import java.io.InputStream; | |||||||
| import java.io.InputStreamReader; | import java.io.InputStreamReader; | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| public class SuffixEvristicsTest { | public class SuffixHeuristicTest { | ||||||
| 
 | 
 | ||||||
|     @Test |     @Test | ||||||
|     public void testShouldDefineCorretCononicalWordForm() throws IOException { |     public void testShouldDefineCorretCononicalWordForm() throws IOException { | ||||||
|         SuffixHeuristic suffixHeuristic = new SuffixHeuristic(); |         SuffixHeuristic suffixHeuristic = new SuffixHeuristic(); | ||||||
|         InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-evristics-test-data.txt"); |         InputStream stream = this.getClass().getResourceAsStream("/org/apache/lucene/russian/morphology/analayzer/suffix-heuristic-test-data.txt"); | ||||||
|         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); |         BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); | ||||||
|         String s = bufferedReader.readLine(); |         String s = bufferedReader.readLine(); | ||||||
|         while (s != null) { |         while (s != null) { | ||||||
| @@ -5,3 +5,4 @@ | |||||||
| поэтическая поэтический | поэтическая поэтический | ||||||
| произошло произойти | произошло произойти | ||||||
| test test | test test | ||||||
|  | ананасов ананас | ||||||
| @@ -9,7 +9,7 @@ | |||||||
| заход | заход | ||||||
| на | на | ||||||
| посадка | посадка | ||||||
| все | весь | ||||||
| нормальный | нормальный | ||||||
| быть | быть | ||||||
| рекомендовать | рекомендовать | ||||||
| @@ -25,7 +25,7 @@ | |||||||
| новолазаревский | новолазаревский | ||||||
| антарктида | антарктида | ||||||
| совершаться | совершаться | ||||||
| примерный | примерно | ||||||
| один | один | ||||||
| раз | раз | ||||||
| в | в | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Alexander.A.Kuznetsov
					Alexander.A.Kuznetsov