Adding context stats
This commit is contained in:
@ -1,23 +1,36 @@
|
||||
<?xml version="1.0"?>
|
||||
<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
|
||||
xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<artifactId>morphology</artifactId>
|
||||
<version>1.2-SNAPSHOT</version>
|
||||
</parent>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<artifactId>morphology</artifactId>
|
||||
<version>1.2-SNAPSHOT</version>
|
||||
</parent>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<artifactId>context</artifactId>
|
||||
<version>1.0-SNAPSHOT</version>
|
||||
<name>context</name>
|
||||
<url>http://maven.apache.org</url>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>3.8.1</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
<artifactId>context</artifactId>
|
||||
<version>1.0-SNAPSHOT</version>
|
||||
<name>context</name>
|
||||
<url>http://maven.apache.org</url>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>4.8.2</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<artifactId>russian</artifactId>
|
||||
<version>1.2-SNAPSHOT</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene.morphology</groupId>
|
||||
<artifactId>english</artifactId>
|
||||
<version>1.2-SNAPSHOT</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
||||
|
@ -1,7 +1,52 @@
|
||||
/**
|
||||
* Copyright 2015 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morphology.context;
|
||||
|
||||
/**
|
||||
* Created by alexander on 16.06.15.
|
||||
*/
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
public class CalculateContextItem {
|
||||
|
||||
public List<ContextItem> createContextItems(String text) throws IOException {
|
||||
Analyzer statAnalyzer = new StatAnalyzer();
|
||||
InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год? - и что я жду тебя, где вино".getBytes()), "UTF-8");
|
||||
|
||||
|
||||
// new RussianMorphology();
|
||||
|
||||
TokenStream tokenStream = statAnalyzer.tokenStream(null, reader);
|
||||
tokenStream.reset();
|
||||
|
||||
List<List<String>> listedLink = new LinkedList<>();
|
||||
while (tokenStream.incrementToken()) {
|
||||
CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
|
||||
PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
@ -1,11 +1,24 @@
|
||||
/**
|
||||
* Copyright 2015 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morphology.context;
|
||||
|
||||
/**
|
||||
* Created by alexander on 16.06.15.
|
||||
*/
|
||||
public class ContextItem {
|
||||
import java.util.Arrays;
|
||||
|
||||
public class ContextItem implements Comparable<ContextItem> {
|
||||
String[][] morphInfo;
|
||||
long count = 0;
|
||||
|
||||
public ContextItem(String[][] morphInfo) {
|
||||
this.morphInfo = morphInfo;
|
||||
@ -19,11 +32,49 @@ public class ContextItem {
|
||||
this.morphInfo = morphInfo;
|
||||
}
|
||||
|
||||
public long getCount() {
|
||||
return count;
|
||||
public int hashCode() {
|
||||
int h = 0;
|
||||
for (String[] m : morphInfo) {
|
||||
for (String s : m) {
|
||||
h = 31 * h + s.hashCode();
|
||||
}
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
public void setCount(long count) {
|
||||
this.count = count;
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
|
||||
ContextItem that = (ContextItem) o;
|
||||
|
||||
if (that.morphInfo.length != this.morphInfo.length) {
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < morphInfo.length; i++) {
|
||||
if (!Arrays.equals(morphInfo[i], that.morphInfo[i])) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int compareTo(ContextItem o) {
|
||||
int i = o.morphInfo.length - morphInfo.length;
|
||||
if (i != 0) return i;
|
||||
for (int j = 0; j < morphInfo.length; j++) {
|
||||
i = o.morphInfo[j].length - morphInfo[j].length;
|
||||
if (i != 0) return i;
|
||||
for (int k = 0; k < morphInfo[j].length; k++) {
|
||||
i = morphInfo[j][k].compareTo(o.morphInfo[j][k]);
|
||||
if (i != 0) return i;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
@ -1,8 +1,20 @@
|
||||
/**
|
||||
* Copyright 2015 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morphology.context;
|
||||
|
||||
/**
|
||||
* Created by alexander on 16.06.15.
|
||||
*/
|
||||
public class ContextStats {
|
||||
String[] morphInfo;
|
||||
double prob;
|
||||
|
@ -1,8 +1,20 @@
|
||||
/**
|
||||
* Copyright 2015 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morphology.context;
|
||||
|
||||
/**
|
||||
* Created by alexander on 16.06.15.
|
||||
*/
|
||||
public class ProbClalucator {
|
||||
|
||||
|
||||
|
@ -1,3 +1,18 @@
|
||||
/**
|
||||
* Copyright 2015 Alexander Kuznetsov
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.morphology.context;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
@ -13,9 +28,6 @@ import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Created by alexander on 16.06.15.
|
||||
*/
|
||||
public class SimpleTokenizer extends Tokenizer {
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
|
@ -0,0 +1,34 @@
|
||||
package org.apache.lucene.morphology.context;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
/**
|
||||
* Created by akuznetsov on 6/24/15.
|
||||
*/
|
||||
public class StatAnalyzer extends Analyzer {
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String s) {
|
||||
|
||||
SimpleTokenizer src = new SimpleTokenizer();
|
||||
TokenFilter filter = new StandardFilter(src);
|
||||
filter = new LowerCaseFilter(filter);
|
||||
|
||||
return new TokenStreamComponents(src, filter) {
|
||||
@Override
|
||||
protected void setReader(final Reader reader) throws IOException {
|
||||
super.setReader(reader);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -0,0 +1,32 @@
|
||||
package org.apache.lucene.morphology.context;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
|
||||
public class SimpleTokenizerTest {
|
||||
|
||||
@Test
|
||||
public void testSimpleTokenizer() throws IOException {
|
||||
Analyzer statAnalyzer = new StatAnalyzer();
|
||||
InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год? - и что я жду тебя, где вино".getBytes()), "UTF-8");
|
||||
|
||||
TokenStream tokenStream = statAnalyzer.tokenStream(null, reader);
|
||||
tokenStream.reset();
|
||||
|
||||
boolean wordSeen = false;
|
||||
while (tokenStream.incrementToken()) {
|
||||
CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
|
||||
PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class);
|
||||
System.out.println(charTerm.toString());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
Reference in New Issue
Block a user