Created
July 14, 2015 10:17
-
-
Save alexeygrigorev/26c6ee078d33137c1fb6 to your computer and use it in GitHub Desktop.
Simple rule-based POS tagger for Russian (StanfordNLP & java)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package mlp.rus; | |
import java.util.Arrays; | |
import java.util.Comparator; | |
import java.util.List; | |
import java.util.Map; | |
import java.util.Map.Entry; | |
import com.google.common.collect.ImmutableMap; | |
import com.google.common.collect.Lists; | |
import com.google.common.collect.Maps; | |
/** | |
* Simple rule-based POS tagger for Russian that looks only at the ending or the | |
* beginning of a word token to determine the part of speech. | |
* | |
* The initial implementation taken from http://habrahabr.ru/post/152389/ and | |
* rewritten to Java. | |
*/ | |
public class RuleBasedPosTagger { | |
/** | |
* Tags taken from Penn Treebank | |
* http://www.comp.leeds.ac.uk/amalgam/tagsets/upenn.html | |
*/ | |
public static enum PosTag { | |
ADJECTIVE("JJ"), | |
PARTICIPLE("VBG"), | |
VERB("VB"), | |
NOUN("NN"), | |
ADVERB("RB"), | |
NUMERAL("NUM"), | |
CONJUCTION("CC"), | |
PREPOSITION("IN"), | |
QUOTES_OPEN("``"), | |
QUOTES_CLOSE("''"), | |
COMMA(","), | |
DASH("--"), | |
END_OF_SENTENCE("."), | |
OTHER("X"); | |
private final String tag; | |
private PosTag(String tag) { | |
this.tag = tag; | |
} | |
public String getPennTag() { | |
return tag; | |
} | |
} | |
private final Map<PosTag, List<String>> rules = buildRulesMap(); | |
private final Map<String, PosTag> lookup = buildLookupMap(); | |
private static Map<PosTag, List<String>> buildRulesMap() { | |
ImmutableMap.Builder<PosTag, List<String>> builder = ImmutableMap.builder(); | |
builder.put(PosTag.ADJECTIVE, Arrays.asList("ее", "ие", "ые", "ое", "ими", "ыми", "ей", "ий", "ый", | |
"ой", "ем", "им", "ым", "ом", "его", "ого", "ему", "ому", "их", "ых", "ую", "юю", "ая", "яя", | |
"ою", "ею")); | |
builder.put(PosTag.PARTICIPLE, Arrays.asList("ивш", "ывш", "ующ", "ем", "нн", "вш", "ющ", "ущи", | |
"ющи", "ящий", "щих", "щие", "ляя")); | |
builder.put(PosTag.VERB, Arrays.asList("ила", "ыла", "ена", "ейте", "уйте", "ите", "или", "ыли", | |
"ей", "уй", "ил", "ыл", "им", "ым", "ен", "ило", "ыло", "ено", "ят", "ует", "уют", "ит", | |
"ыт", "ены", "ить", "ыть", "ишь", "ую", "ю", "ла", "на", "ете", "йте", "ли", "й", "л", "ем", | |
"н", "ло", "ет", "ют", "ны", "ть", "ешь", "нно")); | |
builder.put(PosTag.NOUN, Arrays.asList("а", "ев", "ов", "ье", "иями", "ями", "ами", "еи", "ии", "и", | |
"ией", "ей", "ой", "ий", "й", "иям", "ям", "ием", "ем", "ам", "ом", "о", "у", "ах", "иях", | |
"ях", "ы", "ь", "ию", "ью", "ю", "ия", "ья", "я", "ок", "мва", "яна", "ровать", "ег", "ги", | |
"га", "сть", "сти", "не")); | |
builder.put(PosTag.ADVERB, Arrays.asList("чно", "еко", "соко", "боко", "роко", "имо", "мно", "жно", | |
"жко", "ело", "тно", "льно", "здо", "зко", "шо", "хо", "но")); | |
builder.put(PosTag.NUMERAL, Arrays.asList("чуть", "много", "мало", "еро", "вое", "рое", "еро", "сти", | |
"одной", "двух", "рех", "еми", "яти", "ьми", "ати", "дного", "сто", "ста", "тысяча", | |
"тысячи", "две", "три", "одна", "умя", "тью", "мя", "тью", "мью", "тью", "одним")); | |
builder.put(PosTag.CONJUCTION, Arrays.asList("более", "менее", "очень", "крайне", "скоре", "некотор", | |
"кажд", "други", "котор", "когд", "однак", "если", "чтоб", "хот", "смотря", "как", "также", | |
"так", "зато", "что", "или", "потом", "эт", "тог", "тоже", "словно", "ежели", "кабы", "коли", | |
"ничем", "чем", "и")); | |
builder.put(PosTag.PREPOSITION, Arrays.asList("в", "на", "по", "из")); | |
return builder.build(); | |
} | |
private static Map<String, PosTag> buildLookupMap() { | |
ImmutableMap.Builder<String, PosTag> builder = ImmutableMap.builder(); | |
builder.put("и", PosTag.CONJUCTION); | |
builder.put("а", PosTag.CONJUCTION); | |
builder.put("но", PosTag.CONJUCTION); | |
builder.put("когда", PosTag.CONJUCTION); | |
builder.put("лишь", PosTag.CONJUCTION); | |
builder.put("пока", PosTag.CONJUCTION); | |
builder.put("едва", PosTag.CONJUCTION); | |
builder.put("зато", PosTag.CONJUCTION); | |
builder.put("либо", PosTag.CONJUCTION); | |
builder.put("или", PosTag.CONJUCTION); | |
builder.put("что", PosTag.CONJUCTION); | |
builder.put("чтобы", PosTag.CONJUCTION); | |
builder.put("как", PosTag.CONJUCTION); | |
builder.put("если", PosTag.CONJUCTION); | |
builder.put("ли", PosTag.CONJUCTION); | |
builder.put("без", PosTag.PREPOSITION); | |
builder.put("в", PosTag.PREPOSITION); | |
builder.put("до", PosTag.PREPOSITION); | |
builder.put("для", PosTag.PREPOSITION); | |
builder.put("за", PosTag.PREPOSITION); | |
builder.put("из", PosTag.PREPOSITION); | |
builder.put("к", PosTag.PREPOSITION); | |
builder.put("на", PosTag.PREPOSITION); | |
builder.put("над", PosTag.PREPOSITION); | |
builder.put("о", PosTag.PREPOSITION); | |
builder.put("об", PosTag.PREPOSITION); | |
builder.put("от", PosTag.PREPOSITION); | |
builder.put("под", PosTag.PREPOSITION); | |
builder.put("пред", PosTag.PREPOSITION); | |
builder.put("при", PosTag.PREPOSITION); | |
builder.put("про", PosTag.PREPOSITION); | |
builder.put("с", PosTag.PREPOSITION); | |
builder.put("у", PosTag.PREPOSITION); | |
builder.put("через", PosTag.PREPOSITION); | |
builder.put("между", PosTag.PREPOSITION); | |
builder.put("--", PosTag.DASH); | |
builder.put(",", PosTag.COMMA); | |
builder.put(".", PosTag.END_OF_SENTENCE); | |
builder.put("!", PosTag.END_OF_SENTENCE); | |
builder.put("?", PosTag.END_OF_SENTENCE); | |
builder.put("``", PosTag.QUOTES_OPEN); | |
builder.put("''", PosTag.QUOTES_CLOSE); | |
return builder.build(); | |
} | |
public PosTag posTag(String input) { | |
String token = input.toLowerCase();// TODO: set locale | |
if (lookup.containsKey(token)) { | |
return lookup.get(token); | |
} | |
Map<PosTag, Integer> resLens = Maps.newHashMap(); | |
for (Map.Entry<PosTag, List<String>> group : rules.entrySet()) { | |
PosTag groupKey = group.getKey(); | |
List<String> values = group.getValue(); | |
if (groupKey == PosTag.PARTICIPLE) { | |
applyParticipleRule(token, resLens, groupKey, values); | |
} else if (groupKey == PosTag.CONJUCTION) { | |
applyConjunctionRule(token, resLens, groupKey, values); | |
} else { | |
applyGeneralRule(token, resLens, groupKey, values); | |
} | |
} | |
List<Entry<PosTag, Integer>> result = sortResults(resLens); | |
if (result.isEmpty()) { | |
return PosTag.OTHER; | |
} else { | |
Entry<PosTag, Integer> tag = result.get(0); | |
return tag.getKey(); | |
} | |
} | |
private void applyParticipleRule(String token, Map<PosTag, Integer> resLens, PosTag groupKey, | |
List<String> values) { | |
int wordLength = token.length(); | |
for (String part : values) { | |
int lenPart = part.length(); | |
// participle with 40% or more of the token's length on the right | |
if (token.indexOf(part) >= (2 * wordLength / 5)) { | |
resLens.put(groupKey, lenPart); | |
} | |
} | |
} | |
private void applyConjunctionRule(String token, Map<PosTag, Integer> resLens, PosTag groupKey, | |
List<String> values) { | |
// conjunction, the beginning of word | |
for (String part : values) { | |
int lenPart = part.length(); | |
if (token.startsWith(part)) { | |
if (resLens.containsKey(groupKey)) { | |
if (resLens.get(groupKey) < lenPart) { | |
resLens.put(groupKey, lenPart); | |
} | |
} else { | |
resLens.put(groupKey, lenPart); | |
} | |
} | |
} | |
} | |
private void applyGeneralRule(String token, Map<PosTag, Integer> resLens, PosTag groupKey, | |
List<String> values) { | |
for (String part : values) { | |
int lenPart = part.length(); | |
if (token.endsWith(part)) { | |
if (resLens.containsKey(groupKey)) { | |
if (resLens.get(groupKey) < lenPart) { | |
resLens.put(groupKey, lenPart); | |
} | |
} else { | |
resLens.put(groupKey, lenPart); | |
} | |
} | |
if (token.equals(part)) { | |
resLens.put(groupKey, 99); | |
} | |
} | |
} | |
private List<Entry<PosTag, Integer>> sortResults(Map<PosTag, Integer> resLens) { | |
List<Entry<PosTag, Integer>> result = Lists.newArrayList(resLens.entrySet()); | |
result.sort(new Comparator<Map.Entry<PosTag, Integer>>() { | |
@Override | |
public int compare(Map.Entry<PosTag, Integer> o1, Map.Entry<PosTag, Integer> o2) { | |
Integer value1 = o1.getValue(); | |
Integer value2 = o2.getValue(); | |
if (value1.equals(value2)) { | |
int key1 = o1.getKey().ordinal(); | |
int key2 = o2.getKey().ordinal(); | |
return -Integer.compare(key1, key2); | |
} | |
return -value1.compareTo(value2); | |
} | |
}); | |
return result; | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package mlp.rus; | |
import java.util.Collections; | |
import java.util.List; | |
import java.util.Set; | |
import mlp.rus.RuleBasedPosTagger.PosTag; | |
import edu.stanford.nlp.ling.CoreAnnotations; | |
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; | |
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; | |
import edu.stanford.nlp.ling.CoreLabel; | |
import edu.stanford.nlp.pipeline.Annotation; | |
import edu.stanford.nlp.pipeline.Annotator; | |
public class RusPosAnnotator implements Annotator { | |
private final RuleBasedPosTagger tagger = new RuleBasedPosTagger(); | |
@Override | |
public void annotate(Annotation annotation) { | |
List<CoreLabel> list = annotation.get(TokensAnnotation.class); | |
for (CoreLabel token : list) { | |
String textToken = token.get(TextAnnotation.class); | |
PosTag tag = tagger.posTag(textToken); | |
token.set(CoreAnnotations.PartOfSpeechAnnotation.class, tag.getPennTag()); | |
} | |
} | |
@Override | |
public Set<Requirement> requires() { | |
return Collections.singleton(TOKENIZE_REQUIREMENT); | |
} | |
@Override | |
public Set<Requirement> requirementsSatisfied() { | |
return Collections.singleton(POS_REQUIREMENT); | |
} | |
} |
Awesome! Thank for adapting!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Implementation adapted from here