POS Tagger
#!/usr/bin/env python
import argparse
import collections
import sys
def train_hmm(filename):
    """ Trains a Hidden Markov Model with data from a text file. Returns a markov
    dictionary (see `markov_dict`) and a dictionary of emission probabilities.
    """

    def words_and_tags_from_file(filename):
        """ Reads words and POS tags from a text file. The file must contain a word
        and its POS tag in each line, separated by '\t'. Returns two lists of the same
        length: one containing the words and one containing the tags.
        """
        words, tags = [''], ['']
        # starting with an empty word/tag so all words that begin sentences follow a
        # '' word/tag and we can calculate starting probabilities with `markov['']`
        with open(filename) as file:
            for line in file:
                if line != '\n':
                    split = line.strip().split('\t')
                    words.append(split[0])
                    tags.append(split[1])
                else:
                    words.append('')
                    tags.append('')
        return words, tags
    def markov_dict(words):
        """ Generates a Markov chain dictionary from a list of states. The dictionary
        maps a state to a dictionary of its neighbors and their probabilities, e.g.
        {'GWV': {'lecture': 0.5, 'tutorial': 0.3, ...}, ...}
        """
        neighbors = collections.defaultdict(list)
        for i in xrange(len(words) - 1):
            word, neighbor = words[i], words[i + 1]
            neighbors[word].append(neighbor)
        return {word: probabilities_dict(neighbors) for word, neighbors in neighbors.iteritems()}
    def emission_probabilities(pairs):
        """ Generates a dictionary of emission probabilities from a list of pairs
        (state, emission) of HMM states and their emissions.
        """
        emissions = collections.defaultdict(list)
        for state, emission in pairs:
            emissions[state].append(emission)
        return {state: probabilities_dict(emissions) for state, emissions in emissions.iteritems()}
    def probabilities_dict(values):
        """ Helper function that generates a dictionary of probabilities from a list,
        e.g. ['a', 'a', 'b', 'c'] -> {'a': 0.5, 'b': 0.25, 'c': 0.25}
        """
        counts = collections.defaultdict(int)
        for value in values:
            counts[value] += 1
        return {key: count * 1.0 / len(values) for key, count in counts.iteritems()}
    words, tags = words_and_tags_from_file(filename)
    hidden_markov = markov_dict(tags)
    emissions = emission_probabilities(zip(tags, words))
    return hidden_markov, emissions
def hmm_viterbi(sentence, hidden_markov, emissions):
    """ Returns a list of states generated by the Viterbi algorithm. The list is the most
    probable sequence of HMM states (POS tags) for the sentence (emissions).
    """
    a = [{'': (1.0, None)}]  # mapping a state to its most probable predecessor
    for k, word in enumerate(sentence):
        a.append(collections.defaultdict(float))
        for state in hidden_markov.keys():
            paths = []
            for previous_state in a[k]:
                transition_probability = hidden_markov[previous_state].get(state) or 0.0
                emission_probability = emissions[state].get(word) or 0.0001
                # handle unknown words: set emission probability to 0.0001 for all states.
                paths.append((a[k][previous_state][0] * transition_probability * emission_probability, previous_state))
            max_path = sorted(paths, key=lambda x: x[0], reverse=True)[0]
            a[k + 1][state] = max_path
            # only save the probability and predecessor of the most probable path

    # backtracking the path: inserting the predecessor at the beginning of the tag list.
    # assumes every sentence ends with sentence-final punctuation tagged '$.'.
    end = '$.'
    tags = [end]
    k = len(sentence)
    while k > 1:
        predecessor = a[k][tags[0]][1]
        tags.insert(0, predecessor)
        k -= 1
    return tags
def print_pos_tags(pairs):
    """ Helper function that prints words and their tags separated by a backslash
    as required by this exercise.
    """
    output = ''
    for word, tag in pairs:
        output += word + '\\' + tag + ' '
    print output
if __name__ == '__main__':
    parser = argparse.ArgumentParser(add_help=False, description='example: `python tagger.py Dell und Cisco zeigten sich hingegen davon relativ unbeeindruckt .`')
    # no nargs=1 here so that args.filename is always a plain string, whether the
    # flag is given or the default is used.
    parser.add_argument('--filename', '-f', help='The training file. Default: hdt-1-10000-train.tags', default='hdt-1-10000-train.tags')
    parser.add_argument('sentence', nargs='*', help='The sentence to be tagged.')
    args = parser.parse_args()

    if not args.sentence:
        parser.print_help()
        sys.exit(0)

    hidden_markov, emissions = train_hmm(args.filename)
    viterbi_tags = hmm_viterbi(args.sentence, hidden_markov, emissions)
    print_pos_tags(zip(args.sentence, viterbi_tags))
    sys.exit(0)
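Besides the command-line use shown in the argparse description, the functions can be called directly. A minimal sketch (not part of the original gist), assuming the script is saved as tagger.py as its own help text suggests and the default training file is available:

# Sketch: tag the example sentence from the argparse help text.
# Assumes this file is saved as tagger.py and hdt-1-10000-train.tags exists.
from tagger import train_hmm, hmm_viterbi, print_pos_tags

hidden_markov, emissions = train_hmm('hdt-1-10000-train.tags')
sentence = 'Dell und Cisco zeigten sich hingegen davon relativ unbeeindruckt .'.split()
viterbi_tags = hmm_viterbi(sentence, hidden_markov, emissions)
print_pos_tags(zip(sentence, viterbi_tags))  # prints word\tag pairs on one line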
Part of a homework solution.
Training file: hdt-tagged.tar.xz
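The training data is read by words_and_tags_from_file, which expects one word and its tag per line separated by a tab, with a blank line between sentences. A purely illustrative excerpt (STTS-style tags, not copied from the archive) might look like:

Dell	NE
und	KON
Cisco	NE
.	$.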