@maxfriedrich
Created January 31, 2015 14:32
POS Tagger
#!/usr/bin/env python
import argparse
import collections
import sys


def train_hmm(filename):
    """ Trains a Hidden Markov Model with data from a text file. Returns a markov
    dictionary (see `markov_dict`) and a dictionary of emission probabilities.
    """

    def words_and_tags_from_file(filename):
        """ Reads words and POS tags from a text file. The file must contain a word
        and its POS tag in each line, separated by '\t'. Returns two lists of the
        same length: one containing the words and one containing the tags.
        """
        words, tags = [''], ['']
        # starting with an empty word/tag so all words that begin sentences follow a
        # '' word/tag and we can calculate starting probabilities with `markov['']`
        with open(filename) as file:
            for line in file:
                if line != '\n':
                    split = line.strip().split('\t')
                    words.append(split[0])
                    tags.append(split[1])
                else:
                    words.append('')
                    tags.append('')
        return words, tags
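
    # Hypothetical excerpt of such a training file (STTS-style tags shown for
    # illustration only; a blank line separates sentences):
    #
    #   Dell<TAB>NE
    #   und<TAB>KON
    #   ...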
    def markov_dict(words):
        """ Generates a Markov chain dictionary from a list of states. The dictionary
        maps a state to a dictionary of its neighbors and their probabilities, e.g.
        {'GWV': {'lecture': 0.5, 'tutorial': 0.3, ...}, ...}
        """
        neighbors = collections.defaultdict(list)
        for i in range(len(words) - 1):
            word, neighbor = words[i], words[i + 1]
            neighbors[word].append(neighbor)
        return {word: probabilities_dict(word_neighbors)
                for word, word_neighbors in neighbors.items()}
    def emission_probabilities(pairs):
        """ Generates a dictionary of emission probabilities from a list of pairs
        (state, emission) of HMM states and their emissions.
        """
        emissions = collections.defaultdict(list)
        for state, emission in pairs:
            emissions[state].append(emission)
        return {state: probabilities_dict(state_emissions)
                for state, state_emissions in emissions.items()}
    def probabilities_dict(values):
        """ Helper function that generates a dictionary of probabilities from a list,
        e.g. ['a', 'a', 'b', 'c'] -> {'a': 0.5, 'b': 0.25, 'c': 0.25}
        """
        counts = collections.defaultdict(int)
        for value in values:
            counts[value] += 1
        return {key: count / len(values) for key, count in counts.items()}
    words, tags = words_and_tags_from_file(filename)
    hidden_markov = markov_dict(tags)
    emissions = emission_probabilities(zip(tags, words))
    return hidden_markov, emissions
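
# Illustrative shapes of what `train_hmm` returns (probabilities made up):
#   hidden_markov: {'': {'ART': 0.2, 'NE': 0.1, ...}, 'ART': {'NN': 0.7, ...}, ...}
#   emissions:     {'NE': {'Dell': 0.01, 'Cisco': 0.005, ...}, ...}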

def hmm_viterbi(sentence, hidden_markov, emissions):
    """ Returns a list of states generated by the Viterbi algorithm. The list is
    the most probable sequence of HMM states (POS tags) for the sentence
    (emissions).
    """
    # a[k] maps each state to a tuple of its best path probability and its most
    # probable predecessor state
    a = [{'': (1.0, None)}]
    for k, word in enumerate(sentence):
        a.append({})
        for state in hidden_markov:
            paths = []
            for previous_state in a[k]:
                transition_probability = hidden_markov[previous_state].get(state, 0.0)
                # handle unknown words: fall back to an emission probability of
                # 0.0001 for all states
                emission_probability = emissions[state].get(word) or 0.0001
                paths.append((a[k][previous_state][0] * transition_probability
                              * emission_probability, previous_state))
            # only save the probability and predecessor of the most probable path
            a[k + 1][state] = max(paths, key=lambda path: path[0])

    # backtracking the path: inserting the predecessor at the beginning of the
    # tag list; the last tag is assumed to be '$.', the sentence-final
    # punctuation tag
    end = '$.'
    tags = [end]
    k = len(sentence)
    while k > 1:
        predecessor = a[k][tags[0]][1]
        tags.insert(0, predecessor)
        k -= 1
    return tags
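
# Minimal usage sketch (hypothetical tag sequence; the real output depends
# entirely on the training data):
#
#   hidden_markov, emissions = train_hmm('hdt-1-10000-train.tags')
#   hmm_viterbi(['Dell', 'zeigte', 'sich', 'unbeeindruckt', '.'],
#               hidden_markov, emissions)
#   # -> e.g. ['NE', 'VVFIN', 'PRF', 'ADJD', '$.']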

def print_pos_tags(pairs):
    """ Helper function that prints words and their tags separated by a backslash
    as required by this exercise.
    """
    output = ''
    for word, tag in pairs:
        output += word + '\\' + tag + ' '
    print(output)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        add_help=False,
        description='example: `python tagger.py Dell und Cisco zeigten sich '
                    'hingegen davon relativ unbeeindruckt .`')
    parser.add_argument('--filename', '-f', default='hdt-1-10000-train.tags',
                        help='The training file. Default: hdt-1-10000-train.tags')
    parser.add_argument('sentence', nargs='*', help='The sentence to be tagged.')
    args = parser.parse_args()

    if not args.sentence:
        parser.print_help()
        sys.exit(0)

    hidden_markov, emissions = train_hmm(args.filename)
    viterbi_tags = hmm_viterbi(args.sentence, hidden_markov, emissions)
    print_pos_tags(zip(args.sentence, viterbi_tags))
    sys.exit(0)
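
# Example invocation (the sentence is taken from the parser description; the
# tags in the output are whatever the trained model predicts):
#
#   $ python tagger.py Dell und Cisco zeigten sich hingegen davon relativ unbeeindruckt .
#
# which prints one line of word\TAG pairs.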
@maxfriedrich (Author):
Part of a homework solution.

Training file: hdt-tagged.tar.xz
