POS Tagger
#!/usr/bin/env python
import argparse
import collections
import sys
def train_hmm(filename):
    """ Trains a Hidden Markov Model with data from a text file. Returns a markov
    dictionary (see `markov_dict`) and a dictionary of emission probabilities.
    """

    def words_and_tags_from_file(filename):
        """ Reads words and POS tags from a text file. The file must contain a word
        and its POS tag in each line, separated by '\t'. Returns two lists of the same
        length: one containing the words and one containing the tags.
        """
        words, tags = [''], ['']
        # starting with an empty word/tag so all words that begin sentences follow a
        # '' word/tag and we can calculate starting probabilities with `markov['']`
        with open(filename) as file:
            for line in file:
                if line != '\n':
                    split = line.strip().split('\t')
                    words.append(split[0])
                    tags.append(split[1])
                else:
                    words.append('')
                    tags.append('')
        return words, tags
    def markov_dict(words):
        """ Generates a Markov chain dictionary from a list of states. The dictionary
        maps a state to a dictionary of its neighbors and their probabilities, e.g.
        {'GWV': {'lecture': 0.5, 'tutorial': 0.3, ...}, ...}
        """
        neighbors = collections.defaultdict(list)
        for i in xrange(len(words) - 1):
            word, neighbor = words[i], words[i + 1]
            neighbors[word].append(neighbor)
        return {word: probabilities_dict(neighbors) for word, neighbors in neighbors.iteritems()}
    def emission_probabilities(pairs):
        """ Generates a dictionary of emission probabilities from a list of pairs
        (state, emission) of HMM states and their emissions.
        """
        emissions = collections.defaultdict(list)
        for state, emission in pairs:
            emissions[state].append(emission)
        return {state: probabilities_dict(emissions) for state, emissions in emissions.iteritems()}
    def probabilities_dict(values):
        """ Helper function that generates a dictionary of probabilities from a list,
        e.g. ['a', 'a', 'b', 'c'] -> {'a': 0.5, 'b': 0.25, 'c': 0.25}
        """
        counts = collections.defaultdict(int)
        for value in values:
            counts[value] += 1
        return {key: count * 1.0 / len(values) for key, count in counts.iteritems()}
    words, tags = words_and_tags_from_file(filename)
    hidden_markov = markov_dict(tags)
    emissions = emission_probabilities(zip(tags, words))
    return hidden_markov, emissions
def hmm_viterbi(sentence, hidden_markov, emissions):
    """ Returns a list of states generated by the Viterbi algorithm. The list is the most
    probable sequence of HMM states (POS tags) for the sentence (emissions).
    """
    a = [{'': (1.0, None)}]  # mapping a state to its most probable predecessor
    for k, word in enumerate(sentence):
        a.append(collections.defaultdict(float))
        for state in hidden_markov.keys():
            paths = []
            for previous_state in a[k]:
                transition_probability = hidden_markov[previous_state].get(state) or 0.0
                emission_probability = emissions[state].get(word) or 0.0001
                # handle unknown words: set emission probability to 0.0001 for all states.
                paths.append((a[k][previous_state][0] * transition_probability * emission_probability, previous_state))
            max_path = sorted(paths, key=lambda x: x[0], reverse=True)[0]
            a[k + 1][state] = max_path
            # only save the probability and predecessor of the most probable path

    # backtracking the path: inserting the predecessor at the beginning of the tag list.
    # assumes every sentence ends with sentence-final punctuation tagged '$.'.
    end = '$.'
    tags = [end]
    k = len(sentence)
    while k > 1:
        predecessor = a[k][tags[0]][1]
        tags.insert(0, predecessor)
        k -= 1
    return tags
def print_pos_tags(pairs):
    """ Helper function that prints words and their tags separated by a backslash
    as required by this exercise.
    """
    output = ''
    for word, tag in pairs:
        output += word + '\\' + tag + ' '
    print output
if __name__ == '__main__':
    parser = argparse.ArgumentParser(add_help=False, description='example: `python tagger.py Dell und Cisco zeigten sich hingegen davon relativ unbeeindruckt .`')
    # no nargs=1 here so that args.filename is always a plain string, whether the
    # flag is given or the default is used.
    parser.add_argument('--filename', '-f', help='The training file. Default: hdt-1-10000-train.tags', default='hdt-1-10000-train.tags')
    parser.add_argument('sentence', nargs='*', help='The sentence to be tagged.')
    args = parser.parse_args()

    if not args.sentence:
        parser.print_help()
        sys.exit(0)

    hidden_markov, emissions = train_hmm(args.filename)
    viterbi_tags = hmm_viterbi(args.sentence, hidden_markov, emissions)
    print_pos_tags(zip(args.sentence, viterbi_tags))
    sys.exit(0)
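Besides the command-line use shown in the argparse description, the functions can be called directly. A minimal sketch (not part of the original gist), assuming the script is saved as tagger.py as its own help text suggests and the default training file is available:

# Sketch: tag the example sentence from the argparse help text.
# Assumes this file is saved as tagger.py and hdt-1-10000-train.tags exists.
from tagger import train_hmm, hmm_viterbi, print_pos_tags

hidden_markov, emissions = train_hmm('hdt-1-10000-train.tags')
sentence = 'Dell und Cisco zeigten sich hingegen davon relativ unbeeindruckt .'.split()
viterbi_tags = hmm_viterbi(sentence, hidden_markov, emissions)
print_pos_tags(zip(sentence, viterbi_tags))  # prints word\tag pairs on one line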
Part of a homework solution.
Training file: hdt-tagged.tar.xz
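The training data is read by words_and_tags_from_file, which expects one word and its tag per line separated by a tab, with a blank line between sentences. A purely illustrative excerpt (STTS-style tags, not copied from the archive) might look like:

Dell	NE
und	KON
Cisco	NE
.	$.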