shhshn/ibmmodel1.py

## ibmmodel1.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# An IBM Model 1 implementation by Sho Hoshino (hoshino@nii.ac.jp)
#
# Peter F Brown, Stephen A Della Pietra, Vincent J Della Pietra, Robert L Mercer
# The mathematics of statistical machine translation: parameter estimation,
# Computational Linguistics 19(2):263-311
#
# 2013/11/26 Fixed not to use collections for speed up
# 2013/11/18 Added citation
# 2013/11/13 Initial Release

import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import itertools

def main():
	"""Train IBM Model 1 on a small built-in corpus and print the resulting table.

	The corpus is a flat list that alternates the two sides of each sentence
	pair: an f-side sentence followed by its e-side sentence, tokens separated
	by single spaces. The table is created by ibm1_init, refined by 100 calls
	to ibm1_step, and then printed one entry per line as
	"<f word> <e word> <value>" in sorted key order.
	"""
	# Alternative toy corpora, kept for experimentation:
	#corpus = ["machine translation", u"機械 翻訳", "translation", u"翻訳"]
	#corpus = ["das Haus", "the house", "das Buch", "the book", "ein Buch", "a book"]
	corpus = ["I am a man", u"僕 は 男 です", "I am a girl", u"私 は 女 です", "I am a teacher", u"私 は 先生 です", "She is a teacher", u"彼女 は 先生 です", "He is a teacher", u"彼 は 先生 です"]
	iterations = 100
	t = {}
	# Each pass needs a fresh iterator because the helpers consume it.
	ibm1_init(iter(corpus), t)
	for _ in range(iterations):
		ibm1_step(iter(corpus), t)
	for ff, ee in sorted(t):
		# A single pre-formatted argument prints identically whether print
		# is the Python 2 statement or the Python 3 function.
		print("%s %s %s" % (ff, ee, t[(ff, ee)]))

def ibm1_init(i, t):
	"""Seed the table t with one uniform value for every co-occurring word pair.

	i -- iterator yielding sentences alternately: an f-side sentence followed
	     by its e-side sentence, each a string of tokens separated by single
	     spaces.
	t -- dict keyed by (f_word, e_word); modified in place.

	Every f-side word is paired with each word of its e-side sentence and with
	the extra token "NULL", and that entry is set to 1.0. Afterwards every
	entry of t is divided by the number of distinct e-side words seen ("NULL"
	is not counted). Returns None.
	"""
	e_vocab = set()
	for f_sentence in i:
		f_words = f_sentence.split(" ")
		# The e-side sentence is the next item of the same iterator.
		e_words = next(i).split(" ")
		e_vocab.update(e_words)
		for pair in itertools.product(f_words, e_words + ["NULL"]):
			t[pair] = 1.0
	vocab_size = len(e_vocab)
	for pair in t:
		t[pair] /= vocab_size

def ibm1_step(i, t):
	"""Run one expectation-maximisation pass over the corpus, rewriting t in place.

	i -- iterator yielding sentences alternately: an f-side sentence followed
	     by its e-side sentence, each a string of tokens separated by single
	     spaces.
	t -- dict mapping (f_word, e_word) to a number. It must already hold every
	     pair that co-occurs in the corpus (including pairs with the extra
	     e-side token "NULL"), and every key of t must occur in the corpus;
	     otherwise KeyError is raised.

	Expectation: for each f-side token, its current t values over the tokens
	of the paired e-side sentence (plus "NULL") are scaled to sum to one and
	added to per-pair counts.
	Maximisation: each entry of t becomes its count divided by the total count
	of all pairs in t that share the same e-side word.

	Returns None.
	"""
	counts = {}
	for f_sentence in i:
		f_words = f_sentence.split(" ")
		e_words = next(i).split(" ") + ["NULL"]
		for ff in f_words:
			# The normaliser depends only on ff, so it is computed once per
			# f-side token rather than once per (ff, ee) pair.
			norm = sum(t[(ff, ee)] for ee in e_words)
			for ee in e_words:
				pair = (ff, ee)
				counts[pair] = counts.get(pair, 0) + t[pair] / norm
	# Accumulate the per-e-word totals in a single pass over t instead of
	# rescanning the whole table for every entry.
	totals = {}
	for ff, ee in t:
		totals[ee] = totals.get(ee, 0) + counts[(ff, ee)]
	for ff, ee in t:
		t[(ff, ee)] = counts[(ff, ee)] / totals[ee]

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	# An IBM Model 1 implementation by Sho Hoshino (hoshino@nii.ac.jp)
	#
	# Peter F Brown, Stephen A Della Pietra, Vincent J Della Pietra, Robert L Mercer
	# The mathematics of statistical machine translation: parameter estimation,
	# Computational Linguistics 19(2):263-311
	#
	# 2013/11/26 Fixed not to use collections for speed up
	# 2013/11/18 Added citation
	# 2013/11/13 Initial Release

	import sys
	reload(sys)
	sys.setdefaultencoding('utf-8')
	import itertools

	def main():
		"""Train IBM Model 1 on a small built-in corpus and print the resulting table.

		The corpus is a flat list that alternates the two sides of each
		sentence pair: an f-side sentence followed by its e-side sentence,
		tokens separated by single spaces. The table is created by ibm1_init,
		refined by 100 calls to ibm1_step, and then printed one entry per line
		as "<f word> <e word> <value>" in sorted key order.
		"""
		# Alternative toy corpora, kept for experimentation:
		#corpus = ["machine translation", u"機械 翻訳", "translation", u"翻訳"]
		#corpus = ["das Haus", "the house", "das Buch", "the book", "ein Buch", "a book"]
		# The Japanese sentences are space-separated into words because the
		# helpers tokenise with split(" ").
		corpus = ["I am a man", u"僕 は 男 です", "I am a girl", u"私 は 女 です", "I am a teacher", u"私 は 先生 です", "She is a teacher", u"彼女 は 先生 です", "He is a teacher", u"彼 は 先生 です"]
		iterations = 100
		t = {}
		# Each pass needs a fresh iterator because the helpers consume it.
		ibm1_init(iter(corpus), t)
		for _ in range(iterations):
			ibm1_step(iter(corpus), t)
		for ff, ee in sorted(t):
			# A single pre-formatted argument prints identically whether
			# print is the Python 2 statement or the Python 3 function.
			print("%s %s %s" % (ff, ee, t[(ff, ee)]))

	def ibm1_init(i, t):
		"""Seed the table t with one uniform value for every co-occurring word pair.

		i -- iterator yielding sentences alternately: an f-side sentence
		     followed by its e-side sentence, each a string of tokens
		     separated by single spaces.
		t -- dict keyed by (f_word, e_word); modified in place.

		Every f-side word is paired with each word of its e-side sentence and
		with the extra token "NULL", and that entry is set to 1.0. Afterwards
		every entry of t is divided by the number of distinct e-side words
		seen ("NULL" is not counted). Returns None.
		"""
		e_vocab = set()
		for f_sentence in i:
			f_words = f_sentence.split(" ")
			# The e-side sentence is the next item of the same iterator.
			e_words = next(i).split(" ")
			e_vocab.update(e_words)
			for pair in itertools.product(f_words, e_words + ["NULL"]):
				t[pair] = 1.0
		vocab_size = len(e_vocab)
		for pair in t:
			t[pair] /= vocab_size

	def ibm1_step(i, t):
		"""Run one expectation-maximisation pass over the corpus, rewriting t in place.

		i -- iterator yielding sentences alternately: an f-side sentence
		     followed by its e-side sentence, each a string of tokens
		     separated by single spaces.
		t -- dict mapping (f_word, e_word) to a number. It must already hold
		     every pair that co-occurs in the corpus (including pairs with the
		     extra e-side token "NULL"), and every key of t must occur in the
		     corpus; otherwise KeyError is raised.

		Expectation: for each f-side token, its current t values over the
		tokens of the paired e-side sentence (plus "NULL") are scaled to sum
		to one and added to per-pair counts.
		Maximisation: each entry of t becomes its count divided by the total
		count of all pairs in t that share the same e-side word.

		Returns None.
		"""
		counts = {}
		for f_sentence in i:
			f_words = f_sentence.split(" ")
			e_words = next(i).split(" ") + ["NULL"]
			for ff in f_words:
				# The normaliser depends only on ff, so it is computed once
				# per f-side token rather than once per (ff, ee) pair.
				norm = sum(t[(ff, ee)] for ee in e_words)
				for ee in e_words:
					pair = (ff, ee)
					counts[pair] = counts.get(pair, 0) + t[pair] / norm
		# Accumulate the per-e-word totals in a single pass over t instead of
		# rescanning the whole table for every entry.
		totals = {}
		for ff, ee in t:
			totals[ee] = totals.get(ee, 0) + counts[(ff, ee)]
		for ff, ee in t:
			t[(ff, ee)] = counts[(ff, ee)] / totals[ee]

	# Run the demo only when this file is executed as a script, not when imported.
	if __name__ == "__main__":
		main()