shhshn/ibmmodel2.py

## ibmmodel2.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# An IBM Model 2 implementation by Sho Hoshino (hoshino@nii.ac.jp)
#
# Peter F Brown, Stephen A Della Pietra, Vincent J Della Pietra, Robert L Mercer
# The mathematics of statistical machine translation: parameter estimation,
# Computational Linguistics 19(2):263-311
#
# 2013/11/26 Initial Release

import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import itertools, collections

def main():
	#input = ["machine translation", u"機械 翻訳", "translation", u"翻訳"]
	#input = ["das Haus", "the house", "das Buch", "the book", "ein Buch", "a book"]
	input = ["I am a man", u"僕 は 男 です", "I am a girl", u"私 は 女 です", "I am a teacher", u"私 は 先生 です", "She is a teacher", u"彼女 は 先生 です", "He is a teacher", u"彼 は 先生 です"]
	n = 100 # of iterations
	t = {}
	a = {}
	ibm2_init(input, t, a, n)
	for i in xrange(1, n+1):
		ibm2_step(iter(input), t, a)
	print a
	for ff, ee in sorted(t.keys()):
		print ff, ee, t[(ff,ee)]

def pos(l, e):
	if e not in l:
		return -1
	return l.index(e)

def ibm2_init(input, t, a, n):
	ibm1_init(iter(input), t)
	for i in xrange(1, n+1):
		ibm1_step(iter(input), t)
	i = iter(input)
	for f in i:
		f = f.split(" ")
		e = next(i).split(" ")
		for ff, ee in itertools.product(f, e + ["NULL"]):
			key = (pos(f, ff), pos(e, ee), len(e), len(f))
			a[key] = 1.0 / (len(f) + 1)

def ibm2_step(i, t, a):
	c = {}
	c_a = {}
	t_a = {}
	for f in i:
		f = f.split(" ")
		e = next(i).split(" ")
		for ff, ee in itertools.product(f, e + ["NULL"]):
			diff = t[(ff,ee)] * a[(pos(f, ff), pos(e, ee), len(e), len(f))] / sum(t[(ff,ee)] * a[(pos(f, ff), pos(e, ee), len(e), len(f))] for ee in e+["NULL"])
			if (ff,ee) not in c:
				c[(ff,ee)] = 0
			c[(ff,ee)] += diff
			key = (pos(f, ff), pos(e, ee), len(e), len(f))
			if key not in c_a:
				c_a[key] = 0
			c_a[key] += diff
			if key[1:] not in t_a:
				t_a[key[1:]] = 0
			t_a[key[1:]] += diff
	for ff, ee in t:
		t[(ff,ee)] = c[(ff,ee)] / sum(c[(x,y)] for x,y in t if y == ee)
	for key in a:
		a[key] = c_a[key] / t_a[key[1:]]

def ibm1_init(i, t):
	wordlist = set()
	for f in i:
		f = f.split(" ")
		e = next(i).split(" ")
		for ee in e:
			wordlist.add(ee)
		for ff, ee in itertools.product(f, e + ["NULL"]):
			t[(ff,ee)] = 1.0
	for pair in t:
		t[pair] /= len(wordlist)

def ibm1_step(i, t):
	c = {}
	for f in i:
		f = f.split(" ")
		e = next(i).split(" ")
		for ff, ee in itertools.product(f, e + ["NULL"]):
			diff = t[(ff,ee)] / sum(t[(ff,ee)] for ee in e+["NULL"])
			if (ff,ee) not in c:
				c[(ff,ee)] = 0
			c[(ff,ee)] += diff
	for ff, ee in t:
		t[(ff,ee)] = c[(ff,ee)] / sum(c[(x,y)] for x,y in t if y == ee)

if __name__ == "__main__":
    main()
	#!/usr/bin/env python
	# -- coding: utf-8 --

	# An IBM Model 2 implementation by Sho Hoshino (hoshino@nii.ac.jp)
	#
	# Peter F Brown, Stephen A Della Pietra, Vincent J Della Pietra, Robert L Mercer
	# The mathematics of statistical machine translation: parameter estimation,
	# Computational Linguistics 19(2):263-311
	#
	# 2013/11/26 Initial Release

	import sys
	reload(sys)
	sys.setdefaultencoding('utf-8')
	import itertools, collections

	def main():
	#input = ["machine translation", u"機械翻訳", "translation", u"翻訳"]
	#input = ["das Haus", "the house", "das Buch", "the book", "ein Buch", "a book"]
	input = ["I am a man", u"僕は男です", "I am a girl", u"私は女です", "I am a teacher", u"私は先生です", "She is a teacher", u"彼女は先生です", "He is a teacher", u"彼は先生です"]
	n = 100 # of iterations
	t = {}
	a = {}
	ibm2_init(input, t, a, n)
	for i in xrange(1, n+1):
	ibm2_step(iter(input), t, a)
	print a
	for ff, ee in sorted(t.keys()):
	print ff, ee, t[(ff,ee)]

	def pos(l, e):
	if e not in l:
	return -1
	return l.index(e)

	def ibm2_init(input, t, a, n):
	ibm1_init(iter(input), t)
	for i in xrange(1, n+1):
	ibm1_step(iter(input), t)
	i = iter(input)
	for f in i:
	f = f.split(" ")
	e = next(i).split(" ")
	for ff, ee in itertools.product(f, e + ["NULL"]):
	key = (pos(f, ff), pos(e, ee), len(e), len(f))
	a[key] = 1.0 / (len(f) + 1)

	def ibm2_step(i, t, a):
	c = {}
	c_a = {}
	t_a = {}
	for f in i:
	f = f.split(" ")
	e = next(i).split(" ")
	for ff, ee in itertools.product(f, e + ["NULL"]):
	diff = t[(ff,ee)] * a[(pos(f, ff), pos(e, ee), len(e), len(f))] / sum(t[(ff,ee)] * a[(pos(f, ff), pos(e, ee), len(e), len(f))] for ee in e+["NULL"])
	if (ff,ee) not in c:
	c[(ff,ee)] = 0
	c[(ff,ee)] += diff
	key = (pos(f, ff), pos(e, ee), len(e), len(f))
	if key not in c_a:
	c_a[key] = 0
	c_a[key] += diff
	if key[1:] not in t_a:
	t_a[key[1:]] = 0
	t_a[key[1:]] += diff
	for ff, ee in t:
	t[(ff,ee)] = c[(ff,ee)] / sum(c[(x,y)] for x,y in t if y == ee)
	for key in a:
	a[key] = c_a[key] / t_a[key[1:]]

	def ibm1_init(i, t):
	wordlist = set()
	for f in i:
	f = f.split(" ")
	e = next(i).split(" ")
	for ee in e:
	wordlist.add(ee)
	for ff, ee in itertools.product(f, e + ["NULL"]):
	t[(ff,ee)] = 1.0
	for pair in t:
	t[pair] /= len(wordlist)

	def ibm1_step(i, t):
	c = {}
	for f in i:
	f = f.split(" ")
	e = next(i).split(" ")
	for ff, ee in itertools.product(f, e + ["NULL"]):
	diff = t[(ff,ee)] / sum(t[(ff,ee)] for ee in e+["NULL"])
	if (ff,ee) not in c:
	c[(ff,ee)] = 0
	c[(ff,ee)] += diff
	for ff, ee in t:
	t[(ff,ee)] = c[(ff,ee)] / sum(c[(x,y)] for x,y in t if y == ee)

	if __name__ == "__main__":
	main()