Skip to content

Instantly share code, notes, and snippets.

@shhshn
Created July 31, 2022 04:55
Show Gist options
  • Save shhshn/cdfe74ba9aee00bbb5ac2fb8abc70f20 to your computer and use it in GitHub Desktop.
Save shhshn/cdfe74ba9aee00bbb5ac2fb8abc70f20 to your computer and use it in GitHub Desktop.
An implementation of IBM model 2 [Brown et al. 1993]: but somewhat incorrect and unfinished
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# An IBM Model 2 implementation by Sho Hoshino (hoshino@nii.ac.jp)
#
# Peter F Brown, Stephen A Della Pietra, Vincent J Della Pietra, Robert L Mercer
# The mathematics of statistical machine translation: parameter estimation,
# Computational Linguistics 19(2):263-311
#
# 2013/11/26 Initial Release
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import itertools, collections
def main():
#input = ["machine translation", u"機械 翻訳", "translation", u"翻訳"]
#input = ["das Haus", "the house", "das Buch", "the book", "ein Buch", "a book"]
input = ["I am a man", u"僕 は 男 です", "I am a girl", u"私 は 女 です", "I am a teacher", u"私 は 先生 です", "She is a teacher", u"彼女 は 先生 です", "He is a teacher", u"彼 は 先生 です"]
n = 100 # of iterations
t = {}
a = {}
ibm2_init(input, t, a, n)
for i in xrange(1, n+1):
ibm2_step(iter(input), t, a)
print a
for ff, ee in sorted(t.keys()):
print ff, ee, t[(ff,ee)]
def pos(l, e):
if e not in l:
return -1
return l.index(e)
def ibm2_init(input, t, a, n):
ibm1_init(iter(input), t)
for i in xrange(1, n+1):
ibm1_step(iter(input), t)
i = iter(input)
for f in i:
f = f.split(" ")
e = next(i).split(" ")
for ff, ee in itertools.product(f, e + ["NULL"]):
key = (pos(f, ff), pos(e, ee), len(e), len(f))
a[key] = 1.0 / (len(f) + 1)
def ibm2_step(i, t, a):
c = {}
c_a = {}
t_a = {}
for f in i:
f = f.split(" ")
e = next(i).split(" ")
for ff, ee in itertools.product(f, e + ["NULL"]):
diff = t[(ff,ee)] * a[(pos(f, ff), pos(e, ee), len(e), len(f))] / sum(t[(ff,ee)] * a[(pos(f, ff), pos(e, ee), len(e), len(f))] for ee in e+["NULL"])
if (ff,ee) not in c:
c[(ff,ee)] = 0
c[(ff,ee)] += diff
key = (pos(f, ff), pos(e, ee), len(e), len(f))
if key not in c_a:
c_a[key] = 0
c_a[key] += diff
if key[1:] not in t_a:
t_a[key[1:]] = 0
t_a[key[1:]] += diff
for ff, ee in t:
t[(ff,ee)] = c[(ff,ee)] / sum(c[(x,y)] for x,y in t if y == ee)
for key in a:
a[key] = c_a[key] / t_a[key[1:]]
def ibm1_init(i, t):
wordlist = set()
for f in i:
f = f.split(" ")
e = next(i).split(" ")
for ee in e:
wordlist.add(ee)
for ff, ee in itertools.product(f, e + ["NULL"]):
t[(ff,ee)] = 1.0
for pair in t:
t[pair] /= len(wordlist)
def ibm1_step(i, t):
c = {}
for f in i:
f = f.split(" ")
e = next(i).split(" ")
for ff, ee in itertools.product(f, e + ["NULL"]):
diff = t[(ff,ee)] / sum(t[(ff,ee)] for ee in e+["NULL"])
if (ff,ee) not in c:
c[(ff,ee)] = 0
c[(ff,ee)] += diff
for ff, ee in t:
t[(ff,ee)] = c[(ff,ee)] / sum(c[(x,y)] for x,y in t if y == ee)
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment