Created
July 31, 2022 04:55
-
-
Save shhshn/cdfe74ba9aee00bbb5ac2fb8abc70f20 to your computer and use it in GitHub Desktop.
An implementation of IBM model 2 [Brown et al. 1993]: but somewhat incorrect and unfinished
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# An IBM Model 2 implementation by Sho Hoshino (hoshino@nii.ac.jp) | |
# | |
# Peter F Brown, Stephen A Della Pietra, Vincent J Della Pietra, Robert L Mercer | |
# The mathematics of statistical machine translation: parameter estimation, | |
# Computational Linguistics 19(2):263-311 | |
# | |
# 2013/11/26 Initial Release | |
import sys | |
reload(sys) | |
sys.setdefaultencoding('utf-8') | |
import itertools, collections | |
def main(): | |
#input = ["machine translation", u"機械 翻訳", "translation", u"翻訳"] | |
#input = ["das Haus", "the house", "das Buch", "the book", "ein Buch", "a book"] | |
input = ["I am a man", u"僕 は 男 です", "I am a girl", u"私 は 女 です", "I am a teacher", u"私 は 先生 です", "She is a teacher", u"彼女 は 先生 です", "He is a teacher", u"彼 は 先生 です"] | |
n = 100 # of iterations | |
t = {} | |
a = {} | |
ibm2_init(input, t, a, n) | |
for i in xrange(1, n+1): | |
ibm2_step(iter(input), t, a) | |
print a | |
for ff, ee in sorted(t.keys()): | |
print ff, ee, t[(ff,ee)] | |
def pos(l, e): | |
if e not in l: | |
return -1 | |
return l.index(e) | |
def ibm2_init(input, t, a, n): | |
ibm1_init(iter(input), t) | |
for i in xrange(1, n+1): | |
ibm1_step(iter(input), t) | |
i = iter(input) | |
for f in i: | |
f = f.split(" ") | |
e = next(i).split(" ") | |
for ff, ee in itertools.product(f, e + ["NULL"]): | |
key = (pos(f, ff), pos(e, ee), len(e), len(f)) | |
a[key] = 1.0 / (len(f) + 1) | |
def ibm2_step(i, t, a): | |
c = {} | |
c_a = {} | |
t_a = {} | |
for f in i: | |
f = f.split(" ") | |
e = next(i).split(" ") | |
for ff, ee in itertools.product(f, e + ["NULL"]): | |
diff = t[(ff,ee)] * a[(pos(f, ff), pos(e, ee), len(e), len(f))] / sum(t[(ff,ee)] * a[(pos(f, ff), pos(e, ee), len(e), len(f))] for ee in e+["NULL"]) | |
if (ff,ee) not in c: | |
c[(ff,ee)] = 0 | |
c[(ff,ee)] += diff | |
key = (pos(f, ff), pos(e, ee), len(e), len(f)) | |
if key not in c_a: | |
c_a[key] = 0 | |
c_a[key] += diff | |
if key[1:] not in t_a: | |
t_a[key[1:]] = 0 | |
t_a[key[1:]] += diff | |
for ff, ee in t: | |
t[(ff,ee)] = c[(ff,ee)] / sum(c[(x,y)] for x,y in t if y == ee) | |
for key in a: | |
a[key] = c_a[key] / t_a[key[1:]] | |
def ibm1_init(i, t): | |
wordlist = set() | |
for f in i: | |
f = f.split(" ") | |
e = next(i).split(" ") | |
for ee in e: | |
wordlist.add(ee) | |
for ff, ee in itertools.product(f, e + ["NULL"]): | |
t[(ff,ee)] = 1.0 | |
for pair in t: | |
t[pair] /= len(wordlist) | |
def ibm1_step(i, t): | |
c = {} | |
for f in i: | |
f = f.split(" ") | |
e = next(i).split(" ") | |
for ff, ee in itertools.product(f, e + ["NULL"]): | |
diff = t[(ff,ee)] / sum(t[(ff,ee)] for ee in e+["NULL"]) | |
if (ff,ee) not in c: | |
c[(ff,ee)] = 0 | |
c[(ff,ee)] += diff | |
for ff, ee in t: | |
t[(ff,ee)] = c[(ff,ee)] / sum(c[(x,y)] for x,y in t if y == ee) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment