Skip to content

Instantly share code, notes, and snippets.

@shhshn
Created July 31, 2022 04:53
Show Gist options
  • Save shhshn/696fbf1bc163e923ae4eb6e33a89d2ac to your computer and use it in GitHub Desktop.
Save shhshn/696fbf1bc163e923ae4eb6e33a89d2ac to your computer and use it in GitHub Desktop.
An implementation of IBM model 1 [Brown et al. 1993]: naive and unfinished
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# An IBM Model 1 implementation by Sho Hoshino (hoshino@nii.ac.jp)
#
# Peter F Brown, Stephen A Della Pietra, Vincent J Della Pietra, Robert L Mercer
# The mathematics of statistical machine translation: parameter estimation,
# Computational Linguistics 19(2):263-311
#
# 2013/11/26 Stopped using the collections module, to speed things up
# 2013/11/18 Added citation
# 2013/11/13 Initial Release
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import itertools
def main():
    """Train on a small built-in parallel corpus and print the final table.

    The corpus is a flat list in which sentences alternate between the two
    languages. The table is initialised once, refined with 100 update steps,
    and then printed one entry per line in sorted key order.
    """
    # Alternative toy corpora, kept for experimentation:
    #input = ["machine translation", u"機械 翻訳", "translation", u"翻訳"]
    #input = ["das Haus", "the house", "das Buch", "the book", "ein Buch", "a book"]
    corpus = ["I am a man", u"僕 は 男 です", "I am a girl", u"私 は 女 です", "I am a teacher", u"私 は 先生 です", "She is a teacher", u"彼女 は 先生 です", "He is a teacher", u"彼 は 先生 です"]
    table = {}
    ibm1_init(iter(corpus), table)
    # The loop counter is never used; only the number of passes matters.
    for _ in range(100):
        ibm1_step(iter(corpus), table)
    for key in sorted(table):
        # One line per entry: first word, second word, value.
        print("%s %s %s" % (key[0], key[1], table[key]))
def ibm1_init(i, t):
    """Seed the table ``t`` with one uniform starting value per word pair.

    Args:
        i: iterable of sentences in alternating order: first the sentence
            whose words become the first element of each key, then the
            sentence whose words become the second element. Sentences are
            split on single spaces.
        t: dict mutated in place. Every ``(ff, ee)`` pair that co-occurs in
            a sentence pair is stored, where ``ee`` also ranges over the
            literal token ``"NULL"``.

    After all pairs are collected, every entry of ``t`` is divided by the
    number of distinct second-sentence words seen (``"NULL"`` is not
    counted). Nothing is returned.

    Raises:
        ValueError: if the input ends with a sentence that has no partner.
    """
    sentences = iter(i)  # accept any iterable, not only a ready-made iterator
    vocab = set()
    for first in sentences:
        try:
            second = next(sentences)
        except StopIteration:
            # Do not let a bare StopIteration escape: it is easy to mistake
            # for normal iterator exhaustion further up the call stack.
            raise ValueError("unpaired sentence at end of input: %r" % (first,))
        f_words = first.split(" ")
        e_words = second.split(" ")
        vocab.update(e_words)
        for pair in itertools.product(f_words, e_words + ["NULL"]):
            t[pair] = 1.0
    # Same divisor for every entry, so the table starts out uniform.
    scale = len(vocab)
    for pair in t:
        t[pair] /= scale
def ibm1_step(i, t):
    """Apply one re-estimation pass to the table ``t`` in place.

    Args:
        i: iterable of sentences in alternating order (first-language
            sentence, then its second-language partner), each split on
            single spaces. The token ``"NULL"`` is appended to every
            second-language sentence.
        t: dict keyed by ``(ff, ee)``. It must already hold an entry for
            every pair that co-occurs in the input.

    For each sentence pair, every ``(ff, ee)`` receives the fractional count
    ``t[(ff, ee)] / sum(t[(ff, e2)] for e2 in the sentence)``. Each entry of
    ``t`` is then replaced by its accumulated count divided by the total
    count of all entries sharing the same ``ee``. Nothing is returned.

    Raises:
        ValueError: if the input ends with a sentence that has no partner.
        KeyError: if a co-occurring pair is missing from ``t``, or ``t``
            holds a pair that never occurs in the input.
    """
    sentences = iter(i)  # accept any iterable, not only a ready-made iterator
    counts = {}
    for first in sentences:
        try:
            second = next(sentences)
        except StopIteration:
            raise ValueError("unpaired sentence at end of input: %r" % (first,))
        f_words = first.split(" ")
        e_words = second.split(" ") + ["NULL"]
        for ff in f_words:
            # The denominator depends only on ff, so compute it once per
            # word instead of once per (ff, ee) pair.
            denom = sum(t[(ff, ee)] for ee in e_words)
            for ee in e_words:
                key = (ff, ee)
                counts[key] = counts.get(key, 0) + t[key] / denom

    # Accumulate the per-ee totals in a single pass; rescanning the whole
    # table for every entry would make the normalisation quadratic.
    totals = {}
    for key in t:
        ee = key[1]
        totals[ee] = totals.get(ee, 0) + counts[key]
    for key in t:
        t[key] = counts[key] / totals[key[1]]
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment