Created
July 31, 2022 04:53
-
-
Save shhshn/696fbf1bc163e923ae4eb6e33a89d2ac to your computer and use it in GitHub Desktop.
An implementation of IBM model 1 [Brown et al. 1993]: naive and unfinished
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# An IBM Model 1 implementation by Sho Hoshino (hoshino@nii.ac.jp)
#
# Peter F Brown, Stephen A Della Pietra, Vincent J Della Pietra, Robert L Mercer
# The mathematics of statistical machine translation: parameter estimation,
# Computational Linguistics 19(2):263-311
#
# 2013/11/26 Fixed not to use collections for speed up
# 2013/11/18 Added citation
# 2013/11/13 Initial Release
import itertools
import sys

# Python 2 only: force the implicit str<->unicode conversion codec to UTF-8 so
# that the mixed byte-string / unicode corpus can be compared and printed.
# `reload` is not a builtin on Python 3 (and the default encoding there is
# already UTF-8), so the hack is skipped on newer interpreters.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding('utf-8')
def main(iterations=100):
    """Train the model on a small built-in parallel corpus and print the table.

    The corpus is a flat list of sentences in alternating order (one sentence,
    then its translation).  The table is initialised with ``ibm1_init``,
    refined with ``iterations`` calls to ``ibm1_step`` and finally printed, one
    ``word word probability`` line per table entry, in sorted key order.

    iterations -- number of EM steps to run (default 100).
    """
    # Alternative toy corpora:
    #corpus = ["machine translation", u"機械 翻訳", "translation", u"翻訳"]
    #corpus = ["das Haus", "the house", "das Buch", "the book", "ein Buch", "a book"]
    corpus = ["I am a man", u"僕 は 男 です", "I am a girl", u"私 は 女 です", "I am a teacher", u"私 は 先生 です", "She is a teacher", u"彼女 は 先生 です", "He is a teacher", u"彼 は 先生 です"]
    t = {}
    ibm1_init(iter(corpus), t)
    for _ in range(iterations):
        ibm1_step(iter(corpus), t)
    for ff, ee in sorted(t):
        # A single pre-formatted string prints identically under the Python 2
        # print statement and the Python 3 print function.
        print("%s %s %s" % (ff, ee, t[(ff, ee)]))
def ibm1_init(i, t):
    """Fill the table ``t`` with a uniform starting value for every word pair.

    i -- iterable of sentences in alternating order: a sentence, then the
         sentence paired with it.  Sentences are split on single spaces.
    t -- dict updated in place.  Every ``(ff, ee)`` key, where ``ff`` is a word
         of the first sentence of a pair and ``ee`` is a word of the second
         sentence or the extra token ``"NULL"``, is set to ``1.0`` and then
         every entry of ``t`` is divided by the number of distinct words seen
         in the second sentences.

    Raises ValueError if the corpus has an odd number of sentences (``t`` may
    already hold the entries of the preceding complete pairs at that point).
    """
    sentences = iter(i)  # accept any iterable, not only an iterator
    wordlist = set()
    for f in sentences:
        e = next(sentences, None)
        if e is None:
            # Previously a bare StopIteration escaped from next() here.
            raise ValueError("corpus must contain an even number of sentences")
        f = f.split(" ")
        e = e.split(" ")
        wordlist.update(e)
        for pair in itertools.product(f, e + ["NULL"]):
            t[pair] = 1.0
    # NOTE(review): the constant is 1/|second-side vocabulary| ("NULL" not
    # counted).  Any uniform constant gives the same result after the first
    # ibm1_step, so the original value is kept unchanged.
    size = len(wordlist)
    for pair in t:
        t[pair] /= size
def ibm1_step(i, t):
    """Run one EM iteration over the corpus and update ``t`` in place.

    i -- iterable of sentences in alternating order: a sentence, then the
         sentence paired with it.  Sentences are split on single spaces.
    t -- dict mapping ``(ff, ee)`` to a weight, where ``ee`` is a word of the
         second sentence of a pair or the extra token ``"NULL"``.

    For every word ``ff`` of a first sentence, its weight is shared among the
    candidates ``ee`` of the paired sentence in proportion to ``t[(ff, ee)]``
    and accumulated as a fractional count.  Afterwards each entry of ``t`` is
    replaced by its count divided by the summed counts of all pairs that share
    the same ``ee``.

    Raises ValueError if the corpus has an odd number of sentences, and
    KeyError if the corpus and ``t`` do not cover the same word pairs.
    """
    sentences = iter(i)  # accept any iterable, not only an iterator
    c = {}      # fractional count per (ff, ee) pair
    total = {}  # summed fractional counts per ee
    for f in sentences:
        e = next(sentences, None)
        if e is None:
            # Previously a bare StopIteration escaped from next() here.
            raise ValueError("corpus must contain an even number of sentences")
        f = f.split(" ")
        e = e.split(" ") + ["NULL"]
        for ff in f:
            # The normaliser depends only on ff, so compute it once per word
            # instead of once per (ff, ee) pair.
            norm = sum(t[(ff, ee)] for ee in e)
            for ee in e:
                diff = t[(ff, ee)] / norm
                c[(ff, ee)] = c.get((ff, ee), 0) + diff
                total[ee] = total.get(ee, 0) + diff
    # Per-ee totals were accumulated above, so this pass is linear in len(t)
    # rather than rescanning the whole table for every entry.
    for ff, ee in t:
        t[(ff, ee)] = c[(ff, ee)] / total[ee]
# Run the demo only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment