shhshn/bleu.py

## bleu.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# A BLEU calculator by Sho Hoshino (hoshino@nii.ac.jp)
# This script outputs BLEU-4 that should be identical to multi-bleu.perl,
# by sentence-level or document-level
#
# Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu
# BLEU: a method for automatic evaluation of machine translation, ACL 2002
#
# 2013/11/18 Added citation
# 2013/10/03 Initial Release

import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import math

def main():
    """Command-line entry point: score hypotheses on stdin against a reference file.

    Usage: ``bleu.py ref [1] < hyp``. ``sys.argv[1]`` names the reference
    file, which is read in lock-step with ``sys.stdin``, one sentence per
    line. Pairing stops at the end of the shorter input. When
    ``sys.argv[2]`` is ``"1"`` one score is printed per sentence pair;
    otherwise a single document-level summary is printed after all pairs
    have been read.

    Any error raised while reading or scoring is reported on stderr and
    ends the run without printing a document-level summary.
    """
    if len(sys.argv) < 2:
        print("Usage: %s ref [1:sentence-level BLEU] <hyp" % sys.argv[0])
        return
    sentencelv = len(sys.argv) >= 3 and sys.argv[2] == "1"
    m = 4  # default: BLEU-4
    # One independent [matched, total] pair per n-gram order.
    tb = [[0, 0] for _ in range(m)]
    scores = [0] * m
    ref_len = 0
    hyp_len = 0
    try:
        # The context manager closes the file on every path, including the
        # one where open() itself fails and there is nothing to close.
        with open(sys.argv[1]) as ref_file:
            for ref, hyp in zip(ref_file, sys.stdin):
                if sentencelv:
                    # Give every sentence its own zeroed accumulators so
                    # counts can never carry over from earlier sentences.
                    _, sent_scores, sent_ref_len, sent_hyp_len = iteration(
                        [[0, 0] for _ in range(m)], [0] * m, 0, 0, ref, hyp)
                    result(False, sent_scores, sent_ref_len, sent_hyp_len)
                    continue
                tb, scores, ref_len, hyp_len = iteration(
                    tb, scores, ref_len, hyp_len, ref, hyp)
    except Exception as e:
        # Top-level boundary of the script: report the problem and stop.
        sys.stderr.write("%s\n" % e)
        return
    if not sentencelv:
        result(True, scores, ref_len, hyp_len)

def iteration(tb, scores, ref_len, hyp_len, ref, hyp):
    """Fold one reference/hypothesis sentence pair into running BLEU statistics.

    Args:
        tb: per-order ``[matched, total]`` n-gram counts accumulated so
            far; ``tb[n]`` belongs to (n + 1)-grams.
        scores: per-order precisions in percent. Only its length (the
            highest n-gram order) is read; every entry is recomputed.
        ref_len: number of reference tokens accumulated so far.
        hyp_len: number of hypothesis tokens accumulated so far.
        ref: one reference line, as text or UTF-8 encoded bytes.
        hyp: one hypothesis line, as text or UTF-8 encoded bytes.

    Returns:
        A tuple ``(tb, scores, ref_len, hyp_len)`` with the updated
        statistics. ``tb`` and ``scores`` are new lists; the lists passed
        in are left untouched, so a caller that ignores the return value
        does not silently accumulate counts across calls.

    Raises:
        UnicodeDecodeError: if a bytes line is not valid UTF-8.
    """
    from collections import Counter

    m = len(scores)
    token_lists = []
    for line in (ref, hyp):
        # Decode explicitly instead of relying on the process-wide default
        # encoding; text input is used as is.
        if isinstance(line, bytes):
            line = line.decode("utf-8")
        token_lists.append(line.rstrip().split())
    ref_tokens, hyp_tokens = token_lists
    ref_len += len(ref_tokens)
    hyp_len += len(hyp_tokens)

    new_tb = [list(pair) for pair in tb]
    new_scores = []
    for n in range(m):
        hyp_counts = Counter(ngrams(hyp_tokens, n + 1))
        ref_counts = Counter(ngrams(ref_tokens, n + 1))
        # Clipped matches: each hypothesis n-gram is credited at most as
        # often as it occurs in the reference (multiset intersection).
        matched = sum((hyp_counts & ref_counts).values())
        total = sum(hyp_counts.values())
        new_tb[n] = [new_tb[n][0] + matched, new_tb[n][1] + total]
        if new_tb[n][1] > 0:
            new_scores.append(100.0 * new_tb[n][0] / new_tb[n][1])
        else:
            new_scores.append(0)
    return new_tb, new_scores, ref_len, hyp_len

def result(documentlv, scores, ref_len, hyp_len):
    """Print the BLEU score derived from n-gram precisions and lengths.

    BLEU is the geometric mean of the precisions in ``scores`` multiplied
    by the brevity penalty ``exp(1 - ref_len / hyp_len)``, which applies
    only when the hypothesis is shorter than the reference. If any
    precision is not positive the score is 0.

    Args:
        documentlv: when exactly ``True`` a full summary line is printed
            (score, per-order precisions, brevity penalty, length ratio
            and both lengths); any other value prints only the score.
        scores: n-gram precisions in percent, one entry per order.
        ref_len: total number of reference tokens.
        hyp_len: total number of hypothesis tokens.
    """
    m = len(scores)
    if m == 0 or min(scores) <= 0:
        bleu = 0.0
    else:
        bleu = math.exp(sum(math.log(p) for p in scores) / m)
    if hyp_len != 0 and hyp_len < ref_len:
        bp = math.exp(1 - (float(ref_len) / hyp_len))
    else:
        bp = 1.0
    if documentlv is not True:
        print("%.2f" % (bleu * bp))
        return
    # An empty reference (e.g. no input at all) must not crash the report.
    ratio = float(hyp_len) / ref_len if ref_len else 0.0
    precisions = "/".join("%.1f" % p for p in scores)
    # Emit the summary as one complete, newline-terminated line.
    summary = "BLEU = %.2f, %s (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)" % (
        bleu * bp, precisions, bp, ratio, hyp_len, ref_len)
    print(summary)

def ngrams(input_list, n):
    """Return the contiguous n-grams of input_list as tuples, in order.

    The value is whatever the built-in ``zip`` produces for the shifted
    copies of the sequence. Sequences shorter than ``n`` yield no
    n-grams, and ``n == 0`` yields none either.
    """
    # Column k is the sequence shifted left by k items. Zipping the
    # columns lines up every window of n neighbours and stops at the
    # shortest column, which drops the incomplete trailing windows.
    shifted = []
    for offset in range(n):
        shifted.append(input_list[offset:])
    return zip(*shifted)

# Run the BLEU computation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-
	#
	# A BLEU calculator by Sho Hoshino (hoshino@nii.ac.jp)
	# This script outputs BLEU-4 that should be identical to multi-bleu.perl,
	# by sentence-level or document-level
	#
	# Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu
	# BLEU: a method for automatic evaluation of machine translation, ACL 2002
	#
	# 2013/11/18 Added citation
	# 2013/10/03 Initial Release

	import sys
	reload(sys)
	sys.setdefaultencoding('utf-8')
	import math

	def main():
		"""Command-line entry point: score hypotheses on stdin against a reference file.

		Usage: ``bleu.py ref [1] < hyp``. ``sys.argv[1]`` names the reference
		file, which is read in lock-step with ``sys.stdin``, one sentence per
		line. Pairing stops at the end of the shorter input. When
		``sys.argv[2]`` is ``"1"`` one score is printed per sentence pair;
		otherwise a single document-level summary is printed after all pairs
		have been read.

		Any error raised while reading or scoring is reported on stderr and
		ends the run without printing a document-level summary.
		"""
		if len(sys.argv) < 2:
			print("Usage: %s ref [1:sentence-level BLEU] <hyp" % sys.argv[0])
			return
		sentencelv = len(sys.argv) >= 3 and sys.argv[2] == "1"
		m = 4  # default: BLEU-4
		# One independent [matched, total] pair per n-gram order.
		tb = [[0, 0] for _ in range(m)]
		scores = [0] * m
		ref_len = 0
		hyp_len = 0
		try:
			# The context manager closes the file on every path, including the
			# one where open() itself fails and there is nothing to close.
			with open(sys.argv[1]) as ref_file:
				for ref, hyp in zip(ref_file, sys.stdin):
					if sentencelv:
						# Give every sentence its own zeroed accumulators so
						# counts can never carry over from earlier sentences.
						_, sent_scores, sent_ref_len, sent_hyp_len = iteration(
							[[0, 0] for _ in range(m)], [0] * m, 0, 0, ref, hyp)
						result(False, sent_scores, sent_ref_len, sent_hyp_len)
						continue
					tb, scores, ref_len, hyp_len = iteration(
						tb, scores, ref_len, hyp_len, ref, hyp)
		except Exception as e:
			# Top-level boundary of the script: report the problem and stop.
			sys.stderr.write("%s\n" % e)
			return
		if not sentencelv:
			result(True, scores, ref_len, hyp_len)

	def iteration(tb, scores, ref_len, hyp_len, ref, hyp):
		"""Fold one reference/hypothesis sentence pair into running BLEU statistics.

		Args:
			tb: per-order ``[matched, total]`` n-gram counts accumulated so
				far; ``tb[n]`` belongs to (n + 1)-grams.
			scores: per-order precisions in percent. Only its length (the
				highest n-gram order) is read; every entry is recomputed.
			ref_len: number of reference tokens accumulated so far.
			hyp_len: number of hypothesis tokens accumulated so far.
			ref: one reference line, as text or UTF-8 encoded bytes.
			hyp: one hypothesis line, as text or UTF-8 encoded bytes.

		Returns:
			A tuple ``(tb, scores, ref_len, hyp_len)`` with the updated
			statistics. ``tb`` and ``scores`` are new lists; the lists passed
			in are left untouched, so a caller that ignores the return value
			does not silently accumulate counts across calls.

		Raises:
			UnicodeDecodeError: if a bytes line is not valid UTF-8.
		"""
		from collections import Counter

		m = len(scores)
		token_lists = []
		for line in (ref, hyp):
			# Decode explicitly instead of relying on the process-wide default
			# encoding; text input is used as is.
			if isinstance(line, bytes):
				line = line.decode("utf-8")
			token_lists.append(line.rstrip().split())
		ref_tokens, hyp_tokens = token_lists
		ref_len += len(ref_tokens)
		hyp_len += len(hyp_tokens)

		new_tb = [list(pair) for pair in tb]
		new_scores = []
		for n in range(m):
			hyp_counts = Counter(ngrams(hyp_tokens, n + 1))
			ref_counts = Counter(ngrams(ref_tokens, n + 1))
			# Clipped matches: each hypothesis n-gram is credited at most as
			# often as it occurs in the reference (multiset intersection).
			matched = sum((hyp_counts & ref_counts).values())
			total = sum(hyp_counts.values())
			new_tb[n] = [new_tb[n][0] + matched, new_tb[n][1] + total]
			if new_tb[n][1] > 0:
				new_scores.append(100.0 * new_tb[n][0] / new_tb[n][1])
			else:
				new_scores.append(0)
		return new_tb, new_scores, ref_len, hyp_len

	def result(documentlv, scores, ref_len, hyp_len):
		"""Print the BLEU score derived from n-gram precisions and lengths.

		BLEU is the geometric mean of the precisions in ``scores`` multiplied
		by the brevity penalty ``exp(1 - ref_len / hyp_len)``, which applies
		only when the hypothesis is shorter than the reference. If any
		precision is not positive the score is 0.

		Args:
			documentlv: when exactly ``True`` a full summary line is printed
				(score, per-order precisions, brevity penalty, length ratio
				and both lengths); any other value prints only the score.
			scores: n-gram precisions in percent, one entry per order.
			ref_len: total number of reference tokens.
			hyp_len: total number of hypothesis tokens.
		"""
		m = len(scores)
		if m == 0 or min(scores) <= 0:
			bleu = 0.0
		else:
			bleu = math.exp(sum(math.log(p) for p in scores) / m)
		if hyp_len != 0 and hyp_len < ref_len:
			bp = math.exp(1 - (float(ref_len) / hyp_len))
		else:
			bp = 1.0
		if documentlv is not True:
			print("%.2f" % (bleu * bp))
			return
		# An empty reference (e.g. no input at all) must not crash the report.
		ratio = float(hyp_len) / ref_len if ref_len else 0.0
		precisions = "/".join("%.1f" % p for p in scores)
		# Emit the summary as one complete, newline-terminated line.
		summary = "BLEU = %.2f, %s (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)" % (
			bleu * bp, precisions, bp, ratio, hyp_len, ref_len)
		print(summary)

	def ngrams(input_list, n):
		"""Return the contiguous n-grams of input_list as tuples, in order.

		The value is whatever the built-in ``zip`` produces for the shifted
		copies of the sequence. Sequences shorter than ``n`` yield no
		n-grams, and ``n == 0`` yields none either.
		"""
		# Column k is the sequence shifted left by k items. Zipping the
		# columns lines up every window of n neighbours and stops at the
		# shortest column, which drops the incomplete trailing windows.
		return zip(*[input_list[i:] for i in range(n)])

	# Run the BLEU computation only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
		main()