mteval-v13a.pl Python3 port
Gist by @ozancaglayan, created July 5, 2017
#!/usr/bin/env python3
import re
import math
import argparse
from collections import defaultdict
LOG_2 = math.log(2)
###################################################################################
# This is an exact reimplementation of mteval-v13a.pl without requiring SGM.
#
# Example:
# $ mteval-v13a -t model.beam12.detok -r newstest2017-tren-ref.en.txt -c
#
# BLEU(v13) = 16.09, 53.5/23.4/11.7/6.2 (ratio=0.927, hyp_len=62777, ref_len=67703)
###################################################################################
# Only a single reference is supported
def score_segment(tst_words, ref_words, ref_ngram_freqs, max_order):
    # Create initial lists
    match_cnt = [0 for i in range(max_order)]
    tst_cnt = [0 for i in range(max_order)]
    ref_cnt = [0 for i in range(max_order)]
    tst_info = [0 for i in range(max_order)]
    ref_info = [0 for i in range(max_order)]
    ref_ngrams_max = {}

    # Get the ngram counts for the test segment
    tst_ngrams = words_to_ngrams(tst_words, max_order)
    len_tst = len(tst_words)
    for i in range(max_order):
        tst_cnt[i] = (len_tst - i) if i < len_tst else 0

    ###########
    # Reference
    ###########
    ref_ngrams = words_to_ngrams(ref_words, max_order)
    len_ref = len(ref_words)
    for ngram_words, freq in ref_ngrams.items():
        # Accumulate the information weight of each distinct reference ngram
        ref_info[len(ngram_words) - 1] += ref_ngram_freqs[ngram_words]
        # Update the maximum count of this ngram
        # Shorter: ref_ngrams_max[ngram_words] = max(ref_ngrams_max.get(ngram_words, -1), freq)
        if ngram_words in ref_ngrams_max:
            ref_ngrams_max[ngram_words] = max(ref_ngrams_max[ngram_words], freq)
        else:
            ref_ngrams_max[ngram_words] = freq

    # Update reference ngram counts
    for i in range(max_order):
        ref_cnt[i] = (len_ref - i) if i < len_ref else 0

    for ngram_words, freq in tst_ngrams.items():
        if ngram_words in ref_ngrams_max:
            m = min(freq, ref_ngrams_max[ngram_words])
            l = len(ngram_words) - 1
            tst_info[l] += ref_ngram_freqs[ngram_words] * m
            match_cnt[l] += m

    return len_ref, match_cnt, tst_cnt, ref_cnt, tst_info, ref_info
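# Illustrative example (added in this port, not part of mteval-v13a.pl); the toy
# segments below are made up. With a single reference, match_cnt holds the
# clipped ngram matches per order:
#   >>> freqs = compute_ngram_info([['a', 'b', 'c']], 4)
#   >>> score_segment(['a', 'b', 'x'], ['a', 'b', 'c'], freqs, 4)[1]  # match_cnt
#   [2, 1, 0, 0]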
def score_system(ref_segs, tst_segs, max_order):
    ref_ngram_freqs = compute_ngram_info(ref_segs, max_order)

    # 0-based indexing in contrast to perl version
    cum_match = [0 for i in range(max_order)]
    cum_tst_cnt = [0 for i in range(max_order)]
    cum_ref_cnt = [0 for i in range(max_order)]
    cum_tst_info = [0 for i in range(max_order)]
    cum_ref_info = [0 for i in range(max_order)]
    cum_ref_len = 0

    # Score each segment and keep statistics
    for tst, ref in zip(tst_segs, ref_segs):
        ref_len, match_cnt, tst_cnt, ref_cnt, tst_info, ref_info = score_segment(tst, ref, ref_ngram_freqs, max_order)

        # Sum ref length
        cum_ref_len += ref_len

        for i in range(max_order):
            cum_match[i] += match_cnt[i]
            cum_tst_cnt[i] += tst_cnt[i]
            cum_ref_cnt[i] += ref_cnt[i]
            cum_tst_info[i] += tst_info[i]
            cum_ref_info[i] += ref_info[i]

    # Compute length score
    exp_len_score = math.exp(min(0, 1 - cum_ref_len / cum_tst_cnt[0])) \
        if cum_tst_cnt[0] > 0 else 0

    # For further length ratio computation
    tst_vs_ref_ratio = (cum_tst_cnt[0], cum_ref_len, math.log(exp_len_score))

    return bleu_score(cum_ref_len, cum_match, cum_tst_cnt, exp_len_score, max_order), tst_vs_ref_ratio
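# Illustrative example (added in this port); segments are expected to be already
# tokenized, e.g. the output of read_file()/tokenizer(). A hypothesis identical
# to its reference yields a cumulative 4-gram score of 1.0 and a log length
# penalty of 0.0:
#   >>> segs = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
#   >>> (ind, cum), ratios = score_system(segs, segs, 4)
#   >>> cum[3], ratios
#   (1.0, (6, 6, 0.0))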
def read_file(fileobj, tokenizer, is_cased):
    """Read a plain text file and tokenize each line."""
    sents = []
    for line in fileobj:
        sents.append(tokenizer(line, is_cased))
    fileobj.close()
    return sents
def words_to_ngrams(words, max_order):
    """Convert a sequence of words to an ngram count dict."""
    d = defaultdict(int)
    # Iterate over word indices as start pointers
    for i in range(len(words)):
        # Sliding windows
        for j in range(min(max_order, len(words) - i)):
            # Increment counter, keep keys as tuples
            d[tuple(words[i: i+j+1])] += 1
    return d
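# Illustrative example (added in this port); keys are ngram tuples, values are
# their counts within the segment:
#   >>> dict(words_to_ngrams(['the', 'cat', 'the'], 2))
#   {('the',): 2, ('the', 'cat'): 1, ('cat',): 1, ('cat', 'the'): 1}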
def compute_ngram_info(ref_segs, max_order):
    tot_words = 0
    # Corpus-wide ngram frequencies
    ngram_count = defaultdict(int)
    ngram_info = {}

    for words in ref_segs:
        tot_words += len(words)
        # Get segment frequencies and add them to ngram_count
        for key, value in words_to_ngrams(words, max_order).items():
            ngram_count[key] += value

    for ngram_words, freq in ngram_count.items():
        if len(ngram_words) == 1:
            # unigram => normalize by the corpus word count
            denom = tot_words
        else:
            # higher-order ngram => normalize by the count of its (n-1)-gram prefix
            denom = ngram_count[ngram_words[:-1]]
        # Information weight in bits
        ngram_info[ngram_words] = -math.log(freq / denom) / LOG_2

    return ngram_info
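# Illustrative example (added in this port); the information weight of a unigram
# is -log2(count / total words), and of a longer ngram -log2(count / prefix count):
#   >>> info = compute_ngram_info([['a', 'b', 'a']], 2)
#   >>> round(info[('a',)], 4)   # -log2(2 / 3)
#   0.585
#   >>> info[('a', 'b')]         # -log2(1 / 2)
#   1.0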
#################################################################################################
# Default method used to compute the BLEU score, using smoothing.
# The smoothing is computed by taking 1 / ( 2^k ), instead of 0,
# for each precision score whose matching n-gram count is null
# k is 1 for the first 'n' value for which the n-gram match count is null
# For example, if the text contains:
# - one 2-gram match
# - and (consequently) two 1-gram matches
# the n-gram count for each individual precision score would be:
# - n=1 => prec_count = 2 (two unigrams)
# - n=2 => prec_count = 1 (one bigram)
# - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1)
# - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)
#################################################################################################
def bleu_score(ref_len, matching_ngrams, tst_ngrams, exp_len_score, max_order):
    score = 0
    iscore = 0
    smooth = 1

    ind_scores = []
    cum_scores = []

    for i in range(max_order):
        if tst_ngrams[i] == 0:
            iscore = 0
        elif matching_ngrams[i] == 0:
            # No match at this order: apply the 1 / 2^k smoothing described above
            smooth *= 2
            iscore = math.log(1 / (smooth * tst_ngrams[i]))
        else:
            iscore = math.log(matching_ngrams[i] / tst_ngrams[i])

        ind_scores.append(math.exp(iscore))
        score += iscore
        cum_scores.append(math.exp(score / (i+1)) * exp_len_score)

    return ind_scores, cum_scores
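# Illustrative example (added in this port); with two unigram matches, one bigram
# match and no higher-order matches out of a 4-token hypothesis, the 3-gram and
# 4-gram precisions are smoothed to 1/(2*2) and 1/(4*1) respectively:
#   >>> ind, cum = bleu_score(4, [2, 1, 0, 0], [4, 3, 2, 1], 1.0, 4)
#   >>> [round(p, 4) for p in ind]
#   [0.5, 0.3333, 0.25, 0.25]
#   >>> round(cum[3], 4)
#   0.3195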
#############################################################################
# function returning the shortest reference length
# takes as input:
#  - currentLength : the current (shortest) reference length
#  - referenceSentenceLength : the current reference sentence length
#  - candidateSentenceLength : the current candidate sentence length (unused)
#############################################################################
def brevity_penalty_shortest(cur_len, ref_len, cand_len):
    return ref_len if ref_len < cur_len else cur_len
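# Illustrative example (added in this port): keeps the shorter of the current
# and new reference lengths, ignoring the candidate length:
#   >>> brevity_penalty_shortest(11, 8, 7)
#   8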
####################################################################################
# function returning the closest reference length (to the candidate sentence length)
# takes as input:
#  - currentLength : the current (closest) reference length
#  - referenceSentenceLength : the current reference sentence length
#  - candidateSentenceLength : the current candidate sentence length
# when two reference sentences are at the same distance, it will return the shortest
# reference sentence length
# example of 4 iterations, given:
#  - one candidate sentence containing 7 tokens
#  - one reference translation containing 11 tokens
#  - one reference translation containing 8 tokens
#  - one reference translation containing 6 tokens
#  - one reference translation containing 7 tokens
# the multiple invocations will return:
#  - currentLength is set to 11 (outside of this function)
#  - brevity_penalty_closest( 11, 8, 7 ) returns 8, since abs( 8 - 7 ) < abs( 11 - 7 )
#  - brevity_penalty_closest( 8, 6, 7 ) returns 6, since abs( 8 - 7 ) == abs( 6 - 7 ) AND 6 < 8
#  - brevity_penalty_closest( 7, 6, 7 ) returns 7, since abs( 7 - 7 ) < abs( 6 - 7 )
####################################################################################
def brevity_penalty_closest(cur_len, ref_len, cand_len):
    result = cur_len

    if abs(cand_len - ref_len) <= abs(cand_len - cur_len):
        if abs(cand_len - ref_len) == abs(cand_len - cur_len):
            if cur_len > ref_len:
                result = ref_len
        else:
            result = ref_len

    return result
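# Illustrative check (added in this port) reproducing the iterations from the
# comment above:
#   >>> brevity_penalty_closest(11, 8, 7)
#   8
#   >>> brevity_penalty_closest(8, 6, 7)
#   6
#   >>> brevity_penalty_closest(7, 6, 7)
#   7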
def tokenizer(s, is_cased):
    s = s.strip()

    # language-independent part:
    if '<skipped>' in s:
        s = re.sub('<skipped>', '', s)

    # Disabled: These should not happen with plain text
    #s = re.sub('-\n', '', s)
    #s = re.sub('\n', ' ', s)
    #s = re.sub('&quot;', '"', s)
    #s = re.sub('&amp;', '&', s)
    #s = re.sub('&lt;', '<', s)
    #s = re.sub('&gt;', '>', s)

    # language-dependent part (assuming Western languages):
    s = " %s " % s
    if not is_cased:
        s = s.lower()

    # tokenize punctuation
    s = re.sub(r'([\{-\~\[-\` -\&\(-\+\:-\@\/])', r' \1 ', s)
    # tokenize period and comma unless preceded by a digit
    s = re.sub(r'([^0-9])([\.,])', r'\1 \2 ', s)
    # tokenize period and comma unless followed by a digit
    s = re.sub(r'([\.,])([^0-9])', r' \1 \2', s)
    # tokenize dash when preceded by a digit
    if '-' in s:
        s = re.sub(r'([0-9])(-)', r'\1 \2 ', s)

    # Collapse multiple spaces, strip leading/trailing spaces, split into tokens
    return re.sub(r'\s+', ' ', s).strip().split()
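# Illustrative examples (added in this port); punctuation is split off and,
# unless cased scoring is requested, the text is lowercased:
#   >>> tokenizer('Hello, world!', False)
#   ['hello', ',', 'world', '!']
#   >>> tokenizer('Hello, world!', True)
#   ['Hello', ',', 'world', '!']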
if __name__ == '__main__':
    parser = argparse.ArgumentParser(prog='mteval-v13a')
    parser.add_argument('-c', '--cased', action='store_true', help="Preserve upper-case alphabetic characters.")
    parser.add_argument('-n', '--max-order', type=int, default=4, help="Max n-gram order.")
    parser.add_argument('-p', '--brevity-penalty', type=str, default='closest', help="Brevity penalty: closest or shortest.")
    parser.add_argument('-d', '--detailed', action='store_true', help="Show detailed report.")
    parser.add_argument('-t', '--tst-file', type=argparse.FileType('r'), required=True, help="Test sentences file.")
    parser.add_argument('-r', '--ref-file', type=argparse.FileType('r'), required=True, help="Reference sentences file.")
    args = parser.parse_args()

    # Set BP function
    # (Only useful when multiple references)
    #bp_func = brevity_penalty_closest
    #if args.brevity_penalty == 'shortest':
    #    bp_func = brevity_penalty_shortest

    # Read (detokenized) files and tokenize them
    ref_segs = read_file(args.ref_file, tokenizer, args.cased)
    tst_segs = read_file(args.tst_file, tokenizer, args.cased)

    assert len(ref_segs) == len(tst_segs)

    (ind_scores, cum_scores), ratios = score_system(ref_segs, tst_segs, args.max_order)

    print("BLEU(v13) = %.2f, %s (ratio=%.3f, hyp_len=%d, ref_len=%d)" % (
        cum_scores[3] * 100,
        "/".join([("%.1f" % (s * 100)) for s in ind_scores[:4]]),
        (ratios[0] / ratios[1]), ratios[0], ratios[1]))

    if args.detailed:
        print("-------------------------------------")
        print("Individual N-gram scoring")
        for i in range(args.max_order):
            print(' %d-gram' % (i+1), end='')
        print()
        for i in range(args.max_order):
            print(' ------', end=' ')
        print()
        for i in range(args.max_order):
            print(' %2.4f' % ind_scores[i], end='')
        print()

        print("Cumulative N-gram scoring")
        for i in range(args.max_order):
            print(' %d-gram' % (i+1), end='')
        print()
        for i in range(args.max_order):
            print(' ------', end='')
        print()
        for i in range(args.max_order):
            print(' %2.4f' % cum_scores[i], end='')
        print()