mteval-v13a.pl Python3 port
Gist by @ozancaglayan, created July 5, 2017
#!/usr/bin/env python3
import re
import math
import argparse
from collections import defaultdict
LOG_2 = math.log(2)
###################################################################################
# This is an exact reimplementation of mteval-v13a.pl without requiring SGM.
#
# Example:
# $ mteval-v13a -t model.beam12.detok -r newstest2017-tren-ref.en.txt -c
#
# BLEU(v13) = 16.09, 53.5/23.4/11.7/6.2 (ratio=0.927, hyp_len=62777, ref_len=67703)
###################################################################################
# Only a single reference is supported
def score_segment(tst_words, ref_words, ref_ngram_freqs, max_order):
    # Create initial lists
    match_cnt = [0 for i in range(max_order)]
    tst_cnt = [0 for i in range(max_order)]
    ref_cnt = [0 for i in range(max_order)]
    tst_info = [0 for i in range(max_order)]
    ref_info = [0 for i in range(max_order)]
    ref_ngrams_max = {}

    # Get the ngram counts for the test segment
    tst_ngrams = words_to_ngrams(tst_words, max_order)
    len_tst = len(tst_words)
    for i in range(max_order):
        tst_cnt[i] = (len_tst - i) if i < len_tst else 0

    ###########
    # Reference
    ###########
    ref_ngrams = words_to_ngrams(ref_words, max_order)
    len_ref = len(ref_words)
    for ngram_words, freq in ref_ngrams.items():
        # Accumulate the information weight of each distinct reference ngram
        ref_info[len(ngram_words) - 1] += ref_ngram_freqs[ngram_words]
        # Update the maximum count of this ngram
        # Shorter: ref_ngrams_max[ngram_words] = max(ref_ngrams_max.get(ngram_words, -1), freq)
        if ngram_words in ref_ngrams_max:
            ref_ngrams_max[ngram_words] = max(ref_ngrams_max[ngram_words], freq)
        else:
            ref_ngrams_max[ngram_words] = freq

    # Update reference ngram counts
    for i in range(max_order):
        ref_cnt[i] = (len_ref - i) if i < len_ref else 0

    for ngram_words, freq in tst_ngrams.items():
        if ngram_words in ref_ngrams_max:
            m = min(freq, ref_ngrams_max[ngram_words])
            l = len(ngram_words) - 1
            tst_info[l] += ref_ngram_freqs[ngram_words] * m
            match_cnt[l] += m

    return len_ref, match_cnt, tst_cnt, ref_cnt, tst_info, ref_info
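# Illustrative example (added in this port, not part of mteval-v13a.pl); the toy
# segments below are made up. With a single reference, match_cnt holds the
# clipped ngram matches per order:
#   >>> freqs = compute_ngram_info([['a', 'b', 'c']], 4)
#   >>> score_segment(['a', 'b', 'x'], ['a', 'b', 'c'], freqs, 4)[1]  # match_cnt
#   [2, 1, 0, 0]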
def score_system(ref_segs, tst_segs, max_order):
    ref_ngram_freqs = compute_ngram_info(ref_segs, max_order)

    # 0-based indexing in contrast to perl version
    cum_match = [0 for i in range(max_order)]
    cum_tst_cnt = [0 for i in range(max_order)]
    cum_ref_cnt = [0 for i in range(max_order)]
    cum_tst_info = [0 for i in range(max_order)]
    cum_ref_info = [0 for i in range(max_order)]
    cum_ref_len = 0

    # Score each segment and keep statistics
    for tst, ref in zip(tst_segs, ref_segs):
        ref_len, match_cnt, tst_cnt, ref_cnt, tst_info, ref_info = score_segment(tst, ref, ref_ngram_freqs, max_order)

        # Sum ref length
        cum_ref_len += ref_len

        for i in range(max_order):
            cum_match[i] += match_cnt[i]
            cum_tst_cnt[i] += tst_cnt[i]
            cum_ref_cnt[i] += ref_cnt[i]
            cum_tst_info[i] += tst_info[i]
            cum_ref_info[i] += ref_info[i]

    # Compute length score
    exp_len_score = math.exp(min(0, 1 - cum_ref_len / cum_tst_cnt[0])) \
        if cum_tst_cnt[0] > 0 else 0

    # For further length ratio computation
    tst_vs_ref_ratio = (cum_tst_cnt[0], cum_ref_len, math.log(exp_len_score))

    return bleu_score(cum_ref_len, cum_match, cum_tst_cnt, exp_len_score, max_order), tst_vs_ref_ratio
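# Illustrative example (added in this port); segments are expected to be already
# tokenized, e.g. the output of read_file()/tokenizer(). A hypothesis identical
# to its reference yields a cumulative 4-gram score of 1.0 and a log length
# penalty of 0.0:
#   >>> segs = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
#   >>> (ind, cum), ratios = score_system(segs, segs, 4)
#   >>> cum[3], ratios
#   (1.0, (6, 6, 0.0))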
def read_file(fileobj, tokenizer, is_cased):
    """Read a plain text file and tokenize each line."""
    sents = []
    for line in fileobj:
        sents.append(tokenizer(line, is_cased))
    fileobj.close()
    return sents
def words_to_ngrams(words, max_order):
    """Convert a sequence of words to an ngram count dict."""
    d = defaultdict(int)
    # Iterate over word indices as start pointers
    for i in range(len(words)):
        # Sliding windows
        for j in range(min(max_order, len(words) - i)):
            # Increment counter, keep keys as tuples
            d[tuple(words[i: i+j+1])] += 1
    return d
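# Illustrative example (added in this port); keys are ngram tuples, values are
# their counts within the segment:
#   >>> dict(words_to_ngrams(['the', 'cat', 'the'], 2))
#   {('the',): 2, ('the', 'cat'): 1, ('cat',): 1, ('cat', 'the'): 1}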
def compute_ngram_info(ref_segs, max_order):
    tot_words = 0
    # Corpus-wide ngram frequencies
    ngram_count = defaultdict(int)
    ngram_info = {}

    for words in ref_segs:
        tot_words += len(words)
        # Get segment frequencies and add them to ngram_count
        for key, value in words_to_ngrams(words, max_order).items():
            ngram_count[key] += value

    for ngram_words, freq in ngram_count.items():
        if len(ngram_words) == 1:
            # unigram => normalize by the corpus word count
            denom = tot_words
        else:
            # higher-order ngram => normalize by the count of its (n-1)-gram prefix
            denom = ngram_count[ngram_words[:-1]]
        # Information weight in bits
        ngram_info[ngram_words] = -math.log(freq / denom) / LOG_2

    return ngram_info
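# Illustrative example (added in this port); the information weight of a unigram
# is -log2(count / total words), and of a longer ngram -log2(count / prefix count):
#   >>> info = compute_ngram_info([['a', 'b', 'a']], 2)
#   >>> round(info[('a',)], 4)   # -log2(2 / 3)
#   0.585
#   >>> info[('a', 'b')]         # -log2(1 / 2)
#   1.0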
#################################################################################################
# Default method used to compute the BLEU score, using smoothing.
# The smoothing is computed by taking 1 / ( 2^k ), instead of 0,
# for each precision score whose matching n-gram count is null
# k is 1 for the first 'n' value for which the n-gram match count is null
# For example, if the text contains:
# - one 2-gram match
# - and (consequently) two 1-gram matches
# the n-gram count for each individual precision score would be:
# - n=1 => prec_count = 2 (two unigrams)
# - n=2 => prec_count = 1 (one bigram)
# - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1)
# - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)
#################################################################################################
def bleu_score(ref_len, matching_ngrams, tst_ngrams, exp_len_score, max_order):
    score = 0
    iscore = 0
    smooth = 1

    ind_scores = []
    cum_scores = []

    for i in range(max_order):
        if tst_ngrams[i] == 0:
            iscore = 0
        elif matching_ngrams[i] == 0:
            # No match at this order: apply the 1 / 2^k smoothing described above
            smooth *= 2
            iscore = math.log(1 / (smooth * tst_ngrams[i]))
        else:
            iscore = math.log(matching_ngrams[i] / tst_ngrams[i])

        ind_scores.append(math.exp(iscore))
        score += iscore
        cum_scores.append(math.exp(score / (i+1)) * exp_len_score)

    return ind_scores, cum_scores
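# Illustrative example (added in this port); with two unigram matches, one bigram
# match and no higher-order matches out of a 4-token hypothesis, the 3-gram and
# 4-gram precisions are smoothed to 1/(2*2) and 1/(4*1) respectively:
#   >>> ind, cum = bleu_score(4, [2, 1, 0, 0], [4, 3, 2, 1], 1.0, 4)
#   >>> [round(p, 4) for p in ind]
#   [0.5, 0.3333, 0.25, 0.25]
#   >>> round(cum[3], 4)
#   0.3195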
#############################################################################
# function returning the shortest reference length
# takes as input:
#  - currentLength : the current (shortest) reference length
#  - referenceSentenceLength : the current reference sentence length
#  - candidateSentenceLength : the current candidate sentence length (unused)
#############################################################################
def brevity_penalty_shortest(cur_len, ref_len, cand_len):
    return ref_len if ref_len < cur_len else cur_len
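# Illustrative example (added in this port): keeps the shorter of the current
# and new reference lengths, ignoring the candidate length:
#   >>> brevity_penalty_shortest(11, 8, 7)
#   8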
####################################################################################
# function returning the closest reference length (to the candidate sentence length)
# takes as input:
#  - currentLength : the current (closest) reference length
#  - referenceSentenceLength : the current reference sentence length
#  - candidateSentenceLength : the current candidate sentence length
# when two reference sentences are at the same distance, it will return the shortest
# reference sentence length
# example of 4 iterations, given:
#  - one candidate sentence containing 7 tokens
#  - one reference translation containing 11 tokens
#  - one reference translation containing 8 tokens
#  - one reference translation containing 6 tokens
#  - one reference translation containing 7 tokens
# the multiple invocations will return:
#  - currentLength is set to 11 (outside of this function)
#  - brevity_penalty_closest( 11, 8, 7 ) returns 8, since abs( 8 - 7 ) < abs( 11 - 7 )
#  - brevity_penalty_closest( 8, 6, 7 ) returns 6, since abs( 8 - 7 ) == abs( 6 - 7 ) AND 6 < 8
#  - brevity_penalty_closest( 7, 6, 7 ) returns 7, since abs( 7 - 7 ) < abs( 6 - 7 )
####################################################################################
def brevity_penalty_closest(cur_len, ref_len, cand_len):
    result = cur_len

    if abs(cand_len - ref_len) <= abs(cand_len - cur_len):
        if abs(cand_len - ref_len) == abs(cand_len - cur_len):
            if cur_len > ref_len:
                result = ref_len
        else:
            result = ref_len

    return result
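# Illustrative check (added in this port) reproducing the iterations from the
# comment above:
#   >>> brevity_penalty_closest(11, 8, 7)
#   8
#   >>> brevity_penalty_closest(8, 6, 7)
#   6
#   >>> brevity_penalty_closest(7, 6, 7)
#   7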
def tokenizer(s, is_cased):
    s = s.strip()

    # language-independent part:
    if '<skipped>' in s:
        s = re.sub('<skipped>', '', s)

    # Disabled: These should not happen with plain text
    #s = re.sub('-\n', '', s)
    #s = re.sub('\n', ' ', s)
    #s = re.sub('&quot;', '"', s)
    #s = re.sub('&amp;', '&', s)
    #s = re.sub('&lt;', '<', s)
    #s = re.sub('&gt;', '>', s)

    # language-dependent part (assuming Western languages):
    s = " %s " % s
    if not is_cased:
        s = s.lower()

    # tokenize punctuation
    s = re.sub(r'([\{-\~\[-\` -\&\(-\+\:-\@\/])', r' \1 ', s)
    # tokenize period and comma unless preceded by a digit
    s = re.sub(r'([^0-9])([\.,])', r'\1 \2 ', s)
    # tokenize period and comma unless followed by a digit
    s = re.sub(r'([\.,])([^0-9])', r' \1 \2', s)
    # tokenize dash when preceded by a digit
    if '-' in s:
        s = re.sub(r'([0-9])(-)', r'\1 \2 ', s)

    # Collapse multiple spaces, strip leading/trailing spaces, split into tokens
    return re.sub(r'\s+', ' ', s).strip().split()
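# Illustrative examples (added in this port); punctuation is split off and,
# unless cased scoring is requested, the text is lowercased:
#   >>> tokenizer('Hello, world!', False)
#   ['hello', ',', 'world', '!']
#   >>> tokenizer('Hello, world!', True)
#   ['Hello', ',', 'world', '!']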
if __name__ == '__main__':
    parser = argparse.ArgumentParser(prog='mteval-v13a')
    parser.add_argument('-c', '--cased', action='store_true', help="Preserve upper-case alphabetic characters.")
    parser.add_argument('-n', '--max-order', type=int, default=4, help="Max n-gram order.")
    parser.add_argument('-p', '--brevity-penalty', type=str, default='closest', help="Brevity penalty: closest or shortest.")
    parser.add_argument('-d', '--detailed', action='store_true', help="Show detailed report.")
    parser.add_argument('-t', '--tst-file', type=argparse.FileType('r'), required=True, help="Test sentences file.")
    parser.add_argument('-r', '--ref-file', type=argparse.FileType('r'), required=True, help="Reference sentences file.")
    args = parser.parse_args()

    # Set BP function
    # (Only useful when multiple references)
    #bp_func = brevity_penalty_closest
    #if args.brevity_penalty == 'shortest':
    #    bp_func = brevity_penalty_shortest

    # Read (detokenized) files and tokenize them
    ref_segs = read_file(args.ref_file, tokenizer, args.cased)
    tst_segs = read_file(args.tst_file, tokenizer, args.cased)

    assert len(ref_segs) == len(tst_segs)

    (ind_scores, cum_scores), ratios = score_system(ref_segs, tst_segs, args.max_order)

    print("BLEU(v13) = %.2f, %s (ratio=%.3f, hyp_len=%d, ref_len=%d)" % (
        cum_scores[3] * 100,
        "/".join([("%.1f" % (s * 100)) for s in ind_scores[:4]]),
        (ratios[0] / ratios[1]), ratios[0], ratios[1]))

    if args.detailed:
        print("-------------------------------------")
        print("Individual N-gram scoring")
        for i in range(args.max_order):
            print(' %d-gram' % (i+1), end='')
        print()
        for i in range(args.max_order):
            print(' ------', end=' ')
        print()
        for i in range(args.max_order):
            print(' %2.4f' % ind_scores[i], end='')
        print()

        print("Cumulative N-gram scoring")
        for i in range(args.max_order):
            print(' %d-gram' % (i+1), end='')
        print()
        for i in range(args.max_order):
            print(' ------', end='')
        print()
        for i in range(args.max_order):
            print(' %2.4f' % cum_scores[i], end='')
        print()