@varvara-l
Last active August 16, 2018 14:07
Williams significance test for WMT-18 sentence-level submissions
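The script below reads one or more sentence-level submissions plus the gold reference, ranks the submissions by Pearson correlation with the reference, and then runs a pairwise Williams test (with Bonferroni correction) to flag the pairs whose difference in correlation is not significant.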
from argparse import ArgumentParser
from math import sqrt

from scipy.stats import pearsonr, t
########################################################################################
# Williams significance test for the difference between two Pearson correlations
# that share a variable (here: each submission's correlation with the reference).
# Implemented as described in [1]. The significance level is adjusted with a
# Bonferroni correction for multiple comparisons.
#
# [1] Yvette Graham. Improving Evaluation of Machine Translation Quality
#     Estimation. ACL 2015.
########################################################################################
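# The statistic computed in williams_test below (following [1]) is
#
#   t = (r1 - r2) * sqrt((n - 1) * (1 + r12))
#       / sqrt(2 * K * (n - 1) / (n - 3) + ((r1 + r2)**2 / 4) * (1 - r12)**3)
#
# where r1 = corr(ref, sub1), r2 = corr(ref, sub2), r12 = corr(sub1, sub2),
# n is the number of segments, and K = 1 - r12**2 - r1**2 - r2**2 + 2*r12*r1*r2.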
def parse_submission(in_file, score_id=2):
    """Read one sentence-level score per line from a tab-separated
    submission file; the score sits in column score_id (0-based)."""
    out = []
    with open(in_file) as f:
        for line in f:
            chunks = line.strip('\n').split('\t')
            out.append(float(chunks[score_id]))
    return out
def williams_test(sub1, sub2, ref):
    """Two-tailed Williams test for the difference between pearsonr(ref, sub1)
    and pearsonr(ref, sub2), accounting for the correlation between sub1 and sub2."""
    r12 = pearsonr(sub1, sub2)[0]  # correlation between the two submissions
    r1 = pearsonr(ref, sub1)[0]    # submission 1 vs. reference
    r2 = pearsonr(ref, sub2)[0]    # submission 2 vs. reference
    n_samples = len(sub1)
    num = (r1 - r2) * sqrt((n_samples - 1) * (1 + r12))
    # K is the determinant of the 3x3 correlation matrix
    K = 1 - r12 ** 2 - r1 ** 2 - r2 ** 2 + 2 * r12 * r1 * r2
    denom_sum1 = 2 * K * ((n_samples - 1) / (n_samples - 3))
    denom_sum2 = (((r2 + r1) ** 2) / 4) * ((1 - r12) ** 3)
    denom = sqrt(denom_sum1 + denom_sum2)
    tval = num / denom
    # the Williams statistic follows a t-distribution with n - 3 degrees of freedom
    pval = t.sf(abs(tval), n_samples - 3) * 2
    return pval
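
# A quick sanity check with synthetic scores (illustrative only; any non-constant
# lists of equal length, at least four items each, will do):
#
#   p = williams_test([0.1, 0.4, 0.5, 0.9], [0.2, 0.3, 0.6, 0.8],
#                     [0.15, 0.35, 0.55, 0.75])
#
# p is the two-tailed p-value for the difference between the two submissions'
# correlations with the third (reference) list.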
if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("submissions", nargs="+", help="submissions (wmt18 format)")
    parser.add_argument("reference", help="reference (wmt18 format)")
    args = parser.parse_args()

    submissions = []
    for sub in args.submissions:
        submissions.append((sub, parse_submission(sub)))
    ref = parse_submission(args.reference)
    for name, scores in submissions:
        assert len(ref) == len(scores)

    # rank submissions by their Pearson correlation with the reference
    sub_scores = []
    for n, sub in submissions:
        corr = pearsonr(ref, sub)[0]
        sub_scores.append(corr)
    sorted_sub = sorted(zip(sub_scores, submissions), key=lambda pair: pair[0], reverse=True)
    print('Performance: ')
    for val, (n, sub) in sorted_sub:
        print('{}\t{}'.format(n, val))

    # Bonferroni correction: one test per unordered pair of submissions
    n_subs = len(submissions)
    comparisons = (n_subs ** 2 - n_subs) // 2
    new_alpha = 0.05 / comparisons
    print('{} comparisons\nalpha with Bonferroni correction - {}'.format(comparisons, new_alpha))
    print('----------------------------')
    # report the pairs whose difference in correlation is NOT significant
    # at the corrected level
    for i in range(len(sorted_sub)):
        for j in range(i + 1, len(sorted_sub)):
            pval = williams_test(sorted_sub[i][1][1], sorted_sub[j][1][1], ref)
            if pval > new_alpha:
                print('{}\t{}\t{}'.format(sorted_sub[i][1][0], sorted_sub[j][1][0], pval))
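
# Usage sketch (file names and script name are hypothetical): each argument is
# a tab-separated file with one segment per line and the sentence-level score
# in column 2:
#
#   python williams_wmt18.py sub_A.tsv sub_B.tsv sub_C.tsv gold.tsv
#
# The output lists each submission's correlation with the gold scores, the
# Bonferroni-corrected alpha, and the submission pairs whose difference is
# not significant at that level.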