Created
July 31, 2022 04:46
-
-
Save shhshn/63471359abd262269cae0b10da769f24 to your computer and use it in GitHub Desktop.
An implementation of Kendall's tau [Kendall 1938]: used in the replication study of [Isozaki et al. WMT2010] but modified a bit from their paper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# Kendall's tau calculator by Sho Hoshino (hoshino@nii.ac.jp) | |
# Given a .A3.final file, this script outputs average and distribution of tau | |
# | |
# Hideki Isozaki, Katsuhito Sudoh, Hajime Tsukada, and Kevin Duh | |
# Head Finalization: a simple reordering rule for SOV languages, WMT 2010 | |
# | |
# 2013/11/18 Added citation | |
# 2013/09/30 Fixed to use combination for tau | |
# 2013/09/09 Fixed to used stdin instead of files | |
# 2013/05/16 | |
# Added a calculation of average tau | |
# Fixed the output format to be sorted by key | |
# Fixed a bug when len(list) is lower than 2 | |
# 2013/01? Initial Release | |
import sys | |
reload(sys) | |
sys.setdefaultencoding("utf-8") | |
import itertools | |
result= {} | |
for key in xrange(-10, 10 + 1, 1): | |
result["%.1f" % (key / 10.0)] = 0 | |
def main(): | |
value = 0 | |
count = 0 | |
sys.stderr.write("calculating Kendall's tau ... ") | |
for i, line in enumerate(sys.stdin): | |
line = unicode(line).rstrip() | |
if i % 3 == 2: | |
r = process(line) | |
if r == None: | |
print i, line | |
raise Exception, "read error" | |
else: | |
value += r | |
count += 1 | |
key = "%.1f" % r | |
if key == "-0.0": | |
key = "0.0" | |
result[key] += 1 | |
sys.stderr.write("Done!\n") | |
for key in xrange(-10, 10 + 1, 1): | |
k = "%.1f" % (key / 10.0) | |
print k, result[k] | |
print "average tau = %f (%f/%d)" % (value / count, value, count) | |
def process(line): | |
null = line.find("NULL ({") | |
if null != -1: | |
end = line.find("})", null+7) | |
if end != -1: | |
line = line[:null] + line[end+2:] | |
numbers = "" | |
end = -2 | |
while True: | |
start = line.find("({",end+2) | |
if start == -1: | |
break | |
end = line.find("})", start+2) | |
if end == -1: | |
break | |
numbers += line[start+2:end] | |
stack = "" | |
for char in numbers: | |
if stack != "" and char == " " and stack[-1] == " ": | |
continue | |
stack += char | |
numbers = [int(number) for number in stack.strip(" ").split(" ")] | |
if len(numbers) == 0: | |
return | |
return tau(numbers) | |
def tau(input): | |
# input = [2, 1, 3, 4] -> 0.66 ... | |
# input = [1, 4, 2, 5] -> 0.66 ... | |
n = len(input) | |
if n <= 1: | |
# should be -1.0 by the definition given in (Isozaki et al. 2010) | |
return -1.0 | |
c = n * (n - 1) | |
inc = 0 | |
# this is slower but clearer | |
for x, y in itertools.combinations(xrange(0, n), 2): | |
if input[x] < input[y]: | |
inc += 1 | |
return inc * 4.0 / c - 1 | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment