Skip to content

Instantly share code, notes, and snippets.

@shhshn
Created July 31, 2022 04:46
Show Gist options
  • Save shhshn/63471359abd262269cae0b10da769f24 to your computer and use it in GitHub Desktop.
Save shhshn/63471359abd262269cae0b10da769f24 to your computer and use it in GitHub Desktop.
An implementation of Kendall's tau [Kendall 1938]: used in the replication study of [Isozaki et al. WMT2010] but modified a bit from their paper
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Kendall's tau calculator by Sho Hoshino (hoshino@nii.ac.jp)
# Given a .A3.final file, this script outputs average and distribution of tau
#
# Hideki Isozaki, Katsuhito Sudoh, Hajime Tsukada, and Kevin Duh
# Head Finalization: a simple reordering rule for SOV languages, WMT 2010
#
# 2013/11/18 Added citation
# 2013/09/30 Fixed to use combination for tau
# 2013/09/09 Fixed to used stdin instead of files
# 2013/05/16
# Added a calculation of average tau
# Fixed the output format to be sorted by key
# Fixed a bug when len(list) is lower than 2
# 2013/01? Initial Release
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import itertools
result= {}
for key in xrange(-10, 10 + 1, 1):
result["%.1f" % (key / 10.0)] = 0
def main():
value = 0
count = 0
sys.stderr.write("calculating Kendall's tau ... ")
for i, line in enumerate(sys.stdin):
line = unicode(line).rstrip()
if i % 3 == 2:
r = process(line)
if r == None:
print i, line
raise Exception, "read error"
else:
value += r
count += 1
key = "%.1f" % r
if key == "-0.0":
key = "0.0"
result[key] += 1
sys.stderr.write("Done!\n")
for key in xrange(-10, 10 + 1, 1):
k = "%.1f" % (key / 10.0)
print k, result[k]
print "average tau = %f (%f/%d)" % (value / count, value, count)
def process(line):
null = line.find("NULL ({")
if null != -1:
end = line.find("})", null+7)
if end != -1:
line = line[:null] + line[end+2:]
numbers = ""
end = -2
while True:
start = line.find("({",end+2)
if start == -1:
break
end = line.find("})", start+2)
if end == -1:
break
numbers += line[start+2:end]
stack = ""
for char in numbers:
if stack != "" and char == " " and stack[-1] == " ":
continue
stack += char
numbers = [int(number) for number in stack.strip(" ").split(" ")]
if len(numbers) == 0:
return
return tau(numbers)
def tau(input):
# input = [2, 1, 3, 4] -> 0.66 ...
# input = [1, 4, 2, 5] -> 0.66 ...
n = len(input)
if n <= 1:
# should be -1.0 by the definition given in (Isozaki et al. 2010)
return -1.0
c = n * (n - 1)
inc = 0
# this is slower but clearer
for x, y in itertools.combinations(xrange(0, n), 2):
if input[x] < input[y]:
inc += 1
return inc * 4.0 / c - 1
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment