Skip to content

Instantly share code, notes, and snippets.

@zyocum
Created October 23, 2017 14:19
Show Gist options
  • Save zyocum/cc26260b6513873a54ddb65f49a72c42 to your computer and use it in GitHub Desktop.
Save zyocum/cc26260b6513873a54ddb65f49a72c42 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""Compute pair-wise cluster-level comparison metrics
These metrics are taken from:
https://www.cs.umd.edu/class/spring2012/cmsc828L/Papers/MenestrinaVLDB10.pdf"""
import random
from string import ascii_lowercase as alphabet
from math import sqrt, log
from itertools import combinations, product, chain
def partition(items, seed=None):
shuffled = [item for item in items]
if seed:
random.seed(seed)
random.shuffle(shuffled)
cell = []
while shuffled:
while random.randint(0, 1):
if shuffled:
cell.append(shuffled.pop())
if cell:
yield tuple(cell)
cell = []
def universe(*clusters):
return set(chain(*chain(*clusters)))
def intersection(*args):
return set.intersection(*({tuple(x) for x in arg} for arg in args))
def k(r, s):
"""Compute pair-wise cluster-level comparison metric K"""
author_purities = []
cluster_purities = []
n = float(len(universe(r, s)))
for x, y in product(r, s):
numerator = float(len(intersection(x, y)) ** 2)
author_purities.append(numerator / float(len(x)))
cluster_purities.append(numerator / float(len(y)))
average_author_purities = sum(author_purities) / n
average_cluster_purities = sum(cluster_purities) / n
return sqrt(average_author_purities * average_cluster_purities)
def entropy(r, n):
return -1 * sum(len(x) / n * log(len(x) / n, 2) for x in r)
def information(r, s, n):
i = 0.0
for pair in product(r, s):
x, y = map(set, pair)
overlap = len(intersection(x, y))
if overlap > 0:
i += (overlap / n) * log((overlap * n) / (len(x) * len(y)), 2)
return i
def vi(r, s):
"""Compute the pair-wise Variation of Information (VI)"""
n = float(len(universe(r, s)))
return entropy(r, n) + entropy(s, n) - (2 * information(r, s, n))
def demo(r, s):
print('R:', *list(sorted(x) for x in sorted(r)), sep=', ')
print('S:', *list(sorted(x) for x in sorted(s)), sep=', ')
print('K(R,S):', k(r, s))
print('VI(R,S):', vi(r, s))
if __name__ == '__main__':
a, b, c, d = 'abcd'
r = (a, b), (c,), (d,)
s = (a, b), (c, d)
demo(r, s)
#partitions = [tuple(partition(alphabet, seed)) for seed in alphabet]
#for r, s in combinations(partitions, 2):
# demo(r, s)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment