Created
October 23, 2017 14:19
-
-
Save zyocum/cc26260b6513873a54ddb65f49a72c42 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
"""Compute pair-wise cluster-level comparison metrics | |
These metrics are taken from: | |
https://www.cs.umd.edu/class/spring2012/cmsc828L/Papers/MenestrinaVLDB10.pdf""" | |
import random | |
from string import ascii_lowercase as alphabet | |
from math import sqrt, log | |
from itertools import combinations, product, chain | |
def partition(items, seed=None): | |
shuffled = [item for item in items] | |
if seed: | |
random.seed(seed) | |
random.shuffle(shuffled) | |
cell = [] | |
while shuffled: | |
while random.randint(0, 1): | |
if shuffled: | |
cell.append(shuffled.pop()) | |
if cell: | |
yield tuple(cell) | |
cell = [] | |
def universe(*clusters): | |
return set(chain(*chain(*clusters))) | |
def intersection(*args): | |
return set.intersection(*({tuple(x) for x in arg} for arg in args)) | |
def k(r, s): | |
"""Compute pair-wise cluster-level comparison metric K""" | |
author_purities = [] | |
cluster_purities = [] | |
n = float(len(universe(r, s))) | |
for x, y in product(r, s): | |
numerator = float(len(intersection(x, y)) ** 2) | |
author_purities.append(numerator / float(len(x))) | |
cluster_purities.append(numerator / float(len(y))) | |
average_author_purities = sum(author_purities) / n | |
average_cluster_purities = sum(cluster_purities) / n | |
return sqrt(average_author_purities * average_cluster_purities) | |
def entropy(r, n): | |
return -1 * sum(len(x) / n * log(len(x) / n, 2) for x in r) | |
def information(r, s, n): | |
i = 0.0 | |
for pair in product(r, s): | |
x, y = map(set, pair) | |
overlap = len(intersection(x, y)) | |
if overlap > 0: | |
i += (overlap / n) * log((overlap * n) / (len(x) * len(y)), 2) | |
return i | |
def vi(r, s): | |
"""Compute the pair-wise Variation of Information (VI)""" | |
n = float(len(universe(r, s))) | |
return entropy(r, n) + entropy(s, n) - (2 * information(r, s, n)) | |
def demo(r, s): | |
print('R:', *list(sorted(x) for x in sorted(r)), sep=', ') | |
print('S:', *list(sorted(x) for x in sorted(s)), sep=', ') | |
print('K(R,S):', k(r, s)) | |
print('VI(R,S):', vi(r, s)) | |
if __name__ == '__main__': | |
a, b, c, d = 'abcd' | |
r = (a, b), (c,), (d,) | |
s = (a, b), (c, d) | |
demo(r, s) | |
#partitions = [tuple(partition(alphabet, seed)) for seed in alphabet] | |
#for r, s in combinations(partitions, 2): | |
# demo(r, s) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment