Skip to content

Instantly share code, notes, and snippets.

@chl
Created May 18, 2010 20:42
Show Gist options
  • Save chl/405507 to your computer and use it in GitHub Desktop.
Save chl/405507 to your computer and use it in GitHub Desktop.
# chl, 2010-05-18
# jaccard coefficient experiment, inspired by @datajunkie
from __future__ import with_statement, division
import collections
# S maps each first-column value to the set of second-column values it
# appears with; T is the inverse index (second column -> first column).
S = collections.defaultdict(set)
T = collections.defaultdict(set)  # inverse
# Read under a context manager so the input file is closed as soon as the
# indexes are built, instead of being left open until garbage collection.
with open("input") as infile:
    for line in infile:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing empty line) rather than
            # failing on the two-field unpacking below.
            continue
        # Exactly two tab-separated fields are expected; anything else
        # still raises ValueError so malformed rows are not silently lost.
        a, b = line.split("\t")
        S[a].add(b)
        T[b].add(a)
def jc(a, b):
    """Return the Jaccard coefficient of the sets stored in S under a and b.

    The coefficient is ``len(S[a] & S[b]) / len(S[a] | S[b])``, computed with
    true division (the file enables ``from __future__ import division``).

    Keys missing from S are treated as empty sets. When both sets are empty
    the ratio is undefined; 0.0 is returned by convention instead of raising
    ZeroDivisionError.
    """
    # Use .get() rather than S[key]: indexing the defaultdict would insert an
    # empty set for every unknown key, growing S as a side effect of a read.
    left = S.get(a, frozenset())
    right = S.get(b, frozenset())
    union = left | right
    if not union:
        return 0.0
    return len(left & right) / len(union)
def jcs(a):
    """Return a generator of ``(x, jc(a, x))`` pairs for the neighbours of a.

    A neighbour is any key x that shares at least one value with a, found
    through the inverse index T. Because a shares values with itself, a is
    among its own neighbours whenever S[a] is non-empty. An unknown a yields
    nothing.
    """
    # Accumulate in place instead of reduce(set.union, ...): this avoids
    # allocating a new intermediate set per step and does not rely on
    # `reduce` being a builtin (it is not in Python 3).
    neighbours = set()
    # .get() keeps an unknown `a` from being inserted into the defaultdict.
    for value in S.get(a, ()):
        neighbours.update(T[value])
    return ((x, jc(a, x)) for x in neighbours)
# Emit one tab-separated row "key<TAB>neighbour<TAB>coefficient" for every
# (key, neighbour) pair produced by jcs, for every key in S.
with open("output", "w") as out:
    # Iterate over a snapshot of the keys, as the original S.keys() did.
    for key in list(S):
        out.writelines(
            "%s\t%s\t%s\n" % (key, other, score)
            for other, score in jcs(key)
        )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment