Instantly share code, notes, and snippets.

# pims/cluster.py Created Jan 3, 2012

What would you like to do?
 #!/usr/bin/env python """ code slightly adapted from http://www.amazon.com/Programming-Collective-Intelligence-Building-Applications/dp/0596529325 """ import random from math import sqrt def pearson(v1,v2): # Simple sums sum1=sum(v1) sum2=sum(v2) # Sums of the squares sum1Sq=sum([pow(v,2) for v in v1]) sum2Sq=sum([pow(v,2) for v in v2]) # Sum of the products pSum=sum([v1[i]*v2[i] for i in range(len(v1))]) # Calculate r (Pearson score) num=pSum-(sum1*sum2/len(v1)) den=sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1))) if den==0: return 0 return 1.0-num/den def kcluster(rows,distance=pearson,k=4): # Determine the minimum and maximum values for each point ranges=[(min([row[i] for row in rows]),max([row[i] for row in rows])) for i in range(len(rows[0]))] # Create k randomly placed centroids clusters=[[random.random()*(ranges[i][1]-ranges[i][0])+ranges[i][0] for i in range(len(rows[0]))] for j in range(k)] lastmatches=None for t in range(100): print 'Iteration %d' % t bestmatches=[[] for i in range(k)] # Find which centroid is the closest for each row for j in range(len(rows)): row=rows[j] bestmatch=0 for i in range(k): d=distance(clusters[i],row) if d0: for rowid in bestmatches[i]: for m in range(len(rows[rowid])): avgs[m]+=rows[rowid][m] for j in range(len(avgs)): avgs[j]/=len(bestmatches[i]) clusters[i]=avgs return bestmatches def main(): data = { 'friend1': ['like1', 'like2', 'like3'], 'friend2': ['like4'], 'friend3': ['like1', 'like4'], 'friend4': ['like1', 'like2', 'like3'], } possible_values = [] for d in data.values(): possible_values.extend(d) unique_possible_values = set(possible_values) rows = [] keys = data.keys() keys.sort() for friend in keys: r = [] for like in unique_possible_values: v = 1 if like in data[friend] else 0 r.append(v) rows.append(r) print kcluster(rows=rows,k=2) if __name__ == "__main__": main()