Created
January 3, 2012 01:26
-
-
Save pims/1552988 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
code slightly adapted from http://www.amazon.com/Programming-Collective-Intelligence-Building-Applications/dp/0596529325 | |
""" | |
import random | |
from math import sqrt | |
def pearson(v1,v2): | |
# Simple sums | |
sum1=sum(v1) | |
sum2=sum(v2) | |
# Sums of the squares | |
sum1Sq=sum([pow(v,2) for v in v1]) | |
sum2Sq=sum([pow(v,2) for v in v2]) | |
# Sum of the products | |
pSum=sum([v1[i]*v2[i] for i in range(len(v1))]) | |
# Calculate r (Pearson score) | |
num=pSum-(sum1*sum2/len(v1)) | |
den=sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1))) | |
if den==0: return 0 | |
return 1.0-num/den | |
def kcluster(rows,distance=pearson,k=4): | |
# Determine the minimum and maximum values for each point | |
ranges=[(min([row[i] for row in rows]),max([row[i] for row in rows])) for i in range(len(rows[0]))] | |
# Create k randomly placed centroids | |
clusters=[[random.random()*(ranges[i][1]-ranges[i][0])+ranges[i][0] for i in range(len(rows[0]))] for j in range(k)] | |
lastmatches=None | |
for t in range(100): | |
print 'Iteration %d' % t | |
bestmatches=[[] for i in range(k)] | |
# Find which centroid is the closest for each row | |
for j in range(len(rows)): | |
row=rows[j] | |
bestmatch=0 | |
for i in range(k): | |
d=distance(clusters[i],row) | |
if d<distance(clusters[bestmatch],row): bestmatch=i | |
bestmatches[bestmatch].append(j) | |
# If the results are the same as last time, this is complete | |
if bestmatches==lastmatches: break | |
lastmatches=bestmatches | |
# Move the centroids to the average of their members | |
for i in range(k): | |
avgs=[0.0]*len(rows[0]) | |
if len(bestmatches[i])>0: | |
for rowid in bestmatches[i]: | |
for m in range(len(rows[rowid])): | |
avgs[m]+=rows[rowid][m] | |
for j in range(len(avgs)): | |
avgs[j]/=len(bestmatches[i]) | |
clusters[i]=avgs | |
return bestmatches | |
def main(): | |
data = { | |
'friend1': ['like1', 'like2', 'like3'], | |
'friend2': ['like4'], | |
'friend3': ['like1', 'like4'], | |
'friend4': ['like1', 'like2', 'like3'], | |
} | |
possible_values = [] | |
for d in data.values(): | |
possible_values.extend(d) | |
unique_possible_values = set(possible_values) | |
rows = [] | |
keys = data.keys() | |
keys.sort() | |
for friend in keys: | |
r = [] | |
for like in unique_possible_values: | |
v = 1 if like in data[friend] else 0 | |
r.append(v) | |
rows.append(r) | |
print kcluster(rows=rows,k=2) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment