Skip to content

Instantly share code, notes, and snippets.

@pims
Created January 3, 2012 01:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pims/1552988 to your computer and use it in GitHub Desktop.
Save pims/1552988 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
"""
code slightly adapted from http://www.amazon.com/Programming-Collective-Intelligence-Building-Applications/dp/0596529325
"""
import random
from math import sqrt
def pearson(v1,v2):
# Simple sums
sum1=sum(v1)
sum2=sum(v2)
# Sums of the squares
sum1Sq=sum([pow(v,2) for v in v1])
sum2Sq=sum([pow(v,2) for v in v2])
# Sum of the products
pSum=sum([v1[i]*v2[i] for i in range(len(v1))])
# Calculate r (Pearson score)
num=pSum-(sum1*sum2/len(v1))
den=sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1)))
if den==0: return 0
return 1.0-num/den
def kcluster(rows,distance=pearson,k=4):
# Determine the minimum and maximum values for each point
ranges=[(min([row[i] for row in rows]),max([row[i] for row in rows])) for i in range(len(rows[0]))]
# Create k randomly placed centroids
clusters=[[random.random()*(ranges[i][1]-ranges[i][0])+ranges[i][0] for i in range(len(rows[0]))] for j in range(k)]
lastmatches=None
for t in range(100):
print 'Iteration %d' % t
bestmatches=[[] for i in range(k)]
# Find which centroid is the closest for each row
for j in range(len(rows)):
row=rows[j]
bestmatch=0
for i in range(k):
d=distance(clusters[i],row)
if d<distance(clusters[bestmatch],row): bestmatch=i
bestmatches[bestmatch].append(j)
# If the results are the same as last time, this is complete
if bestmatches==lastmatches: break
lastmatches=bestmatches
# Move the centroids to the average of their members
for i in range(k):
avgs=[0.0]*len(rows[0])
if len(bestmatches[i])>0:
for rowid in bestmatches[i]:
for m in range(len(rows[rowid])):
avgs[m]+=rows[rowid][m]
for j in range(len(avgs)):
avgs[j]/=len(bestmatches[i])
clusters[i]=avgs
return bestmatches
def main():
data = {
'friend1': ['like1', 'like2', 'like3'],
'friend2': ['like4'],
'friend3': ['like1', 'like4'],
'friend4': ['like1', 'like2', 'like3'],
}
possible_values = []
for d in data.values():
possible_values.extend(d)
unique_possible_values = set(possible_values)
rows = []
keys = data.keys()
keys.sort()
for friend in keys:
r = []
for like in unique_possible_values:
v = 1 if like in data[friend] else 0
r.append(v)
rows.append(r)
print kcluster(rows=rows,k=2)
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment