pims/cluster.py

## cluster.py
#!/usr/bin/env python

"""
 code slightly adapted from http://www.amazon.com/Programming-Collective-Intelligence-Building-Applications/dp/0596529325

"""


import random
from math import sqrt


def pearson(v1,v2):
  # Simple sums
  sum1=sum(v1)
  sum2=sum(v2)

  # Sums of the squares
  sum1Sq=sum([pow(v,2) for v in v1])
  sum2Sq=sum([pow(v,2) for v in v2])

  # Sum of the products
  pSum=sum([v1[i]*v2[i] for i in range(len(v1))])

  # Calculate r (Pearson score)
  num=pSum-(sum1*sum2/len(v1))
  den=sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1)))
  if den==0: return 0

  return 1.0-num/den

def kcluster(rows,distance=pearson,k=4):
  # Determine the minimum and maximum values for each point
  ranges=[(min([row[i] for row in rows]),max([row[i] for row in rows])) for i in range(len(rows[0]))]

  # Create k randomly placed centroids
  clusters=[[random.random()*(ranges[i][1]-ranges[i][0])+ranges[i][0] for i in range(len(rows[0]))] for j in range(k)]

  lastmatches=None
  for t in range(100):
    print 'Iteration %d' % t
    bestmatches=[[] for i in range(k)]

    # Find which centroid is the closest for each row
    for j in range(len(rows)):
      row=rows[j]
      bestmatch=0
      for i in range(k):
        d=distance(clusters[i],row)
        if d<distance(clusters[bestmatch],row): bestmatch=i
      bestmatches[bestmatch].append(j)

    # If the results are the same as last time, this is complete
    if bestmatches==lastmatches: break
    lastmatches=bestmatches

    # Move the centroids to the average of their members
    for i in range(k):
      avgs=[0.0]*len(rows[0])
      if len(bestmatches[i])>0:
        for rowid in bestmatches[i]:
          for m in range(len(rows[rowid])):
            avgs[m]+=rows[rowid][m]
        for j in range(len(avgs)):
          avgs[j]/=len(bestmatches[i])
        clusters[i]=avgs

  return bestmatches


def main():
    data = {
        'friend1': ['like1', 'like2', 'like3'],
        'friend2': ['like4'],
        'friend3': ['like1', 'like4'],
        'friend4': ['like1', 'like2', 'like3'],
    }

    possible_values = []
    for d in data.values():
        possible_values.extend(d)

    unique_possible_values = set(possible_values)

    rows = []
    keys = data.keys()
    keys.sort()
    for friend in keys:
        r = []
        for like in unique_possible_values:
            v = 1 if like in data[friend] else 0
            r.append(v)
        rows.append(r)

    print kcluster(rows=rows,k=2)
if __name__ == "__main__":
  main()
	#!/usr/bin/env python

	"""
	code slightly adapted from http://www.amazon.com/Programming-Collective-Intelligence-Building-Applications/dp/0596529325

	"""


	import random
	from math import sqrt



	def pearson(v1,v2):
	# Simple sums
	sum1=sum(v1)
	sum2=sum(v2)

	# Sums of the squares
	sum1Sq=sum([pow(v,2) for v in v1])
	sum2Sq=sum([pow(v,2) for v in v2])

	# Sum of the products
	pSum=sum([v1[i]*v2[i] for i in range(len(v1))])

	# Calculate r (Pearson score)
	num=pSum-(sum1*sum2/len(v1))
	den=sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1)))
	if den==0: return 0

	return 1.0-num/den

	def kcluster(rows,distance=pearson,k=4):
	# Determine the minimum and maximum values for each point
	ranges=[(min([row[i] for row in rows]),max([row[i] for row in rows])) for i in range(len(rows[0]))]

	# Create k randomly placed centroids
	clusters=[[random.random()*(ranges[i][1]-ranges[i][0])+ranges[i][0] for i in range(len(rows[0]))] for j in range(k)]

	lastmatches=None
	for t in range(100):
	print 'Iteration %d' % t
	bestmatches=[[] for i in range(k)]

	# Find which centroid is the closest for each row
	for j in range(len(rows)):
	row=rows[j]
	bestmatch=0
	for i in range(k):
	d=distance(clusters[i],row)
	if d<distance(clusters[bestmatch],row): bestmatch=i
	bestmatches[bestmatch].append(j)

	# If the results are the same as last time, this is complete
	if bestmatches==lastmatches: break
	lastmatches=bestmatches

	# Move the centroids to the average of their members
	for i in range(k):
	avgs=[0.0]*len(rows[0])
	if len(bestmatches[i])>0:
	for rowid in bestmatches[i]:
	for m in range(len(rows[rowid])):
	avgs[m]+=rows[rowid][m]
	for j in range(len(avgs)):
	avgs[j]/=len(bestmatches[i])
	clusters[i]=avgs

	return bestmatches



	def main():
	data = {
	'friend1': ['like1', 'like2', 'like3'],
	'friend2': ['like4'],
	'friend3': ['like1', 'like4'],
	'friend4': ['like1', 'like2', 'like3'],
	}

	possible_values = []
	for d in data.values():
	possible_values.extend(d)

	unique_possible_values = set(possible_values)

	rows = []
	keys = data.keys()
	keys.sort()
	for friend in keys:
	r = []
	for like in unique_possible_values:
	v = 1 if like in data[friend] else 0
	r.append(v)
	rows.append(r)

	print kcluster(rows=rows,k=2)
	if __name__ == "__main__":
	main()