leonaburime/kMeans.py

## kMeans.py
import random, pdb
from pprint import pprint

import pandas as pd, numpy as np
import sklearn
from sklearn.cluster import KMeans


labels = {0 : 'apple', 1 : 'banana'}
input = [
  # weight, color, # seeds
  [303, 3, 1],
  [370, 1, 2],
  [298, 3, 1],
  [277, 3, 1],
  [377, 4, 2],
  [299, 3, 1],
  [382, 1, 2],
  [374, 4, 6],
  [303, 4, 1],
  [309, 3, 1],
  [359, 1, 2],
  [366, 1, 4],
  [311, 3, 1],
  [302, 3, 1],
  [373, 4, 4],
  [305, 3, 1],

]

output = [
	#type
	["banana"],
	["apple"],
	["banana"],
	["banana"],
	["apple"],
	["banana"],
	["apple"],
	["apple"],
	["banana"],
	["banana"],
	["apple"],
	["apple"],
	["banana"],
	["banana"],
	["apple"],
	["banana"],
	["apple"],
]

sample_predict =  [371, 3, 6]


class kMeans:

  def __init__(self):
    self.data  = input
    self.k = 2
    self.max_iterations = 1000


  def solve(self, input=input, k = 2):

    #Not standardizing the variables now to get a better idea of the values used to run kMeans
    #input = sklearn.preprocessing.MinMaxScaler().fit_transform( np.array( input ).astype(float))

    #Sample answer 'centroids' would look like {0: [359, 1, 2], 1: [371, 3, 6]}
    centroids = self.createRandomCentroids(input, k)

    #Initializing book keeping vars and dummy 'oldCentroids'
    iterations = 0
    oldCentroids = { k : [None] for k in centroids.keys()}

    while not self.converged( oldCentroids, centroids, iterations ):
      #Solve old centroids for convergence test.
      oldCentroids = centroids


      clusters = self.findClosestCentroid(input, centroids , { k:[] for k in range(k)} )
      centroids = self.getNewCentroids( clusters)

      iterations += 1

    self.centroids = centroids


    print  "Cluster groupings:"
    pprint  ( clusters )

    print  "\nCentroids are located at \n"
    pprint ( centroids)
    self.sk_solve(input, k)


  def predict(self, input_variable=sample_predict):

    #Get the euclidean distance of each datapoint with k centroids
    dic = { k: np.linalg.norm( np.array(v)-np.array(input_variable) ) for k,v in self.centroids.items() }

    #Get the lowest index of min value put that datapoint in the appropriate cluster
    lowest_index = min( dic , key=dic.get)

    print "\nInput %s belongs in the cluster with centroid mean %s" % ( input_variable, \
          str( self.centroids[lowest_index]) )
    self.sk_predict( input_variable);


    return lowest_index


  #Evaluate the centroids and get the 'average' position to reevaluate all the clusters
  def getNewCentroids(self, clusters):

    return { k: np.mean(clusters[k], axis = 0) for k in clusters.keys() }

  def converged(self, oldCentroid, newCentroid, num_of_iterations ):

    #Lets make sure I dont go on forever
    if( num_of_iterations >= self.max_iterations ):

      print "Maximum iterations reached. Returning results...\n"
      return True

    elif all( [True if set( oldCentroid[k] ) == set( newCentroid[k] ) else False for k in newCentroid.keys()] ) :

      print "Old and new centroids have converged after %d iterations. Returning results...\n" % num_of_iterations
      return True


    return False


  #Create k random centroids - We dont care where at
  def createRandomCentroids(self, input, k=2):

    return {k:v for k,v in enumerate( random.sample(input, k) ) }


  def findClosestCentroid(self, input, centroids, clusters):

    for x in input:
      #Get the euclidean distance of each datapoint with k centroids
      dic = { k: np.linalg.norm( np.array(v)-np.array(x) ) for k,v in centroids.items() }

      #Get the lowest index of min value put that datapoint in the appropriate cluster
      lowest_index = min( dic , key=dic.get)
      clusters[lowest_index].append(x)

    return clusters

  def sk_solve(self, input, k=2):

    self.SK_Kmeans = KMeans(n_clusters=k)
    self.SK_Kmeans.fit(input)

    print "\nSKLearn KMeans: Centroids located at \n%s \n" % str( self.SK_Kmeans.cluster_centers_ )

  def sk_predict(self, input_variable):
    lowest_index = self.SK_Kmeans.predict(input_variable)
    print "\nSKLearn KMeans : Input %s belongs in the cluster with centroid mean %s\n" % ( input_variable, \
          str( self.SK_Kmeans.cluster_centers_[lowest_index][0]) )


#Example commands for tutorials
#import numpy
#numpy.linalg.norm( numpy.array([300, 2,3])-numpy.array([400, 1,6]) )

#Make sure our inputs are floats instead of ints and standardized between 0 and 1
#input = sklearn.preprocessing.MinMaxScaler().fit_transform( np.array( input ).astype(float))

#To run program follow these simple steps
#Download the module into the working directory and then type from the shell...

"""
#1. import kMeans
#2. kmeans = kMeans.kMeans()
#3. kmeans.solve()

and you should get an output that looks like this


Old and new centroids have converged after 3 iterations. Returning results...

Cluster groupings:
{0: [[303, 3, 1],
     [298, 3, 1],
     [277, 3, 1],
     [299, 3, 1],
     [303, 4, 1],
     [309, 3, 1],
     [311, 3, 1],
     [302, 3, 1],
     [305, 3, 1]],
 1: [[370, 1, 2],
     [377, 4, 2],
     [382, 1, 2],
     [374, 4, 6],
     [359, 1, 2],
     [366, 1, 4],
     [373, 4, 4]]}

Centroids are located at

{0: array([ 300.77777778,    3.11111111,    1.        ]),
 1: array([ 371.57142857,    2.28571429,    3.14285714])}

SKLearn KMeans: Centroids located at
[[ 371.57142857    2.28571429    3.14285714]
 [ 300.77777778    3.11111111    1.        ]]

"""
	import random, pdb
	from pprint import pprint

	import pandas as pd, numpy as np
	import sklearn
	from sklearn.cluster import KMeans


	labels = {0 : 'apple', 1 : 'banana'}
	input = [
	# weight, color, # seeds
	[303, 3, 1],
	[370, 1, 2],
	[298, 3, 1],
	[277, 3, 1],
	[377, 4, 2],
	[299, 3, 1],
	[382, 1, 2],
	[374, 4, 6],
	[303, 4, 1],
	[309, 3, 1],
	[359, 1, 2],
	[366, 1, 4],
	[311, 3, 1],
	[302, 3, 1],
	[373, 4, 4],
	[305, 3, 1],

	]

	output = [
	#type
	["banana"],
	["apple"],
	["banana"],
	["banana"],
	["apple"],
	["banana"],
	["apple"],
	["apple"],
	["banana"],
	["banana"],
	["apple"],
	["apple"],
	["banana"],
	["banana"],
	["apple"],
	["banana"],
	["apple"],
	]

	sample_predict = [371, 3, 6]


	class kMeans:

	def __init__(self):
	self.data = input
	self.k = 2
	self.max_iterations = 1000



	def solve(self, input=input, k = 2):

	#Not standardizing the variables now to get a better idea of the values used to run kMeans
	#input = sklearn.preprocessing.MinMaxScaler().fit_transform( np.array( input ).astype(float))

	#Sample answer 'centroids' would look like {0: [359, 1, 2], 1: [371, 3, 6]}
	centroids = self.createRandomCentroids(input, k)

	#Initializing book keeping vars and dummy 'oldCentroids'
	iterations = 0
	oldCentroids = { k : [None] for k in centroids.keys()}

	while not self.converged( oldCentroids, centroids, iterations ):
	#Solve old centroids for convergence test.
	oldCentroids = centroids


	clusters = self.findClosestCentroid(input, centroids , { k:[] for k in range(k)} )
	centroids = self.getNewCentroids( clusters)

	iterations += 1

	self.centroids = centroids


	print "Cluster groupings:"
	pprint ( clusters )

	print "\nCentroids are located at \n"
	pprint ( centroids)
	self.sk_solve(input, k)



	def predict(self, input_variable=sample_predict):

	#Get the euclidean distance of each datapoint with k centroids
	dic = { k: np.linalg.norm( np.array(v)-np.array(input_variable) ) for k,v in self.centroids.items() }

	#Get the lowest index of min value put that datapoint in the appropriate cluster
	lowest_index = min( dic , key=dic.get)

	print "\nInput %s belongs in the cluster with centroid mean %s" % ( input_variable, \
	str( self.centroids[lowest_index]) )
	self.sk_predict( input_variable);


	return lowest_index


	#Evaluate the centroids and get the 'average' position to reevaluate all the clusters
	def getNewCentroids(self, clusters):

	return { k: np.mean(clusters[k], axis = 0) for k in clusters.keys() }

	def converged(self, oldCentroid, newCentroid, num_of_iterations ):

	#Lets make sure I dont go on forever
	if( num_of_iterations >= self.max_iterations ):

	print "Maximum iterations reached. Returning results...\n"
	return True

	elif all( [True if set( oldCentroid[k] ) == set( newCentroid[k] ) else False for k in newCentroid.keys()] ) :

	print "Old and new centroids have converged after %d iterations. Returning results...\n" % num_of_iterations
	return True


	return False


	#Create k random centroids - We dont care where at
	def createRandomCentroids(self, input, k=2):

	return {k:v for k,v in enumerate( random.sample(input, k) ) }


	def findClosestCentroid(self, input, centroids, clusters):

	for x in input:
	#Get the euclidean distance of each datapoint with k centroids
	dic = { k: np.linalg.norm( np.array(v)-np.array(x) ) for k,v in centroids.items() }

	#Get the lowest index of min value put that datapoint in the appropriate cluster
	lowest_index = min( dic , key=dic.get)
	clusters[lowest_index].append(x)

	return clusters

	def sk_solve(self, input, k=2):

	self.SK_Kmeans = KMeans(n_clusters=k)
	self.SK_Kmeans.fit(input)

	print "\nSKLearn KMeans: Centroids located at \n%s \n" % str( self.SK_Kmeans.cluster_centers_ )

	def sk_predict(self, input_variable):
	lowest_index = self.SK_Kmeans.predict(input_variable)
	print "\nSKLearn KMeans : Input %s belongs in the cluster with centroid mean %s\n" % ( input_variable, \
	str( self.SK_Kmeans.cluster_centers_[lowest_index][0]) )


	#Example commands for tutorials
	#import numpy
	#numpy.linalg.norm( numpy.array([300, 2,3])-numpy.array([400, 1,6]) )

	#Make sure our inputs are floats instead of ints and standardized between 0 and 1
	#input = sklearn.preprocessing.MinMaxScaler().fit_transform( np.array( input ).astype(float))

	#To run program follow these simple steps
	#Download the module into the working directory and then type from the shell...

	"""
	#1. import kMeans
	#2. kmeans = kMeans.kMeans()
	#3. kmeans.solve()

	and you should get an output that looks like this


	Old and new centroids have converged after 3 iterations. Returning results...

	Cluster groupings:
	{0: [[303, 3, 1],
	[298, 3, 1],
	[277, 3, 1],
	[299, 3, 1],
	[303, 4, 1],
	[309, 3, 1],
	[311, 3, 1],
	[302, 3, 1],
	[305, 3, 1]],
	1: [[370, 1, 2],
	[377, 4, 2],
	[382, 1, 2],
	[374, 4, 6],
	[359, 1, 2],
	[366, 1, 4],
	[373, 4, 4]]}

	Centroids are located at

	{0: array([ 300.77777778, 3.11111111, 1. ]),
	1: array([ 371.57142857, 2.28571429, 3.14285714])}

	SKLearn KMeans: Centroids located at
	[[ 371.57142857 2.28571429 3.14285714]
	[ 300.77777778 3.11111111 1. ]]

	"""