kMeans Algorithm
import random
from pprint import pprint
import numpy as np
import sklearn.preprocessing
from sklearn.cluster import KMeans

labels = {0: 'apple', 1: 'banana'}
input = [
    # weight, color, # seeds
    [303, 3, 1],
    [370, 1, 2],
    [298, 3, 1],
    [277, 3, 1],
    [377, 4, 2],
    [299, 3, 1],
    [382, 1, 2],
    [374, 4, 6],
    [303, 4, 1],
    [309, 3, 1],
    [359, 1, 2],
    [366, 1, 4],
    [311, 3, 1],
    [302, 3, 1],
    [373, 4, 4],
    [305, 3, 1],
]
#Ground-truth type for each input row. kMeans never reads these (clustering is
#unsupervised); they are only here so we can eyeball the resulting clusters.
output = [
    ["banana"],
    ["apple"],
    ["banana"],
    ["banana"],
    ["apple"],
    ["banana"],
    ["apple"],
    ["apple"],
    ["banana"],
    ["banana"],
    ["apple"],
    ["apple"],
    ["banana"],
    ["banana"],
    ["apple"],
    ["banana"],
]
sample_predict = [371, 3, 6]
class kMeans:
    def __init__(self):
        self.data = input
        self.k = 2
        self.max_iterations = 1000
    def solve(self, input=input, k=2):
        #Not standardizing the variables for now, so the values kMeans runs on
        #stay recognizable. To standardize, uncomment:
        #input = sklearn.preprocessing.MinMaxScaler().fit_transform( np.array(input).astype(float) )

        #A sample 'centroids' dict would look like {0: [359, 1, 2], 1: [371, 3, 6]}
        centroids = self.createRandomCentroids(input, k)

        #Initialize bookkeeping vars and a dummy 'oldCentroids'
        iterations = 0
        oldCentroids = {key: [None] for key in centroids.keys()}

        while not self.converged(oldCentroids, centroids, iterations):
            #Save the old centroids for the convergence test
            oldCentroids = centroids
            #Assign each datapoint to its nearest centroid, then recompute the means
            clusters = self.findClosestCentroid(input, centroids, {key: [] for key in range(k)})
            centroids = self.getNewCentroids(clusters)
            iterations += 1

        self.centroids = centroids
        print "Cluster groupings:"
        pprint( clusters )
        print "\nCentroids are located at \n"
        pprint( centroids )

        #Run scikit-learn's KMeans on the same data for comparison
        self.sk_solve(input, k)
    def predict(self, input_variable=sample_predict):
        #Euclidean distance from the datapoint to each of the k centroids
        dic = {key: np.linalg.norm( np.array(v) - np.array(input_variable) ) for key, v in self.centroids.items()}
        #The key with the smallest distance is the cluster this datapoint belongs to
        lowest_index = min(dic, key=dic.get)
        print "\nInput %s belongs in the cluster with centroid mean %s" % ( input_variable,
            str( self.centroids[lowest_index] ) )
        self.sk_predict(input_variable)
        return lowest_index
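    #For the data above, predict() with the default sample_predict = [371, 3, 6]
    #should land in the heavier-fruit cluster: its distance to the centroid near
    #[371.57, 2.29, 3.14] (see the sample output below) is ~3, versus ~70 to the
    #centroid near [300.78, 3.11, 1.0].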
    #Average the datapoints in each cluster to get that cluster's new centroid
    def getNewCentroids(self, clusters):
        return {key: np.mean(clusters[key], axis=0) for key in clusters.keys()}
    def converged(self, oldCentroid, newCentroid, num_of_iterations):
        #Let's make sure we don't loop forever
        if num_of_iterations >= self.max_iterations:
            print "Maximum iterations reached. Returning results...\n"
            return True
        #Compare each centroid to its previous position, coordinate by coordinate
        elif all( np.array_equal(oldCentroid[key], newCentroid[key]) for key in newCentroid.keys() ):
            print "Old and new centroids have converged after %d iterations. Returning results...\n" % num_of_iterations
            return True
        return False
    #Pick k distinct datapoints at random as the initial centroids -
    #we don't care where they start
    def createRandomCentroids(self, input, k=2):
        return {i: v for i, v in enumerate( random.sample(input, k) )}
    def findClosestCentroid(self, input, centroids, clusters):
        for x in input:
            #Euclidean distance from the datapoint to each of the k centroids
            dic = {key: np.linalg.norm( np.array(v) - np.array(x) ) for key, v in centroids.items()}
            #Append the datapoint to the cluster whose centroid is nearest
            lowest_index = min(dic, key=dic.get)
            clusters[lowest_index].append(x)
        return clusters
    #Fit scikit-learn's KMeans on the same data so we can compare answers
    def sk_solve(self, input, k=2):
        self.SK_Kmeans = KMeans(n_clusters=k)
        self.SK_Kmeans.fit(input)
        print "\nSKLearn KMeans: Centroids located at \n%s \n" % str( self.SK_Kmeans.cluster_centers_ )
    def sk_predict(self, input_variable):
        #predict() expects a 2D array (one row per sample), so wrap the datapoint
        lowest_index = self.SK_Kmeans.predict([input_variable])[0]
        print "\nSKLearn KMeans : Input %s belongs in the cluster with centroid mean %s\n" % ( input_variable,
            str( self.SK_Kmeans.cluster_centers_[lowest_index] ) )
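#Note: scikit-learn may number the clusters differently from our implementation
#(in the sample output below its centroid rows come back in the opposite order),
#so compare clusters by their centroid coordinates, not by their integer labels.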
#Example commands for tutorials
#import numpy
#numpy.linalg.norm( numpy.array([300, 2, 3]) - numpy.array([400, 1, 6]) )
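#which works out to sqrt((-100)**2 + 1**2 + (-3)**2) = sqrt(10010) ~= 100.05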
#Make sure our inputs are floats instead of ints, standardized between 0 and 1:
#input = sklearn.preprocessing.MinMaxScaler().fit_transform( np.array(input).astype(float) )
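#As a rough sketch of what that rescaling does (each column squashed into
#[0, 1] using its own min and max), the first row [303, 3, 1] becomes:
#  (303 - 277) / (382 - 277) -> 0.2476  (weight)
#  (3 - 1) / (4 - 1)         -> 0.6667  (color)
#  (1 - 1) / (6 - 1)         -> 0.0     (# seeds)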
#To run the program, follow these simple steps:
#download the module into the working directory, then from the shell type...
"""
#1. import kMeans
#2. kmeans = kMeans.kMeans()
#3. kmeans.solve()
and you should get an output that looks like this
Old and new centroids have converged after 3 iterations. Returning results...
Cluster groupings:
{0: [[303, 3, 1],
[298, 3, 1],
[277, 3, 1],
[299, 3, 1],
[303, 4, 1],
[309, 3, 1],
[311, 3, 1],
[302, 3, 1],
[305, 3, 1]],
1: [[370, 1, 2],
[377, 4, 2],
[382, 1, 2],
[374, 4, 6],
[359, 1, 2],
[366, 1, 4],
[373, 4, 4]]}
Centroids are located at
{0: array([ 300.77777778, 3.11111111, 1. ]),
1: array([ 371.57142857, 2.28571429, 3.14285714])}
SKLearn KMeans: Centroids located at
[[ 371.57142857 2.28571429 3.14285714]
[ 300.77777778 3.11111111 1. ]]
"""