Last active
August 29, 2015 14:03
-
-
Save leonaburime/10f03d208f980a4579f4 to your computer and use it in GitHub Desktop.
kMeans Algorithm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random, pdb | |
from pprint import pprint | |
import pandas as pd, numpy as np | |
import sklearn | |
from sklearn.cluster import KMeans | |
labels = {0 : 'apple', 1 : 'banana'} | |
input = [ | |
# weight, color, # seeds | |
[303, 3, 1], | |
[370, 1, 2], | |
[298, 3, 1], | |
[277, 3, 1], | |
[377, 4, 2], | |
[299, 3, 1], | |
[382, 1, 2], | |
[374, 4, 6], | |
[303, 4, 1], | |
[309, 3, 1], | |
[359, 1, 2], | |
[366, 1, 4], | |
[311, 3, 1], | |
[302, 3, 1], | |
[373, 4, 4], | |
[305, 3, 1], | |
] | |
output = [ | |
#type | |
["banana"], | |
["apple"], | |
["banana"], | |
["banana"], | |
["apple"], | |
["banana"], | |
["apple"], | |
["apple"], | |
["banana"], | |
["banana"], | |
["apple"], | |
["apple"], | |
["banana"], | |
["banana"], | |
["apple"], | |
["banana"], | |
["apple"], | |
] | |
sample_predict = [371, 3, 6] | |
class kMeans: | |
def __init__(self): | |
self.data = input | |
self.k = 2 | |
self.max_iterations = 1000 | |
def solve(self, input=input, k = 2): | |
#Not standardizing the variables now to get a better idea of the values used to run kMeans | |
#input = sklearn.preprocessing.MinMaxScaler().fit_transform( np.array( input ).astype(float)) | |
#Sample answer 'centroids' would look like {0: [359, 1, 2], 1: [371, 3, 6]} | |
centroids = self.createRandomCentroids(input, k) | |
#Initializing book keeping vars and dummy 'oldCentroids' | |
iterations = 0 | |
oldCentroids = { k : [None] for k in centroids.keys()} | |
while not self.converged( oldCentroids, centroids, iterations ): | |
#Solve old centroids for convergence test. | |
oldCentroids = centroids | |
clusters = self.findClosestCentroid(input, centroids , { k:[] for k in range(k)} ) | |
centroids = self.getNewCentroids( clusters) | |
iterations += 1 | |
self.centroids = centroids | |
print "Cluster groupings:" | |
pprint ( clusters ) | |
print "\nCentroids are located at \n" | |
pprint ( centroids) | |
self.sk_solve(input, k) | |
def predict(self, input_variable=sample_predict): | |
#Get the euclidean distance of each datapoint with k centroids | |
dic = { k: np.linalg.norm( np.array(v)-np.array(input_variable) ) for k,v in self.centroids.items() } | |
#Get the lowest index of min value put that datapoint in the appropriate cluster | |
lowest_index = min( dic , key=dic.get) | |
print "\nInput %s belongs in the cluster with centroid mean %s" % ( input_variable, \ | |
str( self.centroids[lowest_index]) ) | |
self.sk_predict( input_variable); | |
return lowest_index | |
#Evaluate the centroids and get the 'average' position to reevaluate all the clusters | |
def getNewCentroids(self, clusters): | |
return { k: np.mean(clusters[k], axis = 0) for k in clusters.keys() } | |
def converged(self, oldCentroid, newCentroid, num_of_iterations ): | |
#Lets make sure I dont go on forever | |
if( num_of_iterations >= self.max_iterations ): | |
print "Maximum iterations reached. Returning results...\n" | |
return True | |
elif all( [True if set( oldCentroid[k] ) == set( newCentroid[k] ) else False for k in newCentroid.keys()] ) : | |
print "Old and new centroids have converged after %d iterations. Returning results...\n" % num_of_iterations | |
return True | |
return False | |
#Create k random centroids - We dont care where at | |
def createRandomCentroids(self, input, k=2): | |
return {k:v for k,v in enumerate( random.sample(input, k) ) } | |
def findClosestCentroid(self, input, centroids, clusters): | |
for x in input: | |
#Get the euclidean distance of each datapoint with k centroids | |
dic = { k: np.linalg.norm( np.array(v)-np.array(x) ) for k,v in centroids.items() } | |
#Get the lowest index of min value put that datapoint in the appropriate cluster | |
lowest_index = min( dic , key=dic.get) | |
clusters[lowest_index].append(x) | |
return clusters | |
def sk_solve(self, input, k=2): | |
self.SK_Kmeans = KMeans(n_clusters=k) | |
self.SK_Kmeans.fit(input) | |
print "\nSKLearn KMeans: Centroids located at \n%s \n" % str( self.SK_Kmeans.cluster_centers_ ) | |
def sk_predict(self, input_variable): | |
lowest_index = self.SK_Kmeans.predict(input_variable) | |
print "\nSKLearn KMeans : Input %s belongs in the cluster with centroid mean %s\n" % ( input_variable, \ | |
str( self.SK_Kmeans.cluster_centers_[lowest_index][0]) ) | |
#Example commands for tutorials | |
#import numpy | |
#numpy.linalg.norm( numpy.array([300, 2,3])-numpy.array([400, 1,6]) ) | |
#Make sure our inputs are floats instead of ints and standardized between 0 and 1 | |
#input = sklearn.preprocessing.MinMaxScaler().fit_transform( np.array( input ).astype(float)) | |
#To run program follow these simple steps | |
#Download the module into the working directory and then type from the shell... | |
""" | |
#1. import kMeans | |
#2. kmeans = kMeans.kMeans() | |
#3. kmeans.solve() | |
and you should get an output that looks like this | |
Old and new centroids have converged after 3 iterations. Returning results... | |
Cluster groupings: | |
{0: [[303, 3, 1], | |
[298, 3, 1], | |
[277, 3, 1], | |
[299, 3, 1], | |
[303, 4, 1], | |
[309, 3, 1], | |
[311, 3, 1], | |
[302, 3, 1], | |
[305, 3, 1]], | |
1: [[370, 1, 2], | |
[377, 4, 2], | |
[382, 1, 2], | |
[374, 4, 6], | |
[359, 1, 2], | |
[366, 1, 4], | |
[373, 4, 4]]} | |
Centroids are located at | |
{0: array([ 300.77777778, 3.11111111, 1. ]), | |
1: array([ 371.57142857, 2.28571429, 3.14285714])} | |
SKLearn KMeans: Centroids located at | |
[[ 371.57142857 2.28571429 3.14285714] | |
[ 300.77777778 3.11111111 1. ]] | |
""" | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment