@pmsosa
Last active May 16, 2023 17:37
K-Means Clustering Implementation
##############################################################################################
### K-Means Python Implementation ###
### http://konukoii.com/blog/2017/01/15/5-min-tutorial-k-means-clustering-in-python/ ###
##############################################################################################
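
# K-Means in brief: pick k initial centers, assign every point to its nearest
# center by Euclidean distance, recompute each center as the mean of its members,
# and repeat until the assignments stop changing.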
import random
import math
#Euclidean distance between two d-dimensional points
def eucldist(p0, p1):
    dist = 0.0
    for i in range(0, len(p0)):
        dist += (p0[i] - p1[i])**2
    return math.sqrt(dist)
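
# Illustrative check (not part of the original gist):
# eucldist((0, 0), (3, 4)) returns 5.0 -- the classic 3-4-5 right triangle.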

#K-Means Algorithm
def kmeans(k, datapoints):
    # d - Dimensionality of the datapoints
    d = len(datapoints[0])

    #Limit our iterations
    Max_Iterations = 1000
    i = 0

    cluster = [0] * len(datapoints)
    prev_cluster = [-1] * len(datapoints)

    #Randomly choose initial centers for the clusters
    cluster_centers = []
    for _ in range(0, k):
        #Alternative initialization: random coordinates instead of existing datapoints
        #new_cluster = []
        #for j in range(0, d):
        #    new_cluster += [random.randint(0, 10)]
        cluster_centers += [random.choice(datapoints)]

    #Sometimes the random centers are chosen poorly and some clusters end up empty.
    #In this particular implementation we want to force exactly k clusters.
    #To turn this feature off, simply remove "force_recalculation" from the while conditional.
    force_recalculation = False
    #Iterate until the assignments stabilize (or the iteration cap is reached)
    while (cluster != prev_cluster or force_recalculation) and i < Max_Iterations:
        prev_cluster = list(cluster)
        force_recalculation = False
        i += 1
        #Update each point's cluster allegiance
        for p in range(0, len(datapoints)):
            min_dist = float("inf")

            #Check min_dist against all centers
            for c in range(0, len(cluster_centers)):
                dist = eucldist(datapoints[p], cluster_centers[c])
                if dist < min_dist:
                    min_dist = dist
                    cluster[p] = c   # Reassign point to the new cluster
        #Update each cluster's position
        for c in range(0, len(cluster_centers)):
            new_center = [0] * d
            members = 0
            for p in range(0, len(datapoints)):
                if cluster[p] == c:   #If this point belongs to the cluster
                    for j in range(0, d):
                        new_center[j] += datapoints[p][j]
                    members += 1

            for j in range(0, d):
                if members != 0:
                    new_center[j] = new_center[j] / float(members)
                else:
                    #An empty cluster means the initial random centers were poorly chosen.
                    #Re-seed it with a random datapoint to actually force k clusters.
                    new_center = random.choice(datapoints)
                    force_recalculation = True
                    print("Forced Recalculation...")

            cluster_centers[c] = new_center
print "======== Results ========"
print "Clusters", cluster_centers
print "Iterations",i
print "Assignments", cluster


#TESTING THE PROGRAM#
if __name__ == "__main__":
    #2D - Datapoints List of n d-dimensional vectors. (For this example I already set up 2D Tuples)
    #Feel free to change to whatever size tuples you want...
    datapoints = [(3,2),(2,2),(1,2),(0,1),(1,0),(1,1),(5,6),(7,7),(9,10),(11,13),(12,12),(12,13),(13,13)]

    k = 2  # K - Number of Clusters
    kmeans(k, datapoints)
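
    # Illustrative only (not part of the original gist): eucldist works on tuples
    # of any length, so higher-dimensional data runs unchanged. For example:
    # datapoints_3d = [(1, 1, 1), (2, 1, 0), (9, 9, 8), (10, 9, 10)]
    # kmeans(2, datapoints_3d)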