K-means Python Implementation from scratch
import random

import numpy as np
from sklearn import datasets


def Kmeans(X, K):
    m = len(X)
    X_centroid = dict.fromkeys(range(m))  # Maps each sample index to its cluster index.
    C = dict.fromkeys(range(K))           # Maps each cluster index to [centroid coordinates, count].

    # 1. Randomly initialize the K centroids by picking K distinct samples.
    rand = random.sample(range(m), K)
    for i in range(K):
        C[i] = [X[rand[i]], 0]

    old_centroids = None  # Cache of the previous centroids, used for early termination.

    # 2. Iterate at most 1000 times.
    for iteration in range(1000):
        # 3. Assign each sample to its nearest centroid. This loop determines the
        #    computational complexity: m * K * d, where d = len(X[0]).
        for i in range(m):
            dist = [np.linalg.norm(X[i] - C[j][0], 2) for j in range(K)]
            X_centroid[i] = np.argmin(dist)

        # 4. Recompute the centroid coordinates. (Notice the data is read only
        #    once per iteration for efficiency.)
        temp_coordinates_counts = [[np.zeros(len(X[0])), 0] for _ in range(K)]
        for i in range(m):
            temp_coordinates_counts[X_centroid[i]][0] += X[i]
            temp_coordinates_counts[X_centroid[i]][1] += 1
        for i in range(K):
            coordinates_sum, count = temp_coordinates_counts[i]
            if count > 0:  # Guard against an empty cluster.
                C[i] = [coordinates_sum / count, count]

        # Early termination: stop once the centroids no longer move.
        new_centroids = np.array([C[i][0] for i in range(K)])
        if old_centroids is not None and np.allclose(old_centroids, new_centroids):
            break
        old_centroids = new_centroids

    return X_centroid, C
if __name__ == '__main__':
    np.random.seed(5)
    random.seed(5)  # Also seed the stdlib random used by Kmeans() for reproducibility.
    iris = datasets.load_iris()
    X = iris.data
    K = 3

    # From scratch
    X_centroid, C = Kmeans(X, K)
    print("From Scratch: ", C)

    # From sklearn
    from sklearn.cluster import KMeans
    kmeans = KMeans(n_clusters=K)
    kmeans = kmeans.fit(X)
    print("From SKlearn: ", kmeans.cluster_centers_)
    # They match (up to cluster ordering); see the check below.
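    # The two sets of centroids agree only up to cluster ordering, so an
    # automated comparison needs to align them first. A minimal sketch of such
    # a check (illustrative, not part of the original gist): sort both centroid
    # sets lexicographically and compare within a tolerance, since each run only
    # reaches a local optimum. The atol value here is an assumption.
    ours = np.array(sorted(C[i][0].tolist() for i in range(K)))
    theirs = np.array(sorted(kmeans.cluster_centers_.tolist()))
    print("Centroids match:", np.allclose(ours, theirs, atol=1e-2))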
"""
<Note : Algorithm Properties>
1. K-means results are sensitive to initialization.
2. Complexity is O( m * K * I * d )
– m = number of points, K = number of clusters, I = number of iterations, d = number of features.
3. Guarantee: Monotonically decreases the objective function ("dist" variable in the code) until convergence.
4. Scalability : Every iteration is linear in the size of its input.
5. K-means is a special case of EM clustering. Think about it!
6. There is no accepted method to discover k!!!
"""