Skip to content

Instantly share code, notes, and snippets.

@nicholaskajoh
Last active May 1, 2018 08:01
Show Gist options
  • Save nicholaskajoh/a667e20925abfb5f8ad9bf748b5092fe to your computer and use it in GitHub Desktop.
Save nicholaskajoh/a667e20925abfb5f8ad9bf748b5092fe to your computer and use it in GitHub Desktop.
K Means algorithm in Python
import numpy as np
import matplotlib.pyplot as plt
def k_means(points, k, tol=0.001, max_iter=25):
    """Cluster *points* into *k* groups with Lloyd's K-Means algorithm.

    The first ``k`` points are used as the initial centroids. Each iteration
    assigns every point to its nearest centroid (Euclidean distance, ties go
    to the lowest cluster index) and then moves each centroid to the mean of
    the points assigned to it.

    Args:
        points: Sequence or array of shape ``(n_points, n_features)``.
        k: Number of clusters; must satisfy ``1 <= k <= n_points``.
        tol: Convergence threshold. Iteration stops once no centroid moves
            by more than this Euclidean distance (in data units).
        max_iter: Upper bound on the number of iterations.

    Returns:
        A ``(centroids, groups)`` tuple. ``centroids`` maps each cluster index
        to its centroid array; ``groups`` maps each cluster index to the list
        of points assigned to it in the last iteration.

    Raises:
        ValueError: If ``k`` is outside ``1..n_points``.
    """
    points = np.asarray(points)
    if not 1 <= k <= len(points):
        raise ValueError(
            "k must be between 1 and the number of points (%d), got %r"
            % (len(points), k)
        )

    # Copy as floats so the centroids never alias rows of the input array.
    centroids = {j: points[j].astype(float) for j in range(k)}
    groups = {j: [] for j in range(k)}

    for _ in range(max_iter):
        # step 1: start every iteration with empty groups
        groups = {j: [] for j in range(k)}

        # step 2: assign each point to its nearest centroid
        centroid_matrix = np.array([centroids[j] for j in range(k)])
        for point in points:
            distances = np.linalg.norm(centroid_matrix - point, axis=1)
            # argmin returns the first minimum, so ties go to the lowest index
            groups[int(np.argmin(distances))].append(point)

        # step 3: remember the current centroids for the convergence test
        old_centroids = dict(centroids)

        # step 4: move each centroid to the mean of its group. An empty group
        # keeps its previous centroid; averaging nothing would yield NaN and
        # corrupt every later distance computation.
        for j in range(k):
            if groups[j]:
                centroids[j] = np.mean(groups[j], axis=0)

        # step 5: converged when no centroid moved farther than tol. The
        # Euclidean shift is used because summing signed per-coordinate
        # changes lets opposite movements cancel each other out, and a
        # relative change is undefined for coordinates that are zero.
        if all(
            np.linalg.norm(centroids[j] - old_centroids[j]) <= tol
            for j in range(k)
        ):
            break

    return centroids, groups


def _plot_clusters(groups, centroids):
    """Scatter-plot every group, then mark each centroid with a red star."""
    for members in groups.values():
        plt.scatter([p[0] for p in members], [p[1] for p in members])
    for centre in centroids.values():
        plt.scatter(centre[0], centre[1], color='r', marker='*')
    plt.show()


data = np.array([
    [4, 3], [0, 0], [2, 4], [3, 4], [5, 4], [-2, 1], [-3, 0], [-3, -3],
    [8, 12], [11, 11], [9, 10],
])
K = 3
tol = 0.001
max_iter = 25

centroids, groups = k_means(data, K, tol, max_iter)

if __name__ == "__main__":
    # visualize with matplotlib (only when run as a script, so importing this
    # module does not open a blocking plot window)
    _plot_clusters(groups, centroids)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment