Skip to content

Instantly share code, notes, and snippets.

@raymanfx
Created January 29, 2017 23:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save raymanfx/413e69fb669daeeb07f38e0ea86b6bba to your computer and use it in GitHub Desktop.
Save raymanfx/413e69fb669daeeb07f38e0ea86b6bba to your computer and use it in GitHub Desktop.
import math
"""
k-means algorithm
"""
data_set = [[2, 2],
[2, 1],
[1, 1],
[0, 0],
[-1, 0],
[-2, 0],
[-1, -1]]
c1 = []
c2 = []
c3 = []
cc1 = [1, 1]
cc2 = []
cc3 = []
def euclidian_distance(point1=[], point2=[]):
x1 = point1[0]
x2 = point2[0]
y1 = point1[1]
y2 = point2[1]
return math.sqrt(pow((x1-x2),2) + pow((y1-y2),2))
def assign_point_to_c(point=[]):
dist_c1 = euclidian_distance(point, cc1)
dist_c2 = euclidian_distance(point, cc2)
dist_c3 = euclidian_distance(point, cc3)
if dist_c1 < dist_c2 and dist_c1 < dist_c3:
c1.append(point)
elif dist_c2 < dist_c1 and dist_c2 < dist_c3:
c2.append(point)
elif dist_c3 < dist_c1 and dist_c3 < dist_c2:
c3.append(point)
else:
c1.append(point)
def recompute_ccs():
x_sum = 0
y_sum = 0
count = 0
for i in range(0, len(c1)):
point = c1[i]
x_sum += point[0]
y_sum += point[1]
count += 1
x_mean = x_sum / count
y_mean = y_sum / count
cc1[0] = x_mean
cc1[1] = y_mean
x_sum = 0
y_sum = 0
count = 0
for i in range(0, len(c2)):
point = c2[i]
x_sum += point[0]
y_sum += point[1]
count += 1
x_mean = x_sum / count
y_mean = y_sum / count
cc2[0] = x_mean
cc2[1] = y_mean
x_sum = 0
y_sum = 0
count = 0
for i in range(0, len(c3)):
point = c3[i]
x_sum += point[0]
y_sum += point[1]
count += 1
x_mean = x_sum / count
y_mean = y_sum / count
cc3[0] = x_mean
cc3[1] = y_mean
# find cc2
max_dist = 0
for i in range(0, len(data_set)):
point = data_set[i]
# skip cc1
if point == cc1:
continue
dist = euclidian_distance(point, cc1)
if dist > max_dist:
max_dist = dist
cc2 = [point[0], point[1]]
print("cc2 is: [" + str(cc2[0]) + ", " + str(cc2[1]) + "]");
# find cc3
max_dist1 = 0
max_dist2 = 0
for i in range(0, len(data_set)):
point = data_set[i]
# skip cc1
if point == cc1 or point == cc2:
continue
dist1 = euclidian_distance(point, cc1)
dist2 = euclidian_distance(point, cc2)
if dist1 > max_dist1 and dist2 > max_dist2:
max_dist1 = dist1
max_dist2 = dist2
cc3 = [point[0], point[1]]
print("cc3 is: [" + str(cc3[0]) + ", " + str(cc3[1]) + "]");
iterations = 0
abort = False
while abort != True:
iterations += 1
# reset clusters
c1 = []
c2 = []
c3 = []
for i in range(0, len(data_set)):
point = data_set[i]
assign_point_to_c(point)
print("[" + str(iterations) + "] ", end='')
print("instances in c1:")
for i in range(0, len(c1)):
point = c1[i]
print("[" + str(iterations) + "] ", end='')
print(str(i) + ": [" + str(point[0]) + ", " + str(point[1]) + "]");
print("[" + str(iterations) + "] ", end='')
print("instances in c2:")
for i in range(0, len(c2)):
point = c2[i]
print("[" + str(iterations) + "] ", end='')
print(str(i) + ": [" + str(point[0]) + ", " + str(point[1]) + "]");
print("[" + str(iterations) + "] ", end='')
print("instances in c3:")
for i in range(0, len(c3)):
point = c3[i]
print("[" + str(iterations) + "] ", end='')
print(str(i) + ": [" + str(point[0]) + ", " + str(point[1]) + "]");
recompute_ccs()
print("[" + str(iterations) + "] ", end='')
print("recomputed cluster centers:")
print("[" + str(iterations) + "] ", end='')
print("cc1" + ": [" + str(cc1[0]) + ", " + str(cc1[1]) + "]");
print("[" + str(iterations) + "] ", end='')
print("cc2" + ": [" + str(cc2[0]) + ", " + str(cc2[1]) + "]");
print("[" + str(iterations) + "] ", end='')
print("cc3" + ": [" + str(cc3[0]) + ", " + str(cc3[1]) + "]");
if iterations == 4:
abort = True
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment