Skip to content

Instantly share code, notes, and snippets.

@wy36101299
Last active August 29, 2015 14:10
Show Gist options
  • Save wy36101299/5a8024fa7c26db235e97 to your computer and use it in GitHub Desktop.
Save wy36101299/5a8024fa7c26db235e97 to your computer and use it in GitHub Desktop.
k-means
%pylab inline
import math
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
class point:
    """A sample with randomly generated integer coordinates.

    Attributes:
        feature: list of ``dimension`` integers, each drawn with
            ``random.randint(pmin, pmax)`` (both bounds inclusive).
        label: starts as ``None``; this class never sets it to anything else.
    """

    def __init__(self, dimension, pmin, pmax):
        self.label = None
        # One randint call per coordinate, in coordinate order.
        self.feature = [random.randint(pmin, pmax) for _ in range(dimension)]
class label:
    """A cluster centroid with a randomly generated integer starting position.

    Attributes:
        feature: list of ``dimension`` integers, each drawn with
            ``random.randint(pmin, pmax)`` (both bounds inclusive).
        cluster: starts as an empty list; this class never fills it.
    """

    def __init__(self, dimension, pmin, pmax):
        self.cluster = []
        # One randint call per coordinate, in coordinate order.
        self.feature = [random.randint(pmin, pmax) for _ in range(dimension)]
def initialization_point(sumpoint, dimension=2, pmin=1, pmax=1000):
    """Create a list of randomly positioned points.

    Args:
        sumpoint: number of points to generate.
        dimension: number of coordinates per point (default 2).
        pmin: inclusive lower bound for every coordinate (default 1).
        pmax: inclusive upper bound for every coordinate (default 1000).

    Returns:
        A list of ``sumpoint`` freshly constructed ``point`` instances.
    """
    return [point(dimension, pmin, pmax) for _ in range(sumpoint)]
def initialization_label(k, dimension=2, pmin=1, pmax=1000):
    """Create the initial list of randomly positioned cluster centroids.

    Args:
        k: number of centroids (clusters) to generate.
        dimension: number of coordinates per centroid (default 2).
        pmin: inclusive lower bound for every coordinate (default 1).
        pmax: inclusive upper bound for every coordinate (default 1000).

    Returns:
        A list of ``k`` freshly constructed ``label`` instances.
    """
    return [label(dimension, pmin, pmax) for _ in range(k)]
# Number of random points to cluster. (An earlier comment here said 200,
# but the value actually used is 10.)
NUM_POINTS = 10
# Number of clusters (k).
NUM_CLUSTERS = 2

points = initialization_point(NUM_POINTS)
labels = initialization_label(NUM_CLUSTERS)
def plot():
    """Scatter-plot the points currently assigned to each cluster.

    Reads the module-level ``labels`` list and draws the first two
    coordinates of every point held in each label's ``cluster`` list.
    All clusters share one axes; cluster ``i`` (0-based) gets the legend
    entry ``"Group <i+1>"``. Works for any number of clusters, not just two.

    Clusters that currently hold no points are skipped, so no empty frame
    is handed to the plotting backend. If every cluster is empty, nothing
    is drawn.
    """
    # The first two colors are the ones used for "Group 1" and "Group 2";
    # additional clusters cycle through the rest of the palette.
    palette = ['DarkBlue', 'DarkGreen', 'DarkRed', 'DarkOrange',
               'Purple', 'Teal']
    ax = None
    for index, centroid in enumerate(labels):
        if not centroid.cluster:
            continue
        coords = [[p.feature[0], p.feature[1]] for p in centroid.cluster]
        frame = pd.DataFrame(coords, columns=['a', 'b'])
        # Passing ax=None on the first call lets pandas create the axes;
        # later calls draw onto that same axes.
        ax = frame.plot(
            kind='scatter',
            x='a',
            y='b',
            color=palette[index % len(palette)],
            label='Group %d' % (index + 1),
            ax=ax,
        )
def _distance(first, second):
    """Return the Euclidean distance between two coordinate sequences."""
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(first, second)))


def kmeans():
    """Run one k-means pass over the module-level ``points`` and ``labels``.

    Step 1 (cluster assignment): each point is assigned to the nearest
    centroid by Euclidean distance over all coordinates; ties go to the
    centroid with the lowest index. The point's ``label`` is set to that
    index and the point is appended to the centroid's ``cluster`` list.

    ``plot()`` is then called to draw the current assignment.

    Step 2 (move centroids): every centroid that received at least one
    point is moved to the per-coordinate mean of its points. Centroids
    with no points keep their position. Every ``cluster`` list is reset
    to empty so the next pass starts from a clean assignment.

    Returns:
        A list with one cluster index per point, in the order of
        ``points``. Callers compare successive results to detect
        convergence.

    Raises:
        ValueError: if ``labels`` is empty while ``points`` is not
            (there is no centroid to take the minimum over).
    """
    assignment = []

    # Step 1: cluster assignment.
    for sample in points:
        distances = [_distance(centroid.feature, sample.feature)
                     for centroid in labels]
        nearest = distances.index(min(distances))
        sample.label = nearest
        labels[nearest].cluster.append(sample)
        assignment.append(nearest)

    plot()

    # Step 2: move centroids.
    for centroid in labels:
        members = centroid.cluster
        if members:
            size = float(len(members))
            # Update in place so existing references to the feature list
            # keep seeing the current centroid position.
            centroid.feature[:] = [
                sum(member.feature[axis] for member in members) / size
                for axis in range(len(centroid.feature))
            ]
        # Clear the membership list so points can be reassigned next pass.
        centroid.cluster = []

    return assignment
# Repeat k-means passes until two consecutive passes produce the same
# assignment. Each loop iteration runs exactly one pass, so no result is
# computed and then thrown away.
pre = kmeans()
# Total number of kmeans() passes executed, including the final one that
# confirmed convergence.
count = 1
while True:
    current = kmeans()
    count += 1
    if current == pre:
        break
    pre = current
print(count)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment