wy36101299/k-means.py

## k-means.py
%pylab inline
import math
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

class point:
    """A data sample with random integer coordinates.

    Attributes:
        feature: list of ``dimension`` integers, each drawn with
            ``random.randint(pmin, pmax)`` (both bounds inclusive).
        label: starts as ``None``; ``kmeans`` later stores the index of
            the centroid the point is assigned to.
    """

    def __init__(self, dimension, pmin, pmax):
        # One randint call per coordinate, in coordinate order.
        self.feature = [random.randint(pmin, pmax) for _ in range(dimension)]
        self.label = None

class label:
    """A cluster centroid with a random integer starting position.

    Attributes:
        feature: list of ``dimension`` coordinates, each initialised with
            ``random.randint(pmin, pmax)`` (both bounds inclusive).
        cluster: list of the points currently attached to this centroid;
            starts empty.
    """

    def __init__(self, dimension, pmin, pmax):
        # One randint call per coordinate, in coordinate order.
        self.feature = [random.randint(pmin, pmax) for _ in range(dimension)]
        self.cluster = []

# Create the data points; defaults: dimension=2, min=1, max=1000.
def initialization_point(sumpoint, dimension=2, pmin=1, pmax=1000):
    """Create ``sumpoint`` random points.

    Args:
        sumpoint: number of points to create.
        dimension: number of coordinates per point (default 2).
        pmin: smallest coordinate value, inclusive (default 1).
        pmax: largest coordinate value, inclusive (default 1000).

    Returns:
        A list of ``sumpoint`` new ``point`` objects.

    Note:
        The distance, centroid-update and plotting code in this file reads
        only the first two coordinates of each point.
    """
    return [point(dimension, pmin, pmax) for _ in range(sumpoint)]

# Create the centroids; defaults: dimension=2, min=1, max=1000.
def initialization_label(k, dimension=2, pmin=1, pmax=1000):
    """Create ``k`` randomly placed centroids.

    Args:
        k: number of centroids (clusters) to create.
        dimension: number of coordinates per centroid (default 2).
        pmin: smallest starting coordinate, inclusive (default 1).
        pmax: largest starting coordinate, inclusive (default 1000).

    Returns:
        A list of ``k`` new ``label`` objects, each with an empty cluster.

    Note:
        The distance, centroid-update and plotting code in this file reads
        only the first two coordinates of each centroid.
    """
    return [label(dimension, pmin, pmax) for _ in range(k)]

# Data set: 10 random points (an earlier comment said 200, but the call
# below creates 10).
points = initialization_point(10)

# Centroids: k = 2.
labels = initialization_label(2)
def plot():
    """Scatter-plot the points currently attached to every centroid.

    Reads the module-level ``labels`` list and draws each non-empty cluster
    on one shared set of axes, using the first two coordinates of every
    point.  Clusters are named "Group 1", "Group 2", ... in centroid order.
    The first two keep the colours DarkBlue and DarkGreen; further clusters
    cycle through the rest of the palette instead of being dropped.

    Call this while the clusters are populated: ``kmeans`` empties every
    ``cluster`` list at the end of each pass.

    Returns:
        None.  The figure is produced as a side effect.
    """
    palette = ['DarkBlue', 'DarkGreen', 'DarkRed', 'DarkOrange',
               'Purple', 'Gray']
    ax = None
    for index, centroid in enumerate(labels):
        if not centroid.cluster:
            # Nothing to draw for this centroid; skipping also avoids
            # asking pandas to scatter-plot an empty frame.
            continue
        frame = pd.DataFrame(
            [[member.feature[0], member.feature[1]]
             for member in centroid.cluster],
            columns=['x', 'y'])
        # Passing the previous axes back in overlays all groups on one chart.
        ax = frame.plot(kind='scatter', x='x', y='y',
                        color=palette[index % len(palette)],
                        label='Group %d' % (index + 1), ax=ax)
def kmeans():
    """Run one k-means pass over the module-level ``points`` and ``labels``.

    Step 1 assigns every point to its nearest centroid (Euclidean distance
    over the first two coordinates; ties go to the lowest centroid index).
    The clusters are then plotted.  Step 2 moves each non-empty centroid to
    the mean of its members and empties every cluster list so the next pass
    starts clean.

    Returns:
        A list with the centroid index chosen for each point, in ``points``
        order.  Callers compare consecutive results to detect convergence.
    """
    assignments = []

    # Step 1: cluster assignment.
    for pt in points:
        # hypot(x, y) = sqrt(x*x + y*y)
        distances = [
            math.hypot(centroid.feature[0] - pt.feature[0],
                       centroid.feature[1] - pt.feature[1])
            for centroid in labels
        ]
        nearest = distances.index(min(distances))
        pt.label = nearest
        labels[nearest].cluster.append(pt)
        assignments.append(nearest)

    plot()

    # Step 2: move centroids.
    for centroid in labels:
        members = centroid.cluster
        if members:
            total_x = 0
            total_y = 0
            for member in members:
                total_x += member.feature[0]
                total_y += member.feature[1]
            size = float(len(members))
            centroid.feature[0] = float(total_x) / size
            centroid.feature[1] = float(total_y) / size
        # A centroid with no members keeps its position; either way the
        # member list is reset for the next pass.
        centroid.cluster = []

    return assignments
# Iterate until two consecutive passes give the same assignment.
# kmeans() is called exactly once per loop turn: calling it in both the loop
# condition and the loop body advances the algorithm twice per turn and
# compares only every other pair of passes.
pre = kmeans()
count = 1
while True:
    current = kmeans()
    if current == pre:
        break
    pre = current
    count += 1
# Number of passes that produced a new assignment (1 if the first pass was
# already stable).
print(count)
	%pylab inline
	import math
	import random
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt

	class point:
	    """A data sample with random integer coordinates.

	    Attributes:
	        feature: list of ``dimension`` integers, each drawn with
	            ``random.randint(pmin, pmax)`` (both bounds inclusive).
	        label: starts as ``None``; ``kmeans`` later stores the index of
	            the centroid the point is assigned to.
	    """

	    def __init__(self, dimension, pmin, pmax):
	        # One randint call per coordinate, in coordinate order.
	        self.feature = [random.randint(pmin, pmax) for _ in range(dimension)]
	        self.label = None

	class label:
	    """A cluster centroid with a random integer starting position.

	    Attributes:
	        feature: list of ``dimension`` coordinates, each initialised with
	            ``random.randint(pmin, pmax)`` (both bounds inclusive).
	        cluster: list of the points currently attached to this centroid;
	            starts empty.
	    """

	    def __init__(self, dimension, pmin, pmax):
	        # One randint call per coordinate, in coordinate order.
	        self.feature = [random.randint(pmin, pmax) for _ in range(dimension)]
	        self.cluster = []

	# Create the data points; defaults: dimension=2, min=1, max=1000.
	def initialization_point(sumpoint, dimension=2, pmin=1, pmax=1000):
	    """Create ``sumpoint`` random points.

	    Args:
	        sumpoint: number of points to create.
	        dimension: number of coordinates per point (default 2).
	        pmin: smallest coordinate value, inclusive (default 1).
	        pmax: largest coordinate value, inclusive (default 1000).

	    Returns:
	        A list of ``sumpoint`` new ``point`` objects.

	    Note:
	        The distance, centroid-update and plotting code in this file reads
	        only the first two coordinates of each point.
	    """
	    return [point(dimension, pmin, pmax) for _ in range(sumpoint)]

	# Create the centroids; defaults: dimension=2, min=1, max=1000.
	def initialization_label(k, dimension=2, pmin=1, pmax=1000):
	    """Create ``k`` randomly placed centroids.

	    Args:
	        k: number of centroids (clusters) to create.
	        dimension: number of coordinates per centroid (default 2).
	        pmin: smallest starting coordinate, inclusive (default 1).
	        pmax: largest starting coordinate, inclusive (default 1000).

	    Returns:
	        A list of ``k`` new ``label`` objects, each with an empty cluster.

	    Note:
	        The distance, centroid-update and plotting code in this file reads
	        only the first two coordinates of each centroid.
	    """
	    return [label(dimension, pmin, pmax) for _ in range(k)]

	# Data set: 10 random points (an earlier comment said 200, but the call
	# below creates 10).
	points = initialization_point(10)

	# Centroids: k = 2.
	labels = initialization_label(2)
	def plot():
	    """Scatter-plot the points currently attached to every centroid.

	    Reads the module-level ``labels`` list and draws each non-empty cluster
	    on one shared set of axes, using the first two coordinates of every
	    point.  Clusters are named "Group 1", "Group 2", ... in centroid order.
	    The first two keep the colours DarkBlue and DarkGreen; further clusters
	    cycle through the rest of the palette instead of being dropped.

	    Call this while the clusters are populated: ``kmeans`` empties every
	    ``cluster`` list at the end of each pass.

	    Returns:
	        None.  The figure is produced as a side effect.
	    """
	    palette = ['DarkBlue', 'DarkGreen', 'DarkRed', 'DarkOrange',
	               'Purple', 'Gray']
	    ax = None
	    for index, centroid in enumerate(labels):
	        if not centroid.cluster:
	            # Nothing to draw for this centroid; skipping also avoids
	            # asking pandas to scatter-plot an empty frame.
	            continue
	        frame = pd.DataFrame(
	            [[member.feature[0], member.feature[1]]
	             for member in centroid.cluster],
	            columns=['x', 'y'])
	        # Passing the previous axes back in overlays all groups on one chart.
	        ax = frame.plot(kind='scatter', x='x', y='y',
	                        color=palette[index % len(palette)],
	                        label='Group %d' % (index + 1), ax=ax)
	def kmeans():
	    """Run one k-means pass over the module-level ``points`` and ``labels``.

	    Step 1 assigns every point to its nearest centroid (Euclidean distance
	    over the first two coordinates; ties go to the lowest centroid index).
	    The clusters are then plotted.  Step 2 moves each non-empty centroid to
	    the mean of its members and empties every cluster list so the next pass
	    starts clean.

	    Returns:
	        A list with the centroid index chosen for each point, in ``points``
	        order.  Callers compare consecutive results to detect convergence.
	    """
	    assignments = []

	    # Step 1: cluster assignment.
	    for pt in points:
	        # hypot(x, y) = sqrt(x*x + y*y)
	        distances = [
	            math.hypot(centroid.feature[0] - pt.feature[0],
	                       centroid.feature[1] - pt.feature[1])
	            for centroid in labels
	        ]
	        nearest = distances.index(min(distances))
	        pt.label = nearest
	        labels[nearest].cluster.append(pt)
	        assignments.append(nearest)

	    plot()

	    # Step 2: move centroids.
	    for centroid in labels:
	        members = centroid.cluster
	        if members:
	            total_x = 0
	            total_y = 0
	            for member in members:
	                total_x += member.feature[0]
	                total_y += member.feature[1]
	            size = float(len(members))
	            centroid.feature[0] = float(total_x) / size
	            centroid.feature[1] = float(total_y) / size
	        # A centroid with no members keeps its position; either way the
	        # member list is reset for the next pass.
	        centroid.cluster = []

	    return assignments
	# Iterate until two consecutive passes give the same assignment.
	# kmeans() is called exactly once per loop turn: calling it in both the loop
	# condition and the loop body advances the algorithm twice per turn and
	# compares only every other pair of passes.
	pre = kmeans()
	count = 1
	while True:
	    current = kmeans()
	    if current == pre:
	        break
	    pre = current
	    count += 1
	# Number of passes that produced a new assignment (1 if the first pass was
	# already stable).
	print(count)