Skip to content

Instantly share code, notes, and snippets.

@wy36101299
Last active August 29, 2015 14:12
Show Gist options
  • Save wy36101299/23a125a09a0904c24a75 to your computer and use it in GitHub Desktop.
Save wy36101299/23a125a09a0904c24a75 to your computer and use it in GitHub Desktop.
Hierarchical algorithm
import math
import numpy as np
class point:
def __init__(self,vfeature,vlabel):
self.feature = vfeature
self.label = vlabel
class cluster:
def __init__(self,numbercluster):
self.numbercluster = numbercluster
self.cluster = {}
# 算出兩兩point的集合
def setDistance(points):
pointsDistance=[]
copypoints = list(points)
for a in points:
for b in copypoints:
if a != b:
pointsDistance.append([a,b])
copypoints.remove(a)
return pointsDistance
# 看每個點的分類
def show():
for index, item in enumerate(points):
t=""
for findex, fitem in enumerate(item.feature):
t += str(findex)+':'+str(fitem)+'\t'
print(str(index)+':'+"分類"+str(item.label)+"\tfeature-"+str(t))
# 每一筆data
points=[]
#每一次分類的紀錄
timeclusterlist=[]
# 載入data
filenamme = '/Users/wy/Desktop/hdata.txt'
with open (filenamme,'r') as f:
lines = f.readlines()
for index, line in enumerate(lines):
line=line.strip()
listLine = map(float,line.split(','))
index = point(listLine,str(index))
points.append(index)
# 分類的number
linenumber = len(points)-1
# 運行Hierarchical的次數
clusternumber = len(points)
# 幾個feature
dimension = len(listLine)
# 把object轉為number 再算兩兩point距離的集合
def setcluster():
# 第一次 要初始化
if len(timeclusterlist) ==0:
d={}
clustertmp = cluster(clusternumber)
diccluster = clustertmp.cluster
for index, item in enumerate(points):
d[points[index].label] = d.get(points[index].label,[])
d[points[index].label].append(item)
clustertmp.cluster = d
timeclusterlist.append(clustertmp)
ltmp=[]
for a in timeclusterlist[len(timeclusterlist)-1].cluster.keys():
ltmp.append(a)
setdistance = setDistance(ltmp)
else:
ltmp=[]
for a in timeclusterlist[len(timeclusterlist)-1].cluster.keys():
ltmp.append(a)
setdistance = setDistance(ltmp)
return setdistance
def Hierarchical(linenumber,clusternumber):
setdistance = setcluster()
clusterDict = timeclusterlist[len(timeclusterlist)-1].cluster
tp=[]
# 找出最短路徑的集合
for point in setdistance:
listPointA = clusterDict[point[0]]
listPointB = clusterDict[point[1]]
listNumberA = len(listPointA)
listNumberB = len(listPointB)
# A團有兩個點以上,B只有一個點
if listNumberA > 1 and listNumberB == 1:
listError = []
for point in range(listNumberA):
sqError = 0
for indexfea in range(dimension):
sqError += ( listPointA[point].feature[indexfea]-listPointB[0].feature[indexfea] )**2
listerror.append(sqError)
tp.append(min(listError))
# print('a')
# A只有一個點,B團有兩個點以上
elif listNumberA == 1 and listNumberB > 1:
listError = []
for point in range(listNumberB):
sqError = 0
for indexfea in range(dimension):
sqError += ( listPointB[point].feature[indexfea]-listPointA[0].feature[indexfea] )**2
listError.append(sqError)
tp.append(min(listError))
# print('b')
# A團有兩個點以上,B團有兩個點以上
elif listNumberA > 1 and listNumberB > 1:
listError = []
for pointa in range(listNumberA):
for pointb in range(listNumberB):
sqError = 0
for indexfea in range(dimension):
sqError += ( listPointA[pointa].feature[indexfea]-listPointB[pointb].feature[indexfea] )**2
listError.append(sqError)
tp.append(min(listError))
# print('c')
# A只有一個點,B只有一個點
elif listNumberA == 1 and listNumberB == 1:
sqError = 0
for indexfea in range(dimension):
sqError += (listPointA[0].feature[indexfea]-listPointB[0].feature[indexfea])**2
tp.append(sqError)
# print('d')
else:
print('error')
indSetdistance = tp.index(min(tp))
# 印出該距離最小值
print(tp)
# 兩兩point的集合
print(setdistance)
# 第N個最短距離之集合
print(indSetdistance)
# 最短距離,point的聚合
listPointA = clusterDict[setdistance[indSetdistance][0]]
listPointB = clusterDict[setdistance[indSetdistance][1]]
listNumberA = len(listPointA)
listNumberB = len(listPointB)
if listNumberA > 1:
for point in range(listNumberA):
listPointA[point].label = str(linenumber)
else:
listPointA[0].label = str(linenumber)
if listNumberB > 1:
for point in range(listNumberB):
listPointB[point].label = str(linenumber)
else:
listPointB[0].label = str(linenumber)
# timeclusterlist存入分類紀錄
d={}
clustertmp = cluster(clusternumber)
diccluster = clustertmp.cluster
for index, item in enumerate(points):
d[points[index].label] = d.get(points[index].label,[])
d[points[index].label].append(item)
clustertmp.cluster = d
timeclusterlist.append(clustertmp)
# 列出cluster情形
show()
# run
for time in range(clusternumber-1):
linenumber+=1
Hierarchical(linenumber,time-1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment