Skip to content

Instantly share code, notes, and snippets.

@aluenkinglee
Created September 1, 2014 08:59
Show Gist options
  • Save aluenkinglee/7d7a47f6f93c9353e0f7 to your computer and use it in GitHub Desktop.
Save aluenkinglee/7d7a47f6f93c9353e0f7 to your computer and use it in GitHub Desktop.
该文件是用来提取用户标签并形成“标签-用户”稀疏矩阵，然后使用聚合算法得到标签树状图
'''
Auth : King Lee
Time : 2014-08-20 23:48:30
this program finds similar weibo labels by running hierarchical
clustering on a label-user sparse matrix.
'''
# standard library
import json
import pickle
import sys
from math import sqrt
from os import listdir
from os.path import isfile, join

# third-party
import numpy as np
def get_data(dir_name="users", cutoff=10):
    """Read per-user JSON files and build the label-user sparse matrix.

    @param dir_name: directory holding the crawled user JSON files;
        file names containing '_' are skipped (same rule as before,
        now parameterized instead of hard-coded to "users").
    @param cutoff: a tag is kept only when its user count is strictly
        greater than this value (preserves the original `cnt > 10`).
    @return (sp, rownames, colnames):
        sp       -- {tag_index: {user_index: count}} sparse matrix
        rownames -- tag names ordered by tag index
        colnames -- user ids ordered by user index (the original left
                    its reverse map empty, so colnames was always [])
    """
    label_files = [join(dir_name, f) for f in listdir(dir_name)
                   if isfile(join(dir_name, f)) and f.find('_') == -1]
    doc_index = {}    # uid -> list of labels
    tag_cnt = {}      # label -> number of users carrying it
    users_index = {}  # uid -> column index
    index_usr = {}    # column index -> uid (reverse map, now filled in)
    tag_index = {}    # label -> row index
    index_tag = {}    # row index -> label
    index = 0
    for file_name in label_files:
        # Skip unparsable files with `continue` -- the original removed
        # them from the list being iterated, which skips elements.
        try:
            with open(file_name, 'r') as fh:
                content = json.loads(fh.read().strip())
        except Exception:
            continue
        # Register the crawled user itself when it carries labels.
        if content.get('labels'):
            doc_index[content['uid']] = content['labels']
            users_index[content['uid']] = index
            index_usr[index] = content['uid']
            index += 1
        # ... and every follower in userList that carries labels.
        for follower in content.get('userList', []):
            if follower.get('labels'):
                doc_index[follower['uid']] = follower['labels']
                users_index[follower['uid']] = index
                index_usr[index] = follower['uid']
                index += 1
    # Count how many users carry each tag.
    for labels in doc_index.values():
        for tag in labels:
            tag_cnt[tag] = tag_cnt.get(tag, 0) + 1
    # Keep only tags above the cutoff and give them dense row indices.
    index = 0
    for tag, cnt in tag_cnt.items():
        if cnt > cutoff:
            index_tag[index] = tag
            tag_index[tag] = index
            index += 1
    # Build the tag-user sparse matrix {row: {col: count}} for clustering.
    sp = {}
    for uid, uindex in users_index.items():
        for tag in doc_index[uid]:
            if tag in tag_index:
                tindex = tag_index[tag]
                sp.setdefault(tindex, {})
                sp[tindex][uindex] = sp[tindex].get(uindex, 0) + 1
    colnames = [index_usr[i] for i in range(len(index_usr))]
    rownames = [index_tag[i] for i in range(len(index_tag))]
    return sp, rownames, colnames
def cosine(v1, v2):
    """Cosine similarity between two sparse vectors.

    @param v1: dict {user_index: count}
    @param v2: dict {user_index: count}
    @return cosine similarity; 0.0 when either vector is empty or
        all-zero (the original raised ZeroDivisionError there).
    """
    norm1 = sqrt(sum(v * v for v in v1.values()))
    norm2 = sqrt(sum(v * v for v in v2.values()))
    # guard the degenerate case instead of dividing by zero
    if norm1 == 0 or norm2 == 0:
        return 0.0
    # dot product over the union of keys; missing entries count as 0
    dot = sum(v1.get(k, 0) * v2.get(k, 0) for k in set(v1) | set(v2))
    return dot / (norm1 * norm2)
def cosine2(v1, v2):
    """Cosine similarity between two binary (user-id list) vectors.

    Every weight is 1, so the dot product is the intersection size and
    each norm is sqrt(len(list)) -- the original computed exactly this
    via `sum(pow(1, 2) ...)` loops.

    @param v1: list of user ids
    @param v2: list of user ids
    @return cosine similarity; 0.0 when either list is empty
        (the original raised ZeroDivisionError).
    """
    if not v1 or not v2:
        return 0.0
    shared = len(set(v1) & set(v2))
    return shared / (sqrt(len(v1)) * sqrt(len(v2)))
class bicluster:
    """Binary tree node used by the hierarchical clustering method."""

    def __init__(self, vec, left=None, right=None, distance=0.0, id=None):
        """Create a cluster node.

        vec      -- the cluster's vector (a dict for cosine, a list for
                    cosine2; see hcluster's merge step)
        left     -- left child cluster, None for a leaf
        right    -- right child cluster, None for a leaf
        distance -- similarity at which the two children were merged
        id       -- row id for leaves; merged nodes get negative ids
        """
        self.vec = vec
        self.left = left
        self.right = right
        self.distance = distance
        self.id = id
def hcluster(rows, distance=None):
    """Agglomerative hierarchical clustering over the label-user matrix.

    @param rows: {row_id: vector}; a vector is a dict {user_index: count}
        for cosine, or a list of user ids for cosine2
    @param distance: similarity function (default: cosine)
    @return root bicluster of the merge tree
    """
    # Resolve the default lazily instead of at def time.
    if distance is None:
        distance = cosine
    # Dispatch on the function's name instead of comparing objects: the
    # original compared against an undefined `pearson`, which raised
    # NameError on the first loop iteration.
    dist_name = getattr(distance, '__name__', '')
    minimize = (dist_name == 'pearson')      # pearson: smaller is closer
    list_vectors = (dist_name == 'cosine2')  # cosine2: vectors are lists
    distances = {}  # pairwise cache keyed by (id_i, id_j)
    currentclusterid = -1  # merged nodes get negative ids
    # one leaf cluster per matrix row
    clust = [bicluster(rows[i], id=i) for i in rows.keys()]
    while len(clust) > 1:
        print("clustering....", len(clust))
        lowestpair = (0, 1)
        closest = distance(clust[0].vec, clust[1].vec)
        # exhaustive scan for the best pair; results cached by id pair
        for i in range(len(clust)):
            for j in range(i + 1, len(clust)):
                key = (clust[i].id, clust[j].id)
                if key not in distances:
                    distances[key] = distance(clust[i].vec, clust[j].vec)
                d = distances[key]
                better = (d < closest) if minimize else (d > closest)
                if better:
                    closest = d
                    lowestpair = (i, j)
        a, b = clust[lowestpair[0]], clust[lowestpair[1]]
        if list_vectors:
            # list vectors: merged vector is the union of user ids.
            # Build a NEW list -- the original `v1 += ...` mutated the
            # left child's vector in place, corrupting the tree.
            mergevec = list(set(a.vec) | set(b.vec))
        else:
            # dict vectors: element-wise average over the union of keys
            mergevec = {}
            for k in set(a.vec) | set(b.vec):
                mergevec[k] = (a.vec.get(k, 0) + b.vec.get(k, 0)) / 2.0
        newCluster = bicluster(mergevec, left=a, right=b,
                               distance=closest, id=currentclusterid)
        currentclusterid -= 1
        # delete the later index first so the earlier one stays valid
        del clust[lowestpair[1]]
        del clust[lowestpair[0]]
        clust.append(newCluster)
    return clust[0]
'''
@param clust: the trained hierarchical cluster,
@param labels: id to labels dictory
@param n : depth of node in the tree
@param outputfile: write the result to the file
'''
def writeclust(clust, labels=None, n=0,outputfile=None):
    """Dump the cluster tree as an indented outline to *outputfile*.

    @param clust: root bicluster of the (sub)tree to print
    @param labels: row-index -> label name; None prints raw leaf ids
    @param n: current depth in the tree, used as indentation width
    @param outputfile: open, writable file object
    """
    # indent proportional to depth
    for i in range(n):
        outputfile.write(" ")
    if clust.id<0 :
        # internal (merged) node: ids are negative
        # dict1[n]=dict1.get(n,[])
        # dict1[n].append(str(clust.distance))
        # outputfile.write("+ %d %s\n" % (n, str(clust.distance)))
        if clust.distance >0.1:
            # only show the merge similarity when it exceeds the cutoff
            outputfile.write("+ %d %s\n" % (n, str(clust.distance)))
        else:
            outputfile.write("+ %d\n" % (n))
    else:
        # leaf node: print its label, or the bare id when no labels given
        if labels==None:
            outputfile.write(" %d\n" % clust.id)
        else:
            # dict1[n]=dict1.get(n,[])
            # dict1[n].append( labels[clust.id].encode('utf-8') )
            # dict1[n].append( clust.vec )
            # NOTE(review): .encode('utf-8') is a Python 2 idiom; under
            # Python 3 this writes bytes to a text-mode file -- confirm
            # which interpreter runs this before relying on the output.
            outputfile.write("- %d %s\n" % (n, labels[clust.id].encode('utf-8')))
    # recurse right child first, matching the original output order
    if clust.right!=None:
        writeclust(clust.right, labels=labels, n=n+1, outputfile=outputfile)
    if clust.left!=None:
        writeclust(clust.left, labels=labels, n=n+1, outputfile=outputfile)
def helper(clust, labels, id, dic=None):
    """Collect every leaf label under *clust* into dic[id] and return the
    (count, label) pair of the most-referenced leaf in the subtree.

    @param clust: subtree root (bicluster-shaped: id/vec/left/right)
    @param labels: row-index -> label name
    @param id: key in *dic* under which leaf labels are accumulated
    @param dic: {ancestor_id: [label, ...]} accumulator
    @return (cnt, label) of the leaf with the largest user count
    """
    # Leaf ids are >= 0 (hcluster assigns negative ids to merged nodes).
    # The original tested `> 0`, which misclassified the leaf with id 0
    # as internal and silently dropped its label.
    if clust.id >= 0:
        cnt = sum(v for v in clust.vec.values())
        label = labels[clust.id].encode('utf-8')
        dic[id].append(label)
        return (cnt, label)
    # internal node: recurse into both children, right side first
    if clust.right is not None:
        (rcnt, rlabel) = helper(clust.right, labels, id, dic)
    else:
        (rcnt, rlabel) = (0, '')
    if clust.left is not None:
        (lcnt, llabel) = helper(clust.left, labels, id, dic)
    else:
        (lcnt, llabel) = (0, '')
    # keep whichever side carries the larger user count (ties -> left)
    return (rcnt, rlabel) if rcnt > lcnt else (lcnt, llabel)
def toDict(clust, labels=None, dic=None):
    """Group similar labels by walking down the cluster tree.

    Descends until it finds a merged node whose similarity exceeds the
    0.1 cutoff, then stores all leaf labels of that subtree in *dic*
    keyed by the subtree's most-referenced label:
        {dominant_label1: [label, label, ...],
         dominant_label2: [label, ...], ...}

    @param clust: trained hierarchical cluster (root bicluster)
    @param labels: row-index -> label name
    @param dic: output dictionary, filled in place
    """
    # Leaves (id >= 0) carry no grouping decision of their own.
    if clust.id >= 0:
        return
    if clust.distance > 0.1:
        # Similar-enough subtree: gather its leaf labels under a
        # temporary key, then re-key the bucket by its dominant label.
        bucket = []
        dic[clust.id] = bucket
        _, top_label = helper(clust, labels, clust.id, dic)
        dic[top_label] = bucket
        # drop the temporary negative-id key
        del dic[clust.id]
    else:
        # not similar enough yet: keep descending, right side first
        if clust.right is not None:
            toDict(clust.right, labels, dic)
        if clust.left is not None:
            toDict(clust.left, labels, dic)
def printDict(dict, filename=None):
    """Write a {key: [items]} mapping, one `key:item item ...` line each.

    The file originally defined printDict twice -- a 2-argument
    file-writing version immediately shadowed by a 1-argument console
    version -- so the `printDict(dic, "result")` call in the main block
    raised TypeError.  This merged version supports both call shapes.

    @param dict: mapping of key -> iterable of items (name kept for
        backward compatibility even though it shadows the builtin)
    @param filename: output path; None writes to stdout instead
    """
    out = open(filename, "w+") if filename is not None else sys.stdout
    try:
        for k, v in dict.items():
            out.write("%s:" % k)
            for item in v:
                out.write("%s " % item)
            out.write("\n")
    finally:
        # only close handles we opened ourselves
        if filename is not None:
            out.close()
def save(cluster):
    """Pickle *cluster* to 'cluster.pkl' in the working directory.

    Uses a context manager so the handle is closed even when pickling
    raises (the original leaked the file object on error).

    @param cluster: any picklable object (the trained cluster tree)
    """
    with open('cluster.pkl', 'wb') as fh:
        pickle.dump(cluster, fh)
def load():
    """Unpickle the cached cluster from 'cluster.pkl'.

    @return the pickled cluster, or None when the file is missing or
        unreadable (the error is printed, matching the original's
        best-effort behavior, so the caller can recluster from scratch)
    """
    try:
        with open('cluster.pkl', 'rb') as fh:
            return pickle.load(fh)
    except Exception as e:
        # report and fall back; callers treat None as "no cache"
        print(e)
        return None
def main():
    """Cluster the label-user matrix with both similarity measures and
    dump each resulting tree to its own log file."""
    data, rownames, colnames = get_data()
    print(len(rownames))
    # dense dict vectors -> count-weighted cosine
    test("hc821-1", data, rownames, cosine)
    # reduce each vector to its user-id collection -> set-based cosine2
    ldata = {row: vec.keys() for row, vec in data.items()}
    test("hc821-2", ldata, rownames, cosine2)
def test(filename, data, rownames, distance):
    """Run hcluster on *data* and write the labelled tree to *filename*.

    @param filename: output path for the cluster outline
    @param data: {row_id: vector} matrix (dict or list vectors)
    @param rownames: row-index -> label name, passed to writeclust
    @param distance: similarity function handed to hcluster
    """
    cluster = hcluster(data, distance)
    # context manager closes the log file even if writeclust raises
    # (the original left the handle open on error)
    with open(filename, "w+") as logfile:
        writeclust(cluster, labels=rownames, n=0, outputfile=logfile)
# dominant-label -> [similar labels], filled by toDict below
dic = {}
if __name__ == '__main__':
    # load() never raises -- it catches internally and returns None --
    # so the original try/except recovery branch was dead code: with no
    # cached pickle, cluster stayed None and rownames was never bound,
    # crashing toDict.  Test the return value explicitly instead.
    cluster = load()
    # rownames is needed by toDict whether or not the cache hit
    data, rownames, colnames = get_data()
    if cluster is None:
        print("no cached cluster, now clustering..")
        cluster = hcluster(data, cosine)
        # cache the result for the next run (save() was defined but
        # never called anywhere in the original file)
        save(cluster)
    toDict(cluster, rownames, dic)
    printDict(dic, "result")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment