Created
September 1, 2014 08:59
-
-
Save aluenkinglee/7d7a47f6f93c9353e0f7 to your computer and use it in GitHub Desktop.
该文件用来提取用户标签并形成“标签-用户”稀疏矩阵，然后使用层次聚类算法得到标签树状图
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Auth : King Lee
Time : 2014-08-20 23:48:30
This program finds similar weibo labels by applying hierarchical
clustering to a label-user sparse matrix.
'''
from math import sqrt | |
from os import listdir | |
from os.path import isfile, join | |
import pickle | |
import numpy as np | |
import json | |
'''
read the 'users' directory and get the label names and the label-user matrix
@return sp: sparse label-user matrix
        rownames: label names
        colnames: user ids
'''
def get_data():
    """Read the 'users' directory and build the label-user sparse matrix.

    Each file in 'users' whose name contains no underscore is a JSON
    document with a 'uid', an optional 'labels' list, and an optional
    'userList' of follower documents of the same shape.

    Returns:
        sp: sparse matrix as {tag_index: {user_index: count}}
        rownames: tag names ordered by tag index
        colnames: user ids ordered by user index
    """
    dir_name = "users"
    label_files = [join(dir_name, f) for f in listdir(dir_name)
                   if isfile(join(dir_name, f)) and f.find('_') == -1]
    doc_index = {}      # uid -> labels list
    tag_cnt = {}        # tag -> occurrence count
    users_index = {}    # uid -> dense user index
    tag_index = {}      # tag -> dense tag index
    index_tag = {}      # tag index -> tag
    index_usr = {}      # user index -> uid
    index = 0
    for file_name in label_files:
        raw_content = open(file_name, 'r').read()
        try:
            content = json.loads(raw_content.strip())
        except Exception:
            # skip unparsable files; do NOT remove entries from the list
            # while iterating over it (the original code did, which
            # silently skipped the file following each bad one)
            continue
        # collect labels and users, assigning each user a dense index;
        # record the reverse mapping too (the original never filled
        # index_usr, so colnames was always empty)
        if content.get('labels'):
            doc_index[content['uid']] = content['labels']
            users_index[content['uid']] = index
            index_usr[index] = content['uid']
            index += 1
        for follower_content in content.get('userList', []):
            if follower_content.get('labels'):
                doc_index[follower_content['uid']] = follower_content['labels']
                users_index[follower_content['uid']] = index
                index_usr[index] = follower_content['uid']
                index += 1
    # count how many users carry each tag
    for labels in doc_index.values():
        for tag in labels:
            tag_cnt[tag] = tag_cnt.get(tag, 0) + 1
    # keep only tags seen more than 10 times (frequency cutoff)
    index = 0
    for tag, cnt in tag_cnt.items():
        if cnt > 10:
            index_tag[index] = tag
            tag_index[tag] = index
            index += 1
    # build the tag-user sparse matrix used by hierarchical clustering
    sp = {}
    for uid, uindex in users_index.items():
        for tag in doc_index[uid]:
            if tag in tag_index:
                tindex = tag_index[tag]
                if tindex not in sp:
                    sp[tindex] = {}
                sp[tindex][uindex] = sp[tindex].get(uindex, 0) + 1
    colnames = [index_usr[i] for i in range(len(index_usr))]
    rownames = [index_tag[i] for i in range(len(index_tag))]
    return sp, rownames, colnames
'''
@param v1: dict format, {usrid: cnt, ...} -- one label's user-count vector
@param v2: dict format, {usrid: cnt, ...}
@return cosine similarity
'''
def cosine(v1, v2):
    """Cosine similarity between two sparse vectors.

    @param v1: dict {user_index: count}
    @param v2: dict {user_index: count}
    @return: cosine similarity; 0.0 when either vector has zero norm
             (the original raised ZeroDivisionError in that case)
    """
    # squared norms
    sum1sq = sum(v * v for v in v1.values())
    sum2sq = sum(v * v for v in v2.values())
    if sum1sq == 0 or sum2sq == 0:
        return 0.0
    # dot product over the union of the two key sets
    keys = set(v1) | set(v2)
    psum = sum(v1.get(k, 0) * v2.get(k, 0) for k in keys)
    return psum / (sqrt(sum1sq) * sqrt(sum2sq))
''' | |
@param v1: list format, [usr1,usr2,...] | |
@param v2: list format [usr1,usr2,...] | |
@return cos similarity | |
''' | |
def cosine2(v1, v2): | |
# for list | |
# sums of the squares | |
sum1sq = sum([pow(1, 2) for v in range(len(v1))]) | |
sum2sq = sum([pow(1, 2) for v in range(len(v2))]) | |
l1 = [k for k in v1] | |
l2 = [k for k in v2] | |
# sums of products | |
psum = sum(1 for k in range(len(set(l1)&set(l2)))) | |
c = psum/(sqrt(sum1sq) * sqrt(sum2sq)) | |
return c | |
''' | |
binary cluster node using in the hierarchical clustering method | |
''' | |
class bicluster:
    """A node in the binary tree produced by hierarchical clustering.

    Leaf nodes carry a non-negative id (the matrix row index); merged
    nodes get negative ids and record the similarity at which their two
    children were merged.
    """

    def __init__(self, vec, left=None, right=None, distance=0.0, id=None):
        # feature vector: a dict for cosine, a list for cosine2
        self.vec = vec
        # child clusters (None for leaves)
        self.left = left
        self.right = right
        # row index for leaves, negative id for merged clusters
        self.id = id
        # similarity between the two children at merge time
        self.distance = distance
''' | |
hierarchical clustering method | |
@param rows: label-user matrix, | |
@param distance: the similarity function,default is cosine | |
@return cluster | |
''' | |
def hcluster(rows, distance=cosine):
    """Agglomerative hierarchical clustering.

    Repeatedly merges the two most similar clusters until one remains.

    @param rows: {row_id: vector} label-user matrix; vectors are dicts
                 for `cosine` and lists for `cosine2`
    @param distance: similarity function (higher means more similar)
    @return: the root bicluster of the resulting tree
    """
    distances = {}          # cache: (id_a, id_b) -> similarity
    currentclusterid = -1   # merged clusters receive negative ids
    # start with one leaf cluster per row
    clust = [bicluster(rows[i], id=i) for i in rows.keys()]
    while len(clust) > 1:
        print("clustering.... %d" % len(clust))
        lowestpair = (0, 1)
        closest = distance(clust[0].vec, clust[1].vec)
        # scan all pairs for the highest similarity
        for i in range(len(clust)):
            for j in range(i + 1, len(clust)):
                pair = (clust[i].id, clust[j].id)
                if pair not in distances:
                    distances[pair] = distance(clust[i].vec, clust[j].vec)
                d = distances[pair]
                # NOTE: the original also branched on `distance == pearson`,
                # but no `pearson` exists in this file, so every call raised
                # NameError; both similarity functions here are "higher is
                # closer", so we always maximize
                if d > closest:
                    closest = d
                    lowestpair = (i, j)
        a, b = clust[lowestpair[0]], clust[lowestpair[1]]
        if distance != cosine2:
            # key-wise average of the two dict vectors
            mergevec = {}
            for k in set(a.vec) | set(b.vec):
                mergevec[k] = (a.vec.get(k, 0) + b.vec.get(k, 0)) / 2.0
        else:
            # union of the two user lists; build a NEW list -- the original
            # `v1 += clust[...].vec` extended the left cluster's vector in
            # place, corrupting it
            mergevec = list(set(list(a.vec) + list(b.vec)))
        newCluster = bicluster(mergevec, left=a, right=b,
                               distance=closest, id=currentclusterid)
        # cluster ids that weren't in the original set are negative
        currentclusterid -= 1
        # delete the later element first so the first index stays valid
        del clust[lowestpair[1]]
        del clust[lowestpair[0]]
        clust.append(newCluster)
    return clust[0]
'''
@param clust: the trained hierarchical cluster
@param labels: index-to-label list
@param n: depth of the node in the tree
@param outputfile: file object the result is written to
'''
def writeclust(clust, labels=None, n=0, outputfile=None):
    """Write the cluster tree to `outputfile`, indented by depth `n`.

    Internal nodes (negative id) are written as '+' lines, including the
    merge similarity when it exceeds 0.1; leaves are written as
    '- depth label' lines, or just the row id when no labels are given.
    """
    outputfile.write(" " * n)
    if clust.id < 0:
        # merged (internal) node
        if clust.distance > 0.1:
            outputfile.write("+ %d %s\n" % (n, str(clust.distance)))
        else:
            outputfile.write("+ %d\n" % (n))
    elif labels is None:
        outputfile.write(" %d\n" % clust.id)
    else:
        outputfile.write("- %d %s\n" % (n, labels[clust.id].encode('utf-8')))
    # recurse right first, then left, one level deeper
    if clust.right is not None:
        writeclust(clust.right, labels=labels, n=n + 1, outputfile=outputfile)
    if clust.left is not None:
        writeclust(clust.left, labels=labels, n=n + 1, outputfile=outputfile)
'''
@param clust: the trained hierarchical cluster
@param labels: index-to-label list
@param id: the ancestor of the similar labels, i.e. the first node whose
           similarity exceeds the cutoff
@param dic: global dict storing the info as:
           {ancestor_id1: [label1, label2, ...],
            ancestor_id2: [label1, label2, ...],
            ......}
'''
def helper(clust, labels, id, dic=None):
    """Collect every leaf label under `clust` into dic[id] and return the
    (count, label) pair of the most-referenced leaf in the subtree.

    @param clust: subtree root
    @param labels: index-to-label list
    @param id: key in `dic` under which leaf labels are accumulated
    @param dic: {ancestor_id: [label1, label2, ...], ...}
    @return: (cnt, label) where cnt is the total user count of the
             heaviest leaf and label is its utf-8 encoded name
    """
    # leaf nodes have a non-negative id (the row index); merged nodes are
    # negative. The original tested `clust.id > 0`, which wrongly treated
    # the leaf with row index 0 as internal and dropped its label.
    if clust.id >= 0:
        cnt = sum(v for v in clust.vec.values())
        label = labels[clust.id].encode('utf-8')
        dic[id].append(label)
        return (cnt, label)
    if clust.right is not None:
        (rcnt, rlabel) = helper(clust.right, labels, id, dic)
    else:
        (rcnt, rlabel) = (0, '')
    if clust.left is not None:
        (lcnt, llabel) = helper(clust.left, labels, id, dic)
    else:
        (lcnt, llabel) = (0, '')
    # return whichever side holds the heavier leaf
    if rcnt > lcnt:
        return (rcnt, rlabel)
    return (lcnt, llabel)
''' | |
@param clust: the trained hierarchical cluster, | |
@param labels: id to labels dictory | |
@param id : the ancestor of the similar lables which | |
the similarity is the first node greater than the cutoff | |
@param dic: global dic to store the info as: | |
{ancestor_id1: [label1,label2,...], | |
ancestor_id2:[label1,label2,...], | |
......} | |
''' | |
def toDict(clust, labels=None, dic=None):
    """Cut the cluster tree into groups of similar labels.

    Walks down from the root; at the first node whose merge similarity
    exceeds the 0.1 cutoff, collects every leaf label of that subtree
    into `dic`, keyed by the subtree's most-referenced label.

    @param clust: the trained hierarchical cluster
    @param labels: index-to-label list
    @param dic: output dict {centroid_label: [label1, label2, ...], ...}
    """
    # only merged (internal) nodes have a negative id
    if clust.id < 0:
        # the 0.1 cutoff could be made a parameter in the future
        if clust.distance > 0.1:
            dic[clust.id] = []
            # find the most-referenced label in this subtree
            (cnt, label) = helper(clust, labels, clust.id, dic)
            # re-key the collected group by that label instead of the id
            dic[label] = dic.pop(clust.id)
        else:
            if clust.right is not None:
                toDict(clust.right, labels, dic)
            if clust.left is not None:
                toDict(clust.left, labels, dic)
def printDict(dict, filename):
    """Write a {key: list} dict to `filename`, one 'key:item item ' line
    per entry.

    NOTE(review): this definition is immediately shadowed by the
    one-argument printDict defined right after it, so as the file stands
    this version is dead code.

    @param dict: {key: [item1, item2, ...]} (keys must be strings)
    @param filename: output file path
    """
    logfile = open(filename, "w+")
    # .items() instead of the Python-2-only .iteritems(); identical here
    for k, v in dict.items():
        logfile.write(k + ":")
        for item in v:
            logfile.write("%s " % item)
        logfile.write("\n")
    logfile.close()
def printDict(dict, filename=None):
    """Print a {key: list} dict to stdout, or write it to `filename`.

    This definition shadows the earlier two-argument printDict, so the
    script's final `printDict(dic, "result")` call crashed with a
    TypeError; the optional `filename` parameter restores that call while
    keeping the original one-argument behaviour.

    @param dict: {key: [item1, item2, ...]}
    @param filename: optional output file path
    """
    import sys
    if filename is not None:
        # same line format as the shadowed file-writing version
        logfile = open(filename, "w+")
        for k, v in dict.items():
            logfile.write(k + ":")
            for item in v:
                logfile.write("%s " % item)
            logfile.write("\n")
        logfile.close()
    else:
        # mirrors the original `print k, ":",` / `print item,` output
        out = sys.stdout
        for k, v in dict.items():
            out.write("%s : " % (k,))
            for item in v:
                out.write("%s " % (item,))
def save(cluster):
    """Pickle the cluster tree to 'cluster.pkl' in the working directory."""
    with open('cluster.pkl', 'wb') as fh:
        pickle.dump(cluster, fh)
def load():
    """Unpickle and return the cluster tree from 'cluster.pkl'.

    @return: the cluster, or None when the file is missing or unreadable
             (the error is printed, not raised)
    """
    try:
        # `with` guarantees the handle is closed even when pickle.load
        # raises; the original closed inside the try AFTER the load, so a
        # load failure leaked the open handle
        with open('cluster.pkl', 'rb') as fh:
            return pickle.load(fh)
    except Exception as e:
        print(e)
        return None
def main():
    """Run both clustering variants and dump their trees to files."""
    data, rownames, colnames = get_data()
    print(len(rownames))
    # dict vectors with counts -> sparse-vector cosine
    test("hc821-1", data, rownames, cosine)
    # user-id lists (counts discarded) -> set-based cosine2
    ldata = dict((k, list(v.keys())) for k, v in data.items())
    test("hc821-2", ldata, rownames, cosine2)
''' | |
{ancestor_id1: [label1,label2,...], | |
ancestor_id2:[label1,label2,...], | |
......} | |
''' | |
def test(filename, data, rownames, distance):
    """Cluster `data` with `distance` and write the tree to `filename`.

    @param filename: output file path for the indented tree dump
    @param data: {row_id: vector} label-user matrix
    @param rownames: index-to-label list
    @param distance: similarity function passed through to hcluster
    """
    cluster = hcluster(data, distance)
    with open(filename, "w+") as logfile:
        writeclust(cluster, labels=rownames, n=0, outputfile=logfile)
# global accumulator: {centroid_label: [label1, label2, ...], ...}
dic = {}
if __name__ == '__main__':
    # load() never raises -- it catches errors internally and returns
    # None -- so the original try/except around it was dead code, and
    # `rownames` was only bound inside that except branch, raising
    # NameError whenever a pickled cluster loaded successfully.
    data, rownames, colnames = get_data()
    cluster = load()
    if cluster is None:
        print("now clustering..")
        cluster = hcluster(data, cosine)
    toDict(cluster, rownames, dic)
    printDict(dic, "result")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment