aluenkinglee/Aggregation.py

## Aggregation.py
'''
Auth : King Lee
Time : 2014-08-20 23:48:30

this program is to find the similar weibo labels using hierarchical
cluster method on lable-user sparse matrix.
'''

from math import sqrt
from os import listdir
from os.path import isfile, join
import pickle

import numpy as np
import json
'''
read the 'users' dictory and get the labels' name and label-user matrix
@return sp: sparse matrix of labels-user
		rownames: labels name
'''
def get_data():
	dir_name = "users"
	label_files = [join(dir_name, f) for f in listdir(dir_name) if isfile( join(dir_name, f) ) and f.find('_') == -1]
	doc_index = {}
	tag_cnt = {}
	users_index = {}
	tag_index = {}
	index_tag = {}
	index_usr = {}

	index  = 0
	for file_name in label_files:
		raw_content = open(file_name, 'r').read()
		try:
			content = json.loads(raw_content.strip())
		except Exception, e:
			label_files.remove(file_name)
			continue

		#  get all the lables and users ,generate the user to index dictory
		#  uid to labels dictory

		if content.has_key('labels') and len(content['labels']) != 0:
			doc_index[content['uid']] = content['labels']
			users_index[content['uid']] = index
			index = index + 1
		if content.has_key('userList'):
			for follower_content in content['userList']:
				if follower_content.has_key('labels') and len(follower_content['labels']) != 0:
					doc_index[follower_content['uid']] = follower_content['labels']
					users_index[follower_content['uid']] = index
					index = index + 1
	# statistic the tag cnt
	for doc in doc_index.keys():
		for tag in doc_index[doc]:
			tag_cnt[tag] = tag_cnt.get(tag, 0) + 1

	# find the tag cnt >= 10 the 10 is a cutoff
	index = 0
	for tag , cnt in tag_cnt.iteritems():
		if cnt >10:
			index_tag[index] = tag
			tag_index[tag] = index
			index += 1

	# get the tag-user matrix to cluster by Hierarchical clustering
	# sparse matrix
	sp = {}
	for uid in users_index.keys():
		uindex = users_index[uid]
		for tag in doc_index[uid]:
			if(tag_index.has_key(tag)):
				tindex = tag_index[tag]
				if tindex not in sp.keys():
					sp[tindex] = {}
				sp[tindex][uindex] = sp[tindex].get(uindex,0) +1

	colnames = [index_usr[i] for i in range(len(index_usr))]
	rownames = [index_tag[i] for i in range(len(index_tag))]
	# print len(sp),len(rownames),len(users_index)
	return sp,rownames, colnames

'''
@param v1: dict format, {label:{usrid:cnt,....,}}
@param v2: dict format, {label:{usrid:cnt,....,}}
@return cos similarity
'''
def cosine(v1, v2):
    # for sparse vector
    # sums of the squares
    sum1sq = sum([pow(v, 2) for v in v1.values()])
    sum2sq = sum([pow(v, 2) for v in v2.values()])
    l1 = [k for k in v1.keys()]
    l1 += [k for k in v2.keys()]
    keys = set(l1)
    # sums of products
    psum = sum(v1.get(k, 0) * v2.get(k, 0) for k in keys)
    c = psum/(sqrt(sum1sq) * sqrt(sum2sq))
    return c

'''
@param v1: list format, [usr1,usr2,...]
@param v2: list format  [usr1,usr2,...]
@return cos similarity
'''
def cosine2(v1, v2):
    # for list
    # sums of the squares
    sum1sq = sum([pow(1, 2) for v in range(len(v1))])
    sum2sq = sum([pow(1, 2) for v in range(len(v2))])
    l1 = [k for k in v1]
    l2 = [k for k in v2]
    # sums of products
    psum = sum(1 for k in range(len(set(l1)&set(l2))))
    c = psum/(sqrt(sum1sq) * sqrt(sum2sq))
    return c

'''
binary cluster node using in the hierarchical clustering method
'''
class bicluster:
	def __init__(self, vec, left=None, right=None, distance=0.0, id=None):
		self.left = left
		self.right = right
		# vector is a dict
		self.vec = vec
		self.id = id
		self.distance = distance
		pass

'''
hierarchical clustering method
@param rows: label-user matrix,
@param distance: the similarity function,default is cosine

@return cluster
'''
def hcluster(rows, distance=cosine):
	distances = {}
	currentclusterid = -1

	# init the cluster by row id
	clust = [bicluster(rows[i], id =i) for i  in rows.keys()]


	while(len(clust) >1):
		print "clustering....", len(clust)
		lowestpair = (0,1)
		closest = distance(clust[0].vec , clust[1].vec)

		# loop for the smallest distance
		for i in range(len(clust)):
			for j in range(i+1,len(clust)):
				if (clust[i].id , clust[j].id) not in distances:
					distances[(clust[i].id, clust[j].id)] = distance(clust[i].vec, clust[j].vec)

				d = distances[(clust[i].id, clust[j].id)]
				if distance == pearson:
					if d < closest:
						closest = d
						lowestpair = (i , j)
				else:
					if d > closest:
						closest = d
						lowestpair = (i , j)

		if distance != cosine2:
			# caculate the aver of two clusters vector(dict representation)
			v1 = [k for k in clust[lowestpair[0]].vec.keys()]
			v1 += [k for k in clust[lowestpair[1]].vec.keys()]
			keys = set(v1)
			mergevec = {}
			for k in keys:
				mergevec[k] = (clust[lowestpair[0]].vec.get(k, 0) +
					clust[lowestpair[1]].vec.get(k,0)) / 2.0
			del v1,keys
		else:
			# caculate the aver of two clusters vector(dict representation)

			v1 = clust[lowestpair[0]].vec
			v1 += clust[lowestpair[1]].vec
			keys = set(v1)
			mergevec = list(keys)
			del v1,keys

		# create new cluster node
		newCluster = bicluster(mergevec, left=clust[lowestpair[0]],
			right =clust[lowestpair[1]],
			distance=closest,id=currentclusterid)

		#cluster ids that weren`t in the set are negative
		currentclusterid = currentclusterid -1
		# first del the latter element lowestpair[1]
		del clust[lowestpair[1]]
		# then del the first element
		del clust[lowestpair[0]]

		clust.append(newCluster)
	return clust[0]

'''
@param clust: the trained hierarchical cluster,
@param labels: id to labels dictory
@param n : depth of node in the tree
@param outputfile: write the result to the file
'''
def writeclust(clust, labels=None, n=0,outputfile=None):
	for i in range(n):
		outputfile.write(" ")
	if clust.id<0 :
		# dict1[n]=dict1.get(n,[])
		# dict1[n].append(str(clust.distance))
		# outputfile.write("+ %d %s\n" % (n, str(clust.distance)))
		if clust.distance >0.1:
			outputfile.write("+ %d %s\n" % (n, str(clust.distance)))
		else:
			outputfile.write("+ %d\n" % (n))
	else:
		if labels==None:
			outputfile.write(" %d\n" % clust.id)
		else:
			# dict1[n]=dict1.get(n,[])
			# dict1[n].append(  labels[clust.id].encode('utf-8') )
			# dict1[n].append( clust.vec )
			outputfile.write("- %d %s\n" % (n, labels[clust.id].encode('utf-8')))
	if clust.right!=None:
		writeclust(clust.right, labels=labels, n=n+1, outputfile=outputfile)
	if clust.left!=None:
		writeclust(clust.left, labels=labels, n=n+1, outputfile=outputfile)

'''
@param clust: the trained hierarchical cluster,
@param labels: id to labels dictory
@param id : the ancestor of the similar lables which
	the similarity is the first node greater than the cutoff
@param dic: global dic to store the info as:
	{ancestor_id1: [label1,label2,...],
	ancestor_id2:[label1,label2,...],
	......}
'''
def helper(clust, labels,id,dic=None):
	# note that only the leaf node'id  is greater than 0
	if clust.id>0:
		cnt = sum(v for v in clust.vec.values())
		label= labels[clust.id].encode('utf-8')
		dic[id].append(label)
		# print cnt, label.encode('utf-8')
		return (cnt, label)
	else:
		if clust.right!=None:
			(rcnt, rlabel) = helper(clust.right, labels, id, dic)
		else:
			(rcnt, rlabel) = (0,'')
		if clust.left!=None:
			(lcnt, llabel) = helper(clust.left, labels, id, dic)
		else:
			(lcnt, llabel) = (0,"")
		maxpair =(0,'')
		if rcnt > lcnt:
			maxpair = (rcnt, rlabel)
		else:
			maxpair = (lcnt, llabel)
		return maxpair

'''
@param clust: the trained hierarchical cluster,
@param labels: id to labels dictory
@param id : the ancestor of the similar lables which
	the similarity is the first node greater than the cutoff
@param dic: global dic to store the info as:
	{ancestor_id1: [label1,label2,...],
	ancestor_id2:[label1,label2,...],
	......}
'''
def toDict(clust, labels=None, dic=None):
	# note that only the leaf node'id  is greater than 0
	if clust.id<0 :
		#  find the cnetroid in the tree node
		#  this can be a param in the future
		if clust.distance >0.1:
			dic[clust.id] = []
			# 找到引用次数最多的标签
			(cnt, label) = helper(clust,labels, clust.id, dic)
			dic[label] = dic[clust.id]
			# 删除以id为key的例子
			del dic[clust.id]
		else:
			if clust.right!=None:
				toDict(clust.right,labels,dic)
			if clust.left!=None:
				toDict(clust.left,labels, dic)

def printDict(dict,filename):
	logfile =open(filename,"w+")
	for k, v in dict.iteritems():
		logfile.write(k+":")
		for item in v:
			logfile.write("%s "% item)
		logfile.write("\n")
	logfile.close()

def printDict(dict):
	for k, v in dict.iteritems():
		print k,":",
		for item in v:
			print item,
		print


def save(cluster):
	file = open('cluster.pkl','wb')
	pickle.dump(cluster,file)
	file.close()

def load():
	try:
		file = open('cluster.pkl','rb')
		cluster = pickle.load(file)
		file.close()
	except Exception, e:
		print e
		cluster = None
	return cluster

def main():
	data, rownames, colnames = get_data()
	print len(rownames)
	test("hc821-1",data,rownames,cosine)
	ldata = {k: v.keys() for k,v in data.iteritems() }
	test("hc821-2",ldata,rownames,cosine2)

'''
{ancestor_id1: [label1,label2,...],
	ancestor_id2:[label1,label2,...],
	......}
'''


def test(filename, data, rownames,distance):
	cluster = hcluster(data,distance)
	logfile =open(filename,"w+")
	writeclust(cluster, labels=rownames, n=0,outputfile=logfile)
	logfile.close()

dic={}
if __name__ == '__main__':
	try:
		cluster = load()
	except Exception, e:
		print e,"now clustering.."
		data, rownames, colnames = get_data()
		cluster = hcluster(data,cosine)

	toDict(cluster, rownames, dic)
	printDict(dic, "result")
	'''
	Auth : King Lee
	Time : 2014-08-20 23:48:30

	this program is to find the similar weibo labels using hierarchical
	cluster method on lable-user sparse matrix.
	'''

	from math import sqrt
	from os import listdir
	from os.path import isfile, join
	import pickle

	import numpy as np
	import json
	'''
	read the 'users' dictory and get the labels' name and label-user matrix
	@return sp: sparse matrix of labels-user
	rownames: labels name
	'''
	def get_data():
	dir_name = "users"
	label_files = [join(dir_name, f) for f in listdir(dir_name) if isfile( join(dir_name, f) ) and f.find('_') == -1]
	doc_index = {}
	tag_cnt = {}
	users_index = {}
	tag_index = {}
	index_tag = {}
	index_usr = {}

	index = 0
	for file_name in label_files:
	raw_content = open(file_name, 'r').read()
	try:
	content = json.loads(raw_content.strip())
	except Exception, e:
	label_files.remove(file_name)
	continue

	# get all the lables and users ,generate the user to index dictory
	# uid to labels dictory

	if content.has_key('labels') and len(content['labels']) != 0:
	doc_index[content['uid']] = content['labels']
	users_index[content['uid']] = index
	index = index + 1
	if content.has_key('userList'):
	for follower_content in content['userList']:
	if follower_content.has_key('labels') and len(follower_content['labels']) != 0:
	doc_index[follower_content['uid']] = follower_content['labels']
	users_index[follower_content['uid']] = index
	index = index + 1
	# statistic the tag cnt
	for doc in doc_index.keys():
	for tag in doc_index[doc]:
	tag_cnt[tag] = tag_cnt.get(tag, 0) + 1

	# find the tag cnt >= 10 the 10 is a cutoff
	index = 0
	for tag , cnt in tag_cnt.iteritems():
	if cnt >10:
	index_tag[index] = tag
	tag_index[tag] = index
	index += 1

	# get the tag-user matrix to cluster by Hierarchical clustering
	# sparse matrix
	sp = {}
	for uid in users_index.keys():
	uindex = users_index[uid]
	for tag in doc_index[uid]:
	if(tag_index.has_key(tag)):
	tindex = tag_index[tag]
	if tindex not in sp.keys():
	sp[tindex] = {}
	sp[tindex][uindex] = sp[tindex].get(uindex,0) +1

	colnames = [index_usr[i] for i in range(len(index_usr))]
	rownames = [index_tag[i] for i in range(len(index_tag))]
	# print len(sp),len(rownames),len(users_index)
	return sp,rownames, colnames

	'''
	@param v1: dict format, {label:{usrid:cnt,....,}}
	@param v2: dict format, {label:{usrid:cnt,....,}}
	@return cos similarity
	'''
	def cosine(v1, v2):
	# for sparse vector
	# sums of the squares
	sum1sq = sum([pow(v, 2) for v in v1.values()])
	sum2sq = sum([pow(v, 2) for v in v2.values()])
	l1 = [k for k in v1.keys()]
	l1 += [k for k in v2.keys()]
	keys = set(l1)
	# sums of products
	psum = sum(v1.get(k, 0) * v2.get(k, 0) for k in keys)
	c = psum/(sqrt(sum1sq) * sqrt(sum2sq))
	return c

	'''
	@param v1: list format, [usr1,usr2,...]
	@param v2: list format [usr1,usr2,...]
	@return cos similarity
	'''
	def cosine2(v1, v2):
	# for list
	# sums of the squares
	sum1sq = sum([pow(1, 2) for v in range(len(v1))])
	sum2sq = sum([pow(1, 2) for v in range(len(v2))])
	l1 = [k for k in v1]
	l2 = [k for k in v2]
	# sums of products
	psum = sum(1 for k in range(len(set(l1)&set(l2))))
	c = psum/(sqrt(sum1sq) * sqrt(sum2sq))
	return c

	'''
	binary cluster node using in the hierarchical clustering method
	'''
	class bicluster:
	def __init__(self, vec, left=None, right=None, distance=0.0, id=None):
	self.left = left
	self.right = right
	# vector is a dict
	self.vec = vec
	self.id = id
	self.distance = distance
	pass

	'''
	hierarchical clustering method
	@param rows: label-user matrix,
	@param distance: the similarity function,default is cosine

	@return cluster
	'''
	def hcluster(rows, distance=cosine):
	distances = {}
	currentclusterid = -1

	# init the cluster by row id
	clust = [bicluster(rows[i], id =i) for i in rows.keys()]


	while(len(clust) >1):
	print "clustering....", len(clust)
	lowestpair = (0,1)
	closest = distance(clust[0].vec , clust[1].vec)

	# loop for the smallest distance
	for i in range(len(clust)):
	for j in range(i+1,len(clust)):
	if (clust[i].id , clust[j].id) not in distances:
	distances[(clust[i].id, clust[j].id)] = distance(clust[i].vec, clust[j].vec)

	d = distances[(clust[i].id, clust[j].id)]
	if distance == pearson:
	if d < closest:
	closest = d
	lowestpair = (i , j)
	else:
	if d > closest:
	closest = d
	lowestpair = (i , j)

	if distance != cosine2:
	# caculate the aver of two clusters vector(dict representation)
	v1 = [k for k in clust[lowestpair[0]].vec.keys()]
	v1 += [k for k in clust[lowestpair[1]].vec.keys()]
	keys = set(v1)
	mergevec = {}
	for k in keys:
	mergevec[k] = (clust[lowestpair[0]].vec.get(k, 0) +
	clust[lowestpair[1]].vec.get(k,0)) / 2.0
	del v1,keys
	else:
	# caculate the aver of two clusters vector(dict representation)

	v1 = clust[lowestpair[0]].vec
	v1 += clust[lowestpair[1]].vec
	keys = set(v1)
	mergevec = list(keys)
	del v1,keys

	# create new cluster node
	newCluster = bicluster(mergevec, left=clust[lowestpair[0]],
	right =clust[lowestpair[1]],
	distance=closest,id=currentclusterid)

	#cluster ids that weren`t in the set are negative
	currentclusterid = currentclusterid -1
	# first del the latter element lowestpair[1]
	del clust[lowestpair[1]]
	# then del the first element
	del clust[lowestpair[0]]

	clust.append(newCluster)
	return clust[0]

	'''
	@param clust: the trained hierarchical cluster,
	@param labels: id to labels dictory
	@param n : depth of node in the tree
	@param outputfile: write the result to the file
	'''
	def writeclust(clust, labels=None, n=0,outputfile=None):
	for i in range(n):
	outputfile.write(" ")
	if clust.id<0 :
	# dict1[n]=dict1.get(n,[])
	# dict1[n].append(str(clust.distance))
	# outputfile.write("+ %d %s\n" % (n, str(clust.distance)))
	if clust.distance >0.1:
	outputfile.write("+ %d %s\n" % (n, str(clust.distance)))
	else:
	outputfile.write("+ %d\n" % (n))
	else:
	if labels==None:
	outputfile.write(" %d\n" % clust.id)
	else:
	# dict1[n]=dict1.get(n,[])
	# dict1[n].append( labels[clust.id].encode('utf-8') )
	# dict1[n].append( clust.vec )
	outputfile.write("- %d %s\n" % (n, labels[clust.id].encode('utf-8')))
	if clust.right!=None:
	writeclust(clust.right, labels=labels, n=n+1, outputfile=outputfile)
	if clust.left!=None:
	writeclust(clust.left, labels=labels, n=n+1, outputfile=outputfile)

	'''
	@param clust: the trained hierarchical cluster,
	@param labels: id to labels dictory
	@param id : the ancestor of the similar lables which
	the similarity is the first node greater than the cutoff
	@param dic: global dic to store the info as:
	{ancestor_id1: [label1,label2,...],
	ancestor_id2:[label1,label2,...],
	......}
	'''
	def helper(clust, labels,id,dic=None):
	# note that only the leaf node'id is greater than 0
	if clust.id>0:
	cnt = sum(v for v in clust.vec.values())
	label= labels[clust.id].encode('utf-8')
	dic[id].append(label)
	# print cnt, label.encode('utf-8')
	return (cnt, label)
	else:
	if clust.right!=None:
	(rcnt, rlabel) = helper(clust.right, labels, id, dic)
	else:
	(rcnt, rlabel) = (0,'')
	if clust.left!=None:
	(lcnt, llabel) = helper(clust.left, labels, id, dic)
	else:
	(lcnt, llabel) = (0,"")
	maxpair =(0,'')
	if rcnt > lcnt:
	maxpair = (rcnt, rlabel)
	else:
	maxpair = (lcnt, llabel)
	return maxpair

	'''
	@param clust: the trained hierarchical cluster,
	@param labels: id to labels dictory
	@param id : the ancestor of the similar lables which
	the similarity is the first node greater than the cutoff
	@param dic: global dic to store the info as:
	{ancestor_id1: [label1,label2,...],
	ancestor_id2:[label1,label2,...],
	......}
	'''
	def toDict(clust, labels=None, dic=None):
	# note that only the leaf node'id is greater than 0
	if clust.id<0 :
	# find the cnetroid in the tree node
	# this can be a param in the future
	if clust.distance >0.1:
	dic[clust.id] = []
	# 找到引用次数最多的标签
	(cnt, label) = helper(clust,labels, clust.id, dic)
	dic[label] = dic[clust.id]
	# 删除以id为key的例子
	del dic[clust.id]
	else:
	if clust.right!=None:
	toDict(clust.right,labels,dic)
	if clust.left!=None:
	toDict(clust.left,labels, dic)

	def printDict(dict,filename):
	logfile =open(filename,"w+")
	for k, v in dict.iteritems():
	logfile.write(k+":")
	for item in v:
	logfile.write("%s "% item)
	logfile.write("\n")
	logfile.close()

	def printDict(dict):
	for k, v in dict.iteritems():
	print k,":",
	for item in v:
	print item,
	print



	def save(cluster):
	file = open('cluster.pkl','wb')
	pickle.dump(cluster,file)
	file.close()

	def load():
	try:
	file = open('cluster.pkl','rb')
	cluster = pickle.load(file)
	file.close()
	except Exception, e:
	print e
	cluster = None
	return cluster

	def main():
	data, rownames, colnames = get_data()
	print len(rownames)
	test("hc821-1",data,rownames,cosine)
	ldata = {k: v.keys() for k,v in data.iteritems() }
	test("hc821-2",ldata,rownames,cosine2)

	'''
	{ancestor_id1: [label1,label2,...],
	ancestor_id2:[label1,label2,...],
	......}
	'''


	def test(filename, data, rownames,distance):
	cluster = hcluster(data,distance)
	logfile =open(filename,"w+")
	writeclust(cluster, labels=rownames, n=0,outputfile=logfile)
	logfile.close()

	dic={}
	if __name__ == '__main__':
	try:
	cluster = load()
	except Exception, e:
	print e,"now clustering.."
	data, rownames, colnames = get_data()
	cluster = hcluster(data,cosine)

	toDict(cluster, rownames, dic)
	printDict(dic, "result")