# coding: utf-8
from collections import Counter

import numpy as np
import scipy.stats
from sklearn.cluster import AffinityPropagation

# Similarity between two request lists, based on the Kullback–Leibler divergence
# https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
# http://scipy.github.io/devdocs/generated/scipy.stats.entropy.html
def kl(p, q):
    # compute common elements
    set_p = set(p)
    set_q = set(q)
    intersection = set_p.intersection(set_q)
    # similarity is 0 when there are no common elements
    if len(intersection) == 0:
        return 0
    # ratio of common elements
    area_ratio = float(len(intersection) ** 2) / (len(set_p) * len(set_q))
    # if area_ratio < .5:
    #     return 0
    # count occurrences of common elements; index both count vectors by the
    # same sorted element order so the two distributions stay aligned
    common = sorted(intersection)
    counts_p = Counter(e for e in p if e in intersection)
    counts_q = Counter(e for e in q if e in intersection)
    intersection_p = [counts_p[e] for e in common]
    intersection_q = [counts_q[e] for e in common]
    # normalize the counts into probability distributions
    sum_p = float(sum(intersection_p))
    sum_q = float(sum(intersection_q))
    intersection_p = [e / sum_p for e in intersection_p]
    intersection_q = [e / sum_q for e in intersection_q]
    # common-elements similarity: 1 minus the KL divergence D(p || q)
    intersection_similarity = 1. - scipy.stats.entropy(intersection_p, intersection_q)
    # overall similarity
    return intersection_similarity * area_ratio
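# Minimal sanity checks on toy request lists (illustrative values, not from
# the data set): identical lists give similarity 1, disjoint lists give 0.
# Because KL divergence is unbounded, the similarity can go negative for
# strongly mismatched distributions.
assert kl([1, 1, 2], [1, 1, 2]) == 1.0
assert kl([1, 2], [3, 4]) == 0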
# parse the request data file; requests[i] will be the list of items requested by user i
lines = open('22.csv').readlines()
d = np.ndarray((len(lines), 2), np.int32)
for i, line in enumerate(lines):
    d[i] = [int(e) for e in line.strip().split(',')]
requests = {}
for i in range(d.shape[0]):
    key = str(d[i, 0])
    if key not in requests:
        requests[key] = []
    requests[key].append(d[i, 1])
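# For reference, 22.csv is expected to contain one "user_id,item_id" pair per
# line; e.g. the (hypothetical) rows
#   101,5
#   101,7
#   102,5
# would produce requests == {'101': [5, 7], '102': [5]}.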
# for the sake of computing the affinity matrix, map user ids to consecutive integer indices
userid2index = {}
index2userid = {}
for i, userid in enumerate(requests.keys()):
    userid2index[userid] = i
    index2userid[i] = userid
# compute the affinity matrix
# element m[i,j] is the similarity between user i and user j
N = len(requests)
affinity_matrix = np.ndarray((N, N), dtype=np.float64)
for i, key in enumerate(requests.keys()):
    if i % 50 == 0:
        print(i)  # progress indicator
    for key2 in requests.keys():
        if key != key2:
            affinity_matrix[userid2index[key], userid2index[key2]] = kl(requests[key], requests[key2])
for i in range(N):
    affinity_matrix[i, i] = 1
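# Note: with affinity='precomputed', scikit-learn overwrites the diagonal with
# the `preference` parameter (by default the median of the input similarities),
# so the 1s set above mostly document the intended self-similarity.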
# http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AffinityPropagation.html
# https://en.wikipedia.org/wiki/Affinity_propagation
af = AffinityPropagation(affinity='precomputed').fit(affinity_matrix)
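# If the run ends without converging (scikit-learn emits a ConvergenceWarning),
# raising the damping factor, e.g.
# AffinityPropagation(affinity='precomputed', damping=0.9), often helps.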
# per-cluster similarity stats (affinity_matrix holds similarities, not distances)
n = []
for i in range(len(af.cluster_centers_indices_)):
    print("cluster %d, center element %d, number of elements %d, average similarity to center: %f" %
          (i, af.cluster_centers_indices_[i], (af.labels_ == i).sum(),
           affinity_matrix[af.cluster_centers_indices_[i], af.labels_ == i].mean()))
    # print("similarities to center: " + str(affinity_matrix[af.cluster_centers_indices_[i], af.labels_ == i]))
    n.append((af.labels_ == i).sum())
# clusters[i] is the list of elements in cluster i
# centers[i] is the center element of cluster i
clusters = []
centers = []
for i in range(len(af.cluster_centers_indices_)):
    clusters.append(np.flatnonzero(af.labels_ == i).tolist())
    centers.append(af.cluster_centers_indices_[i])
# userid_clusters[i] is the list of user ids in cluster i
# userid_centers[i] is the center user id of cluster i
userid_clusters = []
userid_centers = []
for i in range(len(clusters)):
    userid_clusters.append([index2userid[e] for e in clusters[i]])
    userid_centers.append(index2userid[centers[i]])
print(userid_clusters)
print(userid_centers)