Created
June 30, 2016 00:24
-
-
Save xuwangyin/fea0506e260cfd0d1a1bc48d877e6c5b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
from sklearn.cluster import AffinityPropagation | |
from sklearn import metrics | |
from sklearn.datasets.samples_generator import make_blobs | |
import scipy | |
import scipy.stats | |
import numpy as np | |
from collections import Counter | |
# Kullback–Leibler divergence
# https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
# http://scipy.github.io/devdocs/generated/scipy.stats.entropy.html
def kl(p, q):
    """Return a KL-divergence-based similarity between two item lists.

    Despite the name this is a *similarity* score, not a raw KL divergence:
    it computes (1 - KL(P || Q)) over the occurrence distributions of the
    elements the two lists share, scaled by how large that overlap is
    relative to both element sets.

    Parameters
    ----------
    p, q : sequences of hashable, sortable items (e.g. item ids).

    Returns
    -------
    float
        0 when the lists share no elements; 1 for identical distributions
        over a full overlap. Can go negative when the KL divergence of the
        overlap distributions exceeds 1.
    """
    common = set(p) & set(q)
    # similarity is 0 when there are no common elements (also avoids 0/0 below)
    if not common:
        return 0
    # ratio of common elements relative to both element sets
    area_ratio = float(len(common) ** 2) / (len(set(p)) * len(set(q)))
    # Count occurrences of the common elements and build both probability
    # vectors in an explicitly shared order.  The original relied on
    # Counter(sorted(...)).values() coming out in matching orders for p
    # and q, which is only true on insertion-ordered dicts (CPython 3.7+);
    # iterating sorted(common) makes the element-wise alignment explicit
    # and portable.
    counts_p = Counter(e for e in p if e in common)
    counts_q = Counter(e for e in q if e in common)
    order = sorted(common)
    total_p = float(sum(counts_p[e] for e in order))
    total_q = float(sum(counts_q[e] for e in order))
    dist_p = [counts_p[e] / total_p for e in order]
    dist_q = [counts_q[e] / total_q for e in order]
    # scipy.stats.entropy(pk, qk) computes KL(pk || qk); turn the
    # divergence into a similarity and scale by the overlap ratio
    intersection_similarity = 1.0 - scipy.stats.entropy(dist_p, dist_q)
    return intersection_similarity * area_ratio
# parse request data file, requests[i] will be the list of requested items by user i
# each line of 22.csv is "user_id,item_id"
with open('22.csv') as f:  # with-block closes the handle (original leaked it)
    lines = f.readlines()
d = np.ndarray((len(lines), 2), np.int32)
for i, line in enumerate(lines):
    # a list comprehension (not a bare map) is required under Python 3,
    # where map() returns a lazy iterator numpy cannot assign into a row
    d[i] = [int(tok) for tok in line.strip().split(',')]
requests = {}
for i in range(d.shape[0]):
    # column 0 is the user id (kept as a string key), column 1 the item id
    key = str(d[i, 0])
    requests.setdefault(key, []).append(d[i, 1])
# for the sake of computing affinity matrix, map user ids to consecutive integer indices
# (two mirror dicts so lookups work in both directions)
userid2index = {userid: idx for idx, userid in enumerate(requests)}
index2userid = {idx: userid for userid, idx in userid2index.items()}
# compute affinity matrix
# affinity matrix element m[i,j] is the similarity between user i and user j
N = len(requests)
affinity_matrix = np.ndarray((N, N), dtype=np.float64)
for i, key in enumerate(requests):
    # progress indicator -- this is O(N^2) kl() calls; print() as a call
    # works under both Python 2 and 3 (the original `print i` is a
    # Python 3 syntax error)
    if i % 50 == 0:
        print(i)
    # kl() is not symmetric, so both (i, j) and (j, i) must be computed
    for key2 in requests:
        if key != key2:
            affinity_matrix[userid2index[key], userid2index[key2]] = kl(requests[key], requests[key2])
# self-similarity on the diagonal
for i in range(N):
    affinity_matrix[i, i] = 1
#http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AffinityPropagation.html
#https://en.wikipedia.org/wiki/Affinity_propagation
af = AffinityPropagation(affinity='precomputed').fit(affinity_matrix)
# per-cluster stats: size and mean similarity of members to the exemplar;
# n collects the cluster sizes
n = []
for cluster_id, center_index in enumerate(af.cluster_centers_indices_):
    members = af.labels_ == cluster_id
    size = members.sum()
    mean_dist = affinity_matrix[center_index, members].mean()
    print("cluster %d, center element %d, number of elements %d, average distance to center: %f" %
          (cluster_id, center_index, size, mean_dist))
    #print("distances to center: " + str(affinity_matrix[center_index, members]))
    n.append(size)
# clusters[i] is the list of elements of cluster i
# centers[i] is the center element of cluster i
centers = list(af.cluster_centers_indices_)
clusters = [np.flatnonzero(af.labels_ == cluster_id).tolist()
            for cluster_id in range(len(centers))]
# userid_clusters[i] is the list of userids in cluster i
# userid_centers[i] is the center userid in cluster i
userid_clusters = [[index2userid[member] for member in cluster]
                   for cluster in clusters]
userid_centers = [index2userid[center] for center in centers]
print(userid_clusters)
print(userid_centers)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment