Spectral clustering
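The script below samples points from four 2-D Gaussians, links each point to its nearest neighbours with edges weighted by a Gaussian kernel, forms the unnormalized graph Laplacian L = D - W, and recovers the clusters by running k-means on the smallest non-trivial eigenvectors of L.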
import numpy as np
from scipy.linalg import eig
from scipy.cluster.vq import kmeans2
from scipy.spatial import KDTree
from pylab import *
# Four multivariate Gaussians with the means and covariances listed below
# (covariances must be symmetric positive semi-definite)
MU = [[5, 3], [0, 0], [-2, 3], [-2, -4]]
COV = [[[4, 2], [2, 2]], [[1, 0], [0, 1]], [[2, 1], [1, 2]], [[1, 0], [0, 2]]]
ColorMask = ['k', 'b', 'g', 'r']
# Create some sample data
def _gaussian(mu, cov, pts):
    return np.random.multivariate_normal(mu, cov, pts)

def test_data(c, num):
    A = [_gaussian(mu, cov, num) for mu, cov in zip(MU[:c], COV[:c])]
    return np.concatenate(A)
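# e.g. test_data(4, 100) returns a (400, 2) array: 100 samples from each
# of the four Gaussians, stacked row-wise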
def radial_kernel(c=1.5):
    # Gaussian (RBF) similarity: inner(a, b) = exp(-||a - b||^2 / c)
    def inner(a, b):
        d = a - b
        return np.exp(-np.dot(d, d.conj()) / c)
    return inner
def mutual_knn(points, n=10, distance=radial_kernel()):
    # Build a k-nearest-neighbour graph whose edges carry kernel
    # similarities rather than raw Euclidean distances
    knn = {}
    kt = KDTree(points)
    for i, point in enumerate(points):
        # query returns the point itself first, hence n + 1; record each
        # pair at most once (i < neighbour)
        for neighbour in kt.query(point, n + 1)[1]:
            if i >= neighbour:
                continue
            knn.setdefault(i, []).append(
                (distance(point, points[neighbour]), neighbour))
    return knn
def get_distance_matrix(N, knn):
    # Symmetric weighted adjacency matrix; despite the name, the entries
    # are kernel similarities, not distances
    W = np.zeros((N, N))
    for point, nearest_neighbours in knn.items():
        for distance, neighbour in nearest_neighbours:
            W[point][neighbour] = distance
            W[neighbour][point] = distance
    return W
def cluster_points(L, c=3):
    # a sparse eigensolver is a little faster than dense eig; see
    # cluster_points_sparse below
    evals, evcts = eig(L)
    evals, evcts = evals.real, evcts.real
    # sort eigenpairs by eigenvalue (a dict keyed on eigenvalues would
    # silently drop duplicates)
    order = np.argsort(evals)
    evals, evcts = evals[order], evcts[:, order]
    # second to c-th smallest eigenvectors; the smallest belongs to the
    # trivial constant eigenvector
    Y = evcts[:, 1:c]
    res, idx = kmeans2(Y, c, minit='random')
    return evals[:15], Y, rename_clusters(idx)
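# A sparse variant (a sketch, not part of the original gist): for large
# graphs, computing only the k smallest eigenpairs with
# scipy.sparse.linalg.eigsh is faster than a full dense eig
def cluster_points_sparse(L, c=3, k=15):
    from scipy.sparse.linalg import eigsh
    evals, evcts = eigsh(L, k=k, which='SM')
    order = np.argsort(evals)  # ensure eigenpairs are in ascending order
    evals, evcts = evals[order], evcts[:, order]
    Y = evcts[:, 1:c]
    res, idx = kmeans2(Y, c, minit='random')
    return evals, Y, rename_clusters(idx)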
def rename_clusters(idx):
    # relabel clusters in order of first appearance, so that the first
    # cluster has index 0
    num = -1
    seen = {}
    newidx = []
    for label in idx:
        if label not in seen:
            num += 1
            seen[label] = num
        newidx.append(seen[label])
    return np.array(newidx)
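# For instance, rename_clusters(np.array([2, 2, 0, 1])) gives
# array([0, 0, 1, 2])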
c, num = 4, 100
D = test_data(c, num)
X, Y = D.T

# Plot the raw samples, coloured by the generating Gaussian
subplot(211)
for i in range(c):
    scatter(X[i*num:i*num+num], Y[i*num:i*num+num], color=ColorMask[i])

knn_points = mutual_knn(D)
W = get_distance_matrix(len(D), knn_points)
# degree matrix and unnormalized graph Laplacian
G = np.diag(W.sum(axis=1))
L = G - W
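# Sketch of an alternative (not part of the original gist): the symmetric
# normalized Laplacian L_sym = I - D^{-1/2} W D^{-1/2} is often preferred
# when cluster densities differ; pass L_sym instead of L below to try it
d_inv_sqrt = 1.0 / np.sqrt(np.maximum(W.sum(axis=1), 1e-12))
L_sym = np.eye(len(W)) - d_inv_sqrt[:, None] * W * d_inv_sqrt[None, :]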
evals, _Y, labels = cluster_points(L, c)

# Plot the points again, coloured by the recovered cluster label
subplot(212)
for i in range(c):
    idx = (labels == i)
    if not idx.any():
        continue
    scatter(X[idx], Y[idx], color=ColorMask[i])
show()
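# Eigengap heuristic (a sketch, not part of the original gist): the number
# of clusters is often estimated as the position of the largest gap among
# the smallest Laplacian eigenvalues returned by cluster_points
gaps = np.diff(evals[:8])
print('eigengap suggests', int(np.argmax(gaps)) + 1, 'clusters')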