Skip to content

Instantly share code, notes, and snippets.

@dfhe2004
Created May 28, 2012 09:21
Show Gist options
  • Save dfhe2004/2818113 to your computer and use it in GitHub Desktop.
Save dfhe2004/2818113 to your computer and use it in GitHub Desktop.
Spectral clustering
import numpy as np
from scipy.linalg import eig
from scipy.cluster.vq import kmeans2
from scipy.sparse.linalg import eigen
from scipy.spatial.kdtree import KDTree
from pylab import *
# Three multivariate Gaussians with means and cov listed below
MU = [[5,3], [0,0], [-2,3], [-2,-4]]
COV = [[[4,2],[0,1]], [[1,0],[0,1]], [[1,2],[2,1]], [[0,1],[2,1]],]
ColorMask = ['k','b','g', 'r']
# Create some sample data
def _gaussian(mu, cov, pts):
return np.random.multivariate_normal(mu,cov,pts)
def test_data(c,num):
A = [_gaussian(mu,cov,num) for mu,cov in zip(MU[:c],COV[:c])]
return np.concatenate(A)
def radial_kernel(c=1.5):
def inner(a, b):
d = a - b
return np.exp(-1*(np.dot(d, d.conj())) / c)
return inner
def mutual_knn(points, n=10, distance=radial_kernel()):
knn = {}
kt = KDTree(points)
for i, point in enumerate(points):
# cannot use euclidean distance directly
for neighbour in kt.query(point, n + 1)[1]:
if i >= neighbour: continue
knn.setdefault(i, []).append(
(distance(point, points[neighbour]), neighbour))
return knn
def get_distance_matrix(N,knn):
W = np.zeros((N, N))
for point, nearest_neighbours in knn.iteritems():
for distance, neighbour in nearest_neighbours:
W[point][neighbour] = distance
W[neighbour][point] = distance
return W
def cluster_points(L, c=3):
# sparse eigen is a little bit faster than eig
#evals, evcts = eigen(L, k=15, which="SM")
evals, evcts = eig(L)
evals, evcts = evals.real, evcts.real
edict = dict(zip(evals, evcts.transpose()))
evals = sorted(edict.keys())
# second and third smallest eigenvalue + vector
Y = np.array([edict[k] for k in evals[1:c]]).transpose()
res, idx = kmeans2(Y, c, minit='random')
return evals[:15], Y, rename_clusters(idx)
def rename_clusters(idx):
# so that first cluster has index 0
num = -1
seen = {}
newidx = []
for id in idx:
if id not in seen:
num += 1
seen[id] = num
newidx.append(seen[id])
return np.array(newidx)
c, num = 4, 100
D = test_data(c,num)
X,Y = map(array, zip(*D))
subplot(211)
for i in xrange(c): #enumerate(idx):
scatter(X[i*num:i*num+num],Y[i*num:i*num+num],color=ColorMask[i])
knn_points = mutual_knn(D)
W = get_distance_matrix(len(D),knn_points)
G = np.diag([sum(Wi) for Wi in W])
# unnormalized graph Laplacian
L = G - W
evals, _Y, labels = cluster_points(L,c)
# Plot the points and color according to the cluster
subplot(212)
for i in xrange(c): #enumerate(idx):
idx = (labels==i)
if not idx.any(): continue
scatter(X[idx],Y[idx],color=ColorMask[i])
show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment