dfhe2004/gist:2818113

## gistfile1.py
import numpy as np

from scipy.linalg import eig
from scipy.cluster.vq import kmeans2
from scipy.sparse.linalg import eigen
from scipy.spatial.kdtree import KDTree

from pylab import *

# Three multivariate Gaussians with means and cov listed below
MU  = [[5,3], [0,0], [-2,3],  [-2,-4]]
COV = [[[4,2],[0,1]], [[1,0],[0,1]], [[1,2],[2,1]], [[0,1],[2,1]],]
ColorMask = ['k','b','g', 'r']


# Create some sample data
def _gaussian(mu, cov, pts):
    return np.random.multivariate_normal(mu,cov,pts)


def test_data(c,num):
    A = [_gaussian(mu,cov,num) for mu,cov in zip(MU[:c],COV[:c])]
    return np.concatenate(A)


def radial_kernel(c=1.5):
    def inner(a, b):
        d = a - b
        return np.exp(-1*(np.dot(d, d.conj())) / c)
    return inner

def mutual_knn(points, n=10, distance=radial_kernel()):
    knn = {}
    kt = KDTree(points)
    for i, point in enumerate(points):
        # cannot use euclidean distance directly
        for neighbour in kt.query(point, n + 1)[1]:
            if i >= neighbour: continue
            knn.setdefault(i, []).append(
                    (distance(point, points[neighbour]), neighbour))
    return knn

def get_distance_matrix(N,knn):
    W = np.zeros((N, N))
    for point, nearest_neighbours in knn.iteritems():
        for distance, neighbour in nearest_neighbours:
            W[point][neighbour] = distance
            W[neighbour][point] = distance
    return W


def cluster_points(L, c=3):
    # sparse eigen is a little bit faster than eig
    #evals, evcts = eigen(L, k=15, which="SM")
    evals, evcts = eig(L)
    evals, evcts = evals.real, evcts.real
    edict = dict(zip(evals, evcts.transpose()))
    evals = sorted(edict.keys())
    # second and third smallest eigenvalue + vector
    Y = np.array([edict[k] for k in evals[1:c]]).transpose()
    res, idx = kmeans2(Y, c, minit='random')
    return evals[:15], Y, rename_clusters(idx)

def rename_clusters(idx):
    # so that first cluster has index 0
    num = -1
    seen = {}
    newidx = []
    for id in idx:
        if id not in seen:
            num += 1
            seen[id] = num
        newidx.append(seen[id])
    return np.array(newidx)


c, num = 4, 100

D = test_data(c,num)
X,Y = map(array, zip(*D))

subplot(211)
for i in xrange(c): #enumerate(idx):
    scatter(X[i*num:i*num+num],Y[i*num:i*num+num],color=ColorMask[i])


knn_points = mutual_knn(D)
W = get_distance_matrix(len(D),knn_points)
G = np.diag([sum(Wi) for Wi in W])

# unnormalized graph Laplacian
L = G - W
evals, _Y, labels = cluster_points(L,c)

# Plot the points and color according to the cluster
subplot(212)
for i in xrange(c): #enumerate(idx):
    idx = (labels==i)
    if not idx.any(): continue
    scatter(X[idx],Y[idx],color=ColorMask[i])
show()
	import numpy as np

	from scipy.linalg import eig
	from scipy.cluster.vq import kmeans2
	from scipy.sparse.linalg import eigen
	from scipy.spatial.kdtree import KDTree

	from pylab import *

	# Three multivariate Gaussians with means and cov listed below
	MU = [[5,3], [0,0], [-2,3], [-2,-4]]
	COV = [[[4,2],[0,1]], [[1,0],[0,1]], [[1,2],[2,1]], [[0,1],[2,1]],]
	ColorMask = ['k','b','g', 'r']


	# Create some sample data
	def _gaussian(mu, cov, pts):
	return np.random.multivariate_normal(mu,cov,pts)


	def test_data(c,num):
	A = [_gaussian(mu,cov,num) for mu,cov in zip(MU[:c],COV[:c])]
	return np.concatenate(A)


	def radial_kernel(c=1.5):
	def inner(a, b):
	d = a - b
	return np.exp(-1*(np.dot(d, d.conj())) / c)
	return inner

	def mutual_knn(points, n=10, distance=radial_kernel()):
	knn = {}
	kt = KDTree(points)
	for i, point in enumerate(points):
	# cannot use euclidean distance directly
	for neighbour in kt.query(point, n + 1)[1]:
	if i >= neighbour: continue
	knn.setdefault(i, []).append(
	(distance(point, points[neighbour]), neighbour))
	return knn

	def get_distance_matrix(N,knn):
	W = np.zeros((N, N))
	for point, nearest_neighbours in knn.iteritems():
	for distance, neighbour in nearest_neighbours:
	W[point][neighbour] = distance
	W[neighbour][point] = distance
	return W


	def cluster_points(L, c=3):
	# sparse eigen is a little bit faster than eig
	#evals, evcts = eigen(L, k=15, which="SM")
	evals, evcts = eig(L)
	evals, evcts = evals.real, evcts.real
	edict = dict(zip(evals, evcts.transpose()))
	evals = sorted(edict.keys())
	# second and third smallest eigenvalue + vector
	Y = np.array([edict[k] for k in evals[1:c]]).transpose()
	res, idx = kmeans2(Y, c, minit='random')
	return evals[:15], Y, rename_clusters(idx)

	def rename_clusters(idx):
	# so that first cluster has index 0
	num = -1
	seen = {}
	newidx = []
	for id in idx:
	if id not in seen:
	num += 1
	seen[id] = num
	newidx.append(seen[id])
	return np.array(newidx)


	c, num = 4, 100

	D = test_data(c,num)
	X,Y = map(array, zip(*D))

	subplot(211)
	for i in xrange(c): #enumerate(idx):
	scatter(X[inum:inum+num],Y[inum:inum+num],color=ColorMask[i])


	knn_points = mutual_knn(D)
	W = get_distance_matrix(len(D),knn_points)
	G = np.diag([sum(Wi) for Wi in W])

	# unnormalized graph Laplacian
	L = G - W
	evals, _Y, labels = cluster_points(L,c)

	# Plot the points and color according to the cluster
	subplot(212)
	for i in xrange(c): #enumerate(idx):
	idx = (labels==i)
	if not idx.any(): continue
	scatter(X[idx],Y[idx],color=ColorMask[i])
	show()