Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Kernel K-means.
"""Kernel K-means"""
# Author: Mathieu Blondel <mathieu@mblondel.org>
# License: BSD 3 clause
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.utils import check_random_state
class KernelKMeans(BaseEstimator, ClusterMixin):
"""
Kernel K-means
Reference
---------
Kernel k-means, Spectral Clustering and Normalized Cuts.
Inderjit S. Dhillon, Yuqiang Guan, Brian Kulis.
KDD 2004.
"""
def __init__(self, n_clusters=3, max_iter=50, tol=1e-3, random_state=None,
kernel="linear", gamma=None, degree=3, coef0=1,
kernel_params=None, verbose=0):
self.n_clusters = n_clusters
self.max_iter = max_iter
self.tol = tol
self.random_state = random_state
self.kernel = kernel
self.gamma = gamma
self.degree = degree
self.coef0 = coef0
self.kernel_params = kernel_params
self.verbose = verbose
@property
def _pairwise(self):
return self.kernel == "precomputed"
def _get_kernel(self, X, Y=None):
if callable(self.kernel):
params = self.kernel_params or {}
else:
params = {"gamma": self.gamma,
"degree": self.degree,
"coef0": self.coef0}
return pairwise_kernels(X, Y, metric=self.kernel,
filter_params=True, **params)
def fit(self, X, y=None, sample_weight=None):
n_samples = X.shape[0]
K = self._get_kernel(X)
sw = sample_weight if sample_weight else np.ones(n_samples)
self.sample_weight_ = sw
rs = check_random_state(self.random_state)
self.labels_ = rs.randint(self.n_clusters, size=n_samples)
dist = np.zeros((n_samples, self.n_clusters))
self.within_distances_ = np.zeros(self.n_clusters)
for it in xrange(self.max_iter):
dist.fill(0)
self._compute_dist(K, dist, self.within_distances_,
update_within=True)
labels_old = self.labels_
self.labels_ = dist.argmin(axis=1)
# Compute the number of samples whose cluster did not change
# since last iteration.
n_same = np.sum((self.labels_ - labels_old) == 0)
if 1 - float(n_same) / n_samples < self.tol:
if self.verbose:
print "Converged at iteration", it + 1
break
self.X_fit_ = X
return self
def _compute_dist(self, K, dist, within_distances, update_within):
"""Compute a n_samples x n_clusters distance matrix using the
kernel trick."""
sw = self.sample_weight_
for j in xrange(self.n_clusters):
mask = self.labels_ == j
if np.sum(mask) == 0:
raise ValueError("Empty cluster found, try smaller n_cluster.")
denom = sw[mask].sum()
denomsq = denom * denom
if update_within:
KK = K[mask][:, mask] # K[mask, mask] does not work.
dist_j = np.sum(np.outer(sw[mask], sw[mask]) * KK / denomsq)
within_distances[j] = dist_j
dist[:, j] += dist_j
else:
dist[:, j] += within_distances[j]
dist[:, j] -= 2 * np.sum(sw[mask] * K[:, mask], axis=1) / denom
def predict(self, X):
K = self._get_kernel(X, self.X_fit_)
n_samples = X.shape[0]
dist = np.zeros((n_samples, self.n_clusters))
self._compute_dist(K, dist, self.within_distances_,
update_within=False)
return dist.argmin(axis=1)
if __name__ == '__main__':
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=1000, centers=5, random_state=0)
km = KernelKMeans(n_clusters=5, max_iter=100, random_state=0, verbose=1)
print km.fit_predict(X)[:10]
print km.predict(X[:10])
@yassersouri

This comment has been minimized.

Copy link

@yassersouri yassersouri commented Jul 4, 2014

Hi. Great code, did you try sending a pull request to scikit-learn?

@mblondel

This comment has been minimized.

Copy link
Owner Author

@mblondel mblondel commented Jul 15, 2014

No, I'm too lazy to write documentation and examples :b Feel free to reuse this gist and send us a pull-request!

@ghost

This comment has been minimized.

Copy link

@ghost ghost commented Nov 27, 2014

Any advice on how to change the kernel to our own formula? Wanting a triangular sine kernel.

@amueller

This comment has been minimized.

Copy link

@amueller amueller commented Aug 28, 2015

How did this compare to spectral clustering in practice? Do you think it is worth adding to scikit-learn?

@mblondel

This comment has been minimized.

Copy link
Owner Author

@mblondel mblondel commented Oct 9, 2015

@amueller Sorry for late reply (why no notifcations in gists?!). I haven't compared it to other algorithms but I am open to inclusion in scikit-learn if someone wants to work on it.

@vishalbelsare

This comment has been minimized.

Copy link

@vishalbelsare vishalbelsare commented Oct 18, 2015

@mblondel , how related (or not) is this to Graclus? [ http://www.cs.utexas.edu/users/dml/Software/graclus.html ] Being able to work with an out-of-core kernel matrix would be a fantastic addition to scikit-learn.

@lyndaKhiali

This comment has been minimized.

Copy link

@lyndaKhiali lyndaKhiali commented Aug 4, 2016

Bonjour, est-il possible que votre algorithme reçois comme paramètre une matrice de distance au lieu d'une matrice(N_samples,N_Features).
Si oui, quel paramètre permet de spécifier le type de la matrice d'entré. merci

@ghost

This comment has been minimized.

Copy link

@ghost ghost commented Nov 9, 2016

it gives me the same result of scikit kmeans. How is it possible? I have set kernel='rbf'

@jdbsilva

This comment has been minimized.

Copy link

@jdbsilva jdbsilva commented Oct 27, 2018

This is great! Many thanks

@merveydn

This comment has been minimized.

Copy link

@merveydn merveydn commented Jun 6, 2019

What other kernels can we use? Could you give some info on how to use different kernels?

@ajilling

This comment has been minimized.

Copy link

@ajilling ajilling commented Jun 19, 2019

Is there a way to make this not fail when an empty cluster is found?

@ravi2k1

This comment has been minimized.

Copy link

@ravi2k1 ravi2k1 commented Nov 15, 2019

How can I use kmeans++ using the same code?

@lonevetad

This comment has been minimized.

Copy link

@lonevetad lonevetad commented Jan 30, 2020

Important not for everyone:
If You are looking for "gaussian kernel" just pass 'rbf' as "metric".
Sometimes I forget it.

@arnab-007

This comment has been minimized.

Copy link

@arnab-007 arnab-007 commented Aug 27, 2020

Any advice on how to extend this to multi-kernel k-means?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.