

# mblondel/kernel_kmeans.py

Last active Aug 24, 2022
Kernel K-means.
 """Kernel K-means""" # Author: Mathieu Blondel # License: BSD 3 clause import numpy as np from sklearn.base import BaseEstimator, ClusterMixin from sklearn.metrics.pairwise import pairwise_kernels from sklearn.utils import check_random_state class KernelKMeans(BaseEstimator, ClusterMixin): """ Kernel K-means Reference --------- Kernel k-means, Spectral Clustering and Normalized Cuts. Inderjit S. Dhillon, Yuqiang Guan, Brian Kulis. KDD 2004. """ def __init__(self, n_clusters=3, max_iter=50, tol=1e-3, random_state=None, kernel="linear", gamma=None, degree=3, coef0=1, kernel_params=None, verbose=0): self.n_clusters = n_clusters self.max_iter = max_iter self.tol = tol self.random_state = random_state self.kernel = kernel self.gamma = gamma self.degree = degree self.coef0 = coef0 self.kernel_params = kernel_params self.verbose = verbose @property def _pairwise(self): return self.kernel == "precomputed" def _get_kernel(self, X, Y=None): if callable(self.kernel): params = self.kernel_params or {} else: params = {"gamma": self.gamma, "degree": self.degree, "coef0": self.coef0} return pairwise_kernels(X, Y, metric=self.kernel, filter_params=True, **params) def fit(self, X, y=None, sample_weight=None): n_samples = X.shape K = self._get_kernel(X) sw = sample_weight if sample_weight else np.ones(n_samples) self.sample_weight_ = sw rs = check_random_state(self.random_state) self.labels_ = rs.randint(self.n_clusters, size=n_samples) dist = np.zeros((n_samples, self.n_clusters)) self.within_distances_ = np.zeros(self.n_clusters) for it in xrange(self.max_iter): dist.fill(0) self._compute_dist(K, dist, self.within_distances_, update_within=True) labels_old = self.labels_ self.labels_ = dist.argmin(axis=1) # Compute the number of samples whose cluster did not change # since last iteration. n_same = np.sum((self.labels_ - labels_old) == 0) if 1 - float(n_same) / n_samples < self.tol: if self.verbose: print "Converged at iteration", it + 1 break self.X_fit_ = X return self def _compute_dist(self, K, dist, within_distances, update_within): """Compute a n_samples x n_clusters distance matrix using the kernel trick.""" sw = self.sample_weight_ for j in xrange(self.n_clusters): mask = self.labels_ == j if np.sum(mask) == 0: raise ValueError("Empty cluster found, try smaller n_cluster.") denom = sw[mask].sum() denomsq = denom * denom if update_within: KK = K[mask][:, mask] # K[mask, mask] does not work. dist_j = np.sum(np.outer(sw[mask], sw[mask]) * KK / denomsq) within_distances[j] = dist_j dist[:, j] += dist_j else: dist[:, j] += within_distances[j] dist[:, j] -= 2 * np.sum(sw[mask] * K[:, mask], axis=1) / denom def predict(self, X): K = self._get_kernel(X, self.X_fit_) n_samples = X.shape dist = np.zeros((n_samples, self.n_clusters)) self._compute_dist(K, dist, self.within_distances_, update_within=False) return dist.argmin(axis=1) if __name__ == '__main__': from sklearn.datasets import make_blobs X, y = make_blobs(n_samples=1000, centers=5, random_state=0) km = KernelKMeans(n_clusters=5, max_iter=100, random_state=0, verbose=1) print km.fit_predict(X)[:10] print km.predict(X[:10])

### yassersouri commented Jul 4, 2014

Hi. Great code, did you try sending a pull request to scikit-learn?

### mblondel commented Jul 15, 2014

No, I'm too lazy to write documentation and examples :b Feel free to reuse this gist and send us a pull-request!

### ghost commented Nov 27, 2014

Any advice on how to change the kernel to our own formula? Wanting a triangular sine kernel.
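For anyone else looking for this: the class already accepts a Python callable as `kernel`, which `pairwise_kernels` evaluates on pairs of individual samples. A minimal sketch, assuming the `KernelKMeans` class above is in scope; the "triangular sine" formula below is only a placeholder, not a vetted kernel:

```python
import numpy as np

# Placeholder custom kernel for illustration only -- substitute your own
# formula. pairwise_kernels calls it on pairs of 1-D sample vectors and
# expects a scalar back.
def my_kernel(x, y):
    return float(np.prod(np.sinc(x - y)))

km = KernelKMeans(n_clusters=3, kernel=my_kernel)
# labels = km.fit_predict(X)
```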

### amueller commented Aug 28, 2015

How did this compare to spectral clustering in practice? Do you think it is worth adding to scikit-learn?

### mblondel commented Oct 9, 2015

@amueller Sorry for late reply (why no notifications in gists?!). I haven't compared it to other algorithms but I am open to inclusion in scikit-learn if someone wants to work on it.

### vishalbelsare commented Oct 18, 2015

@mblondel , how related (or not) is this to Graclus? [ http://www.cs.utexas.edu/users/dml/Software/graclus.html ] Being able to work with an out-of-core kernel matrix would be a fantastic addition to scikit-learn.

### lyndaKhiali commented Aug 4, 2016

Hello, is it possible for your algorithm to receive a distance matrix as a parameter instead of an (n_samples, n_features) matrix?
If so, which parameter specifies the type of the input matrix? Thanks.
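For reference, the class does support this via `kernel="precomputed"` (see its `_pairwise` property), but `fit` then expects an n_samples x n_samples *similarity* (kernel) matrix, not a distance matrix. A minimal sketch, assuming the `KernelKMeans` class above; the `gamma` values are illustrative:

```python
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import pairwise_kernels

X, _ = make_blobs(n_samples=200, centers=3, random_state=0)

# Precompute an n_samples x n_samples kernel (similarity) matrix.
K = pairwise_kernels(X, metric="rbf", gamma=0.1)

km = KernelKMeans(n_clusters=3, kernel="precomputed", random_state=0)
labels = km.fit(K).labels_

# A *distance* matrix D would first have to be converted to a similarity,
# e.g. K = np.exp(-gamma * D ** 2) for a Gaussian-style conversion.
```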

### ghost commented Nov 9, 2016

It gives me the same result as scikit-learn's KMeans. How is that possible? I have set `kernel='rbf'`.
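A likely explanation: on data that plain k-means already separates well (e.g. Gaussian blobs), RBF kernel k-means tends to find the same partition; the difference shows on clusters that are not linearly separable, and it depends on `gamma`. A sketch of a dataset where the two methods should disagree; `gamma=10` is an illustrative guess for this data scale:

```python
from sklearn.cluster import KMeans
from sklearn.datasets import make_circles

# Two concentric rings: Euclidean k-means cuts straight through both,
# while kernel k-means with a suitably sharp RBF kernel can separate them.
X, _ = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0)

labels_lin = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)
labels_rbf = KernelKMeans(n_clusters=2, kernel="rbf", gamma=10,
                          random_state=0).fit_predict(X)
```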

### jdbsilva commented Oct 27, 2018

This is great! Many thanks

### merveydn commented Jun 6, 2019

What other kernels can we use? Could you give some info on how to use different kernels?
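The `kernel` string is handed straight to `sklearn.metrics.pairwise.pairwise_kernels`, so any metric that function accepts works here (plus `"precomputed"` and any callable). A quick way to list the built-in names, with two illustrative configurations:

```python
from sklearn.metrics.pairwise import PAIRWISE_KERNEL_FUNCTIONS

# Built-in kernel names, e.g. 'linear', 'poly'/'polynomial', 'rbf',
# 'laplacian', 'sigmoid', 'cosine', 'chi2', 'additive_chi2'.
print(sorted(PAIRWISE_KERNEL_FUNCTIONS))

km_poly = KernelKMeans(n_clusters=3, kernel="polynomial", degree=3, coef0=1)
km_rbf = KernelKMeans(n_clusters=3, kernel="rbf", gamma=0.5)
```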

### ajilling commented Jun 19, 2019

Is there a way to make this not fail when an empty cluster is found?
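The `ValueError` is raised deliberately in `_compute_dist`. One common workaround (not part of the gist) is to re-seed any empty cluster with a random sample before computing distances, e.g. by calling a helper like this at the top of `_compute_dist`:

```python
import numpy as np

def reseed_empty_clusters(labels, n_clusters, rng):
    """Move one random sample into each empty cluster.

    A sketch of a common fix, not part of the original gist; `rng` is a
    numpy RandomState such as the one fit() already creates.
    """
    for j in range(n_clusters):
        if not np.any(labels == j):
            labels[rng.randint(len(labels))] = j
    return labels
```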

### ravi2k1 commented Nov 15, 2019

How can I use kmeans++ using the same code?
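The gist initializes `labels_` uniformly at random, but a k-means++-style seeding can be done directly in kernel space, because squared feature-space distances are available from the Gram matrix. A sketch (not part of the gist) that could replace the `rs.randint(...)` line in `fit`:

```python
import numpy as np

def kernel_kmeanspp_labels(K, n_clusters, rng):
    """k-means++-style seeding in kernel space (a sketch, not in the gist).

    Seeds are restricted to data points; the squared feature-space
    distance between samples i and c is K[i, i] - 2 K[i, c] + K[c, c].
    """
    diag = np.diag(K)
    centers = [rng.randint(K.shape[0])]
    for _ in range(n_clusters - 1):
        # Distance of every sample to its closest seed so far.
        d2 = diag[:, None] - 2 * K[:, centers] + diag[centers][None, :]
        d2 = np.clip(d2.min(axis=1), 0, None)
        # Draw the next seed with probability proportional to d^2.
        centers.append(rng.choice(K.shape[0], p=d2 / d2.sum()))
    # Initial labels: nearest seed in feature space.
    d2 = diag[:, None] - 2 * K[:, centers] + diag[centers][None, :]
    return d2.argmin(axis=1)
```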

### lonevetad commented Jan 30, 2020

Important note for everyone:
If you are looking for a "Gaussian kernel", just pass 'rbf' as the `kernel` parameter (it is forwarded to `pairwise_kernels` as the metric).
Sometimes I forget it.
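Concretely (the `gamma` value is illustrative):

```python
# The "Gaussian kernel" is sklearn's 'rbf': k(x, y) = exp(-gamma * ||x - y||^2)
km = KernelKMeans(n_clusters=3, kernel="rbf", gamma=0.5)
```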

### arnab-007 commented Aug 27, 2020

Any advice on how to extend this to multi-kernel k-means?
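One simple way to experiment (not true multi-kernel k-means, which also *learns* the kernel weights) is to combine base kernels with fixed weights and pass the sum in as precomputed. The `weights` and `gammas` below are illustrative assumptions:

```python
import numpy as np
from sklearn.metrics.pairwise import pairwise_kernels

def combined_kernel(X, weights=(0.5, 0.5), gammas=(0.1, 1.0)):
    """Fixed-weight combination of RBF kernels (a sketch).

    A non-negative weighted sum of valid kernels is itself a valid
    kernel, so the result can be used with kernel="precomputed".
    """
    Ks = [pairwise_kernels(X, metric="rbf", gamma=g) for g in gammas]
    return sum(w * K for w, K in zip(weights, Ks))

km = KernelKMeans(n_clusters=5, kernel="precomputed")
# labels = km.fit(combined_kernel(X)).labels_
```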

### mblondel commented Nov 16, 2020

The code was written for Python 2 and you're using Python 3. Replace `xrange` by `range` and `print ...` by `print(...)`.

### nancychenxizhong commented Aug 24, 2022

> @amueller Sorry for late reply (why no notifications in gists?!). I haven't compared it to other algorithms but I am open to inclusion in scikit-learn if someone wants to work on it.

@mblondel I am happy to work on it. Are you still open to inclusion of this gist to scikit-learn?