Skip to content

Instantly share code, notes, and snippets.

@giuseppebonaccorso
Created August 3, 2017 13:47
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save giuseppebonaccorso/25b82ab1f7d1e9971f59ddf7fefc1455 to your computer and use it in GitHub Desktop.
Save giuseppebonaccorso/25b82ab1f7d1e9971f59ddf7fefc1455 to your computer and use it in GitHub Desktop.
Assessing clustering optimality with instability index
import multiprocessing

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import pairwise_distances
# Set random seed for reproducibility (the noise below is drawn from NumPy's
# global random state)
np.random.seed(1000)

# Generate a dummy dataset: 8 blobs of 2-dimensional points
nb_samples = 1500
nb_features = 2

X, Y = make_blobs(
    n_samples=nb_samples,
    n_features=nb_features,
    centers=8,
    cluster_std=2.0,
    random_state=1000,
)

# Create noisy versions
#
# Xp[i] is a copy of X in which every sample is, independently and with
# probability 0.5, perturbed by zero-mean Gaussian noise (std 0.5 on each
# feature); the remaining samples are copied unchanged.
nb_noisy_versions = 20

# np.empty is the documented way to allocate an uninitialised array; every
# entry is overwritten by the loop below.
Xp = np.empty(shape=(nb_noisy_versions, nb_samples, nb_features))

# The explicit per-sample loop keeps the sequence of draws from the global
# random state fixed (one uniform per sample, then one normal vector only for
# the perturbed samples), so a given seed always yields the same data.
for i in range(nb_noisy_versions):
    for j in range(nb_samples):
        if np.random.uniform(0, 1) < 0.5:
            Xp[i, j, :] = X[j, :] + np.random.normal(scale=0.5, size=nb_features)
        else:
            Xp[i, j, :] = X[j, :]
# Compute the instabilities
#
# For each candidate number of clusters n, K-Means is fitted on every noisy
# dataset in Xp. The instability for n is the sum of the normalised Hamming
# distances between all pairs of resulting label vectors, multiplied by 2 and
# divided by nb_noisy_versions ** 2. instabilities[0] corresponds to n = 2.
#
# NOTE(review): label vectors are compared as-is. K-Means cluster indices are
# arbitrary, so two identical partitions with permuted labels count as
# different here -- consider aligning labels before comparing; confirm the
# intended definition.
max_nb_clusters = 15
instabilities = []

for n in range(2, max_nb_clusters + 1):
    # One label vector per noisy dataset
    Yp = []

    for k in range(nb_noisy_versions):
        # n_jobs was removed from KMeans in scikit-learn 1.0, so it is not
        # passed. n_init is pinned to 10 because the library default changed
        # in later releases.
        km = KMeans(n_clusters=n, n_init=10)
        Yp.append(km.fit_predict(Xp[k, :, :]))

    # Normalised Hamming distance (fraction of samples whose labels differ)
    # for every unordered pair of distinct runs. Self-pairs are skipped: their
    # distance is always 0 and does not affect the sum.
    ds = []

    for i in range(len(Yp) - 1):
        for j in range(i + 1, len(Yp)):
            ds.append(np.mean(Yp[i] != Yp[j]))

    instabilities.append((2.0 * np.sum(ds)) / float(nb_noisy_versions ** 2))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment