Skip to content

Instantly share code, notes, and snippets.

@alexeygrigorev
Last active November 16, 2017 20:24
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alexeygrigorev/bda0b18867783abfb8e45e9a6a66ae2b to your computer and use it in GitHub Desktop.
Save alexeygrigorev/bda0b18867783abfb8e45e9a6a66ae2b to your computer and use it in GitHub Desktop.
Bisecting K-Means
import heapq
import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans
def sklearn_bisecting_kmeans_lineage(X, k, verbose=0):
N, _ = X.shape
labels = np.zeros(N, dtype=np.int)
lineage = np.zeros((k, N), dtype=np.int)
sizes_heap = []
heapq.heappush(sizes_heap, (-N, 0))
k_max = 1
while k_max < k:
if verbose and k_max % 50 == 0:
print 'iteration %d...' % k_max
size, idx = heapq.heappop(sizes_heap)
size = -size
trials = 3
while trials > 0:
if size < 100:
model = KMeans(n_clusters=2)
else:
model = MiniBatchKMeans(n_clusters=2, init='random')
km = model.fit(X[labels == idx])
it_labels = km.labels_
if (it_labels == 1).sum() > 1 and (it_labels == 0).sum() > 1:
break
trials = trials - 1
ones_size = (it_labels == 1).sum()
heapq.heappush(sizes_heap, (-ones_size, k_max))
heapq.heappush(sizes_heap, (-(size - ones_size), idx))
it_labels[it_labels == 1] = k_max
it_labels[it_labels == 0] = idx
labels[labels == idx] = it_labels
lineage[k_max - 1] = labels
k_max = k_max + 1
return labels, lineage
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment