Skip to content

Instantly share code, notes, and snippets.

@mmmayo13
Created June 9, 2020 19:09
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mmmayo13/3d5c2b12218dfd79acc27c64b3b7dd86 to your computer and use it in GitHub Desktop.
Save mmmayo13/3d5c2b12218dfd79acc27c64b3b7dd86 to your computer and use it in GitHub Desktop.
from math import sqrt, floor
import numpy as np
def random(ds, k, random_state=42):
"""
Create random cluster centroids.
Parameters
----------
ds : numpy array
The dataset to be used for centroid initialization.
k : int
The desired number of clusters for which centroids are required.
Returns
-------
centroids : numpy array
Collection of k centroids as a numpy array.
"""
np.random.seed(random_state)
centroids = []
m = np.shape(ds)[0]
for _ in range(k):
r = np.random.randint(0, m-1)
centroids.append(ds[r])
return np.array(centroids)
def plus_plus(ds, k, random_state=42):
"""
Create cluster centroids using the k-means++ algorithm.
Parameters
----------
ds : numpy array
The dataset to be used for centroid initialization.
k : int
The desired number of clusters for which centroids are required.
Returns
-------
centroids : numpy array
Collection of k centroids as a numpy array.
Inspiration from here: https://stackoverflow.com/questions/5466323/how-could-one-implement-the-k-means-algorithm
"""
np.random.seed(random_state)
centroids = [ds[0]]
for _ in range(1, k):
dist_sq = np.array([min([np.inner(c-x,c-x) for c in centroids]) for x in ds])
probs = dist_sq/dist_sq.sum()
cumulative_probs = probs.cumsum()
r = np.random.rand()
for j, p in enumerate(cumulative_probs):
if r < p:
i = j
break
centroids.append(ds[i])
return np.array(centroids)
def naive_sharding(ds, k):
"""
Create cluster centroids using deterministic naive sharding algorithm.
Parameters
----------
ds : numpy array
The dataset to be used for centroid initialization.
k : int
The desired number of clusters for which centroids are required.
Returns
-------
centroids : numpy array
Collection of k centroids as a numpy array.
"""
def _get_mean(sums, step):
"""Vectorizable ufunc for getting means of summed shard columns."""
return sums/step
n = np.shape(ds)[1]
m = np.shape(ds)[0]
centroids = np.zeros((k, n))
composite = np.mat(np.sum(ds, axis=1))
ds = np.append(composite.T, ds, axis=1)
ds.sort(axis=0)
step = floor(m/k)
vfunc = np.vectorize(_get_mean)
for j in range(k):
if j == k-1:
centroids[j:] = vfunc(np.sum(ds[j*step:,1:], axis=0), step)
else:
centroids[j:] = vfunc(np.sum(ds[j*step:(j+1)*step,1:], axis=0), step)
return centroids
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment