Skip to content

Instantly share code, notes, and snippets.

@jtrive84
Created January 23, 2018 03:20
Show Gist options
  • Save jtrive84/cfc85bef9c051e373ac3c4a5f3ff5d95 to your computer and use it in GitHub Desktop.
Save jtrive84/cfc85bef9c051e373ac3c4a5f3ff5d95 to your computer and use it in GitHub Desktop.
Implementation of the k-means clustering algorithm in Python.
# Third-party imports: numerical arrays, pairwise distances, tabular data,
# and plotting (matplotlib/seaborn are imported for downstream use).
import numpy as np
import scipy
import itertools
from scipy.spatial import distance
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Widen the pandas console display so wide frames print without truncation.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 500)
# Configure numpy array printing: long lines, fixed 5-digit precision,
# plain (non-scientific) notation, and readable NaN/Inf markers.
np.set_printoptions(
edgeitems=3,
linewidth=500,
suppress=True,
nanstr='NaN',
infstr='Inf',
precision=5
)
def kmeans(data, k=3, max_cycles=25):
    """
    Run Lloyd's k-means algorithm on `data`.

    Alternates two steps until the per-observation distances stop changing
    (within atol=1e-09) or `max_cycles` is reached:

      Step I  (assignment): assign each observation to the centroid with the
              least Euclidean distance.
      Step II (update): recompute each centroid as the mean of its members.

    Parameters
    ----------
    data : array-like or pandas.DataFrame, shape (n_obs, n_features)
        Observations to cluster. Coerced to a float ndarray.
    k : int, default 3
        Number of clusters. Initial centroids are k distinct observations
        chosen uniformly at random (seed np.random for reproducibility).
    max_cycles : int, default 25
        Upper bound on assignment/update cycles.

    Returns
    -------
    dict with keys:
        'k', 'cycles', 'all_means', 'all_assign', 'all_dists',
        'final_means'  : (k, n_features) centroid array,
        'final_assign' : (n_obs, k) one-hot membership indicator,
        'final_dists'  : (n_obs, k) distance to the assigned centroid,
        'total_cost'   : sum of squared assigned distances (within-cluster
                         variance objective).
    """
    if isinstance(data, pd.DataFrame):
        data = data.values
    # Work in float so centroid means are never truncated (the original
    # zeros_like(...) inherited an integer dtype for integer input).
    X = np.asarray(data, dtype=float)
    n_obs = X.shape[0]

    # Conventional initialization: k distinct observations as starting
    # centroids. NOTE: arange(n_obs), not arange(0, n_obs - 1) -- the
    # original off-by-one could never select the last observation.
    init_indx = np.random.choice(np.arange(n_obs), k, replace=False)
    centroids = X[init_indx].copy()

    # History of every cycle; seeded with zero matrices so the first
    # convergence comparison has a baseline.
    all_means = [centroids]
    all_assign = [np.zeros((n_obs, k))]
    all_dists = [np.zeros((n_obs, k))]
    prev_dists = all_dists[0]

    for c in itertools.count(start=1):
        clust_assign = np.zeros((n_obs, k))
        clust_dists = np.zeros_like(clust_assign)
        # Step I (assignment): nearest centroid per observation.
        for i in range(n_obs):
            dists = np.linalg.norm(centroids - X[i], axis=1)
            j = int(np.argmin(dists))
            clust_assign[i, j] = 1
            clust_dists[i, j] = dists[j]
        # Record this cycle *before* testing convergence so the final_*
        # entries always reflect the last assignment pass. (The original
        # broke first; converging on cycle 1 returned all-zero output.)
        all_assign.append(clust_assign)
        all_dists.append(clust_dists)
        if np.allclose(clust_dists, prev_dists, atol=1e-09) or c >= max_cycles:
            break
        prev_dists = clust_dists
        # Step II (update): move each centroid to the mean of its members.
        new_centroids = np.array(centroids, dtype=float, copy=True)
        for j in range(k):
            members = np.nonzero(clust_assign[:, j])[0]
            # An empty cluster keeps its previous centroid instead of
            # producing a NaN mean over an empty slice.
            if members.size:
                new_centroids[j] = X[members].mean(axis=0)
        centroids = new_centroids
        all_means.append(centroids)

    # Cost function: total within-cluster variance over all centroids.
    total_cost = (all_dists[-1] ** 2).sum()
    return {
        'k': k,
        'cycles': c,
        'all_means': all_means,
        'all_assign': all_assign,
        'all_dists': all_dists,
        'final_means': all_means[-1],
        'final_assign': all_assign[-1],
        'final_dists': all_dists[-1],
        'total_cost': total_cost,
    }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment