Created
January 23, 2018 03:20
-
-
Save jtrive84/cfc85bef9c051e373ac3c4a5f3ff5d95 to your computer and use it in GitHub Desktop.
Implementation of k-means clustering algorithm in Python.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import scipy
import itertools
from scipy.spatial import distance
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Widen pandas console output so wide frames print without truncation.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 500)

# Readable numpy printing: fixed precision, no scientific notation.
np.set_printoptions(
    edgeitems=3,
    linewidth=500,
    suppress=True,
    nanstr='NaN',
    infstr='Inf',
    precision=5
    )
def kmeans(data, k=3, max_cycles=25):
    """
    Cluster the rows of `data` into `k` groups via Lloyd's algorithm.

    STEP I  => Assignment step: assign each observation to the cluster
               whose mean has the least Euclidean distance — intuitively
               the "nearest" centroid.
    STEP II => Update step: recompute each centroid as the mean of its
               currently assigned observations.

    Parameters
    ----------
    data : numpy.ndarray or pandas.DataFrame
        Observations to cluster, one per row.
    k : int
        Number of clusters to form.
    max_cycles : int
        Upper bound on assignment/update iterations.

    Returns
    -------
    dict
        Keys: 'k', 'cycles', 'all_means', 'all_assign', 'all_dists',
        'final_means', 'final_assign', 'final_dists', 'total_cost'.
    """
    if isinstance(data, pd.DataFrame):
        data = data.values
    # Work in float so centroid means are never truncated to integers
    # when integer-valued data is supplied.
    data = np.asarray(data, dtype=float)

    # Prepend an index column so rows can be traced back to observations.
    _tmpid = np.atleast_2d(np.arange(data.shape[0]))
    X = np.concatenate([_tmpid.T, data], axis=1)

    # Conventional initialization: select k distinct observations as the
    # starting means. NOTE: the original sampled from arange(0, n-1),
    # which could never pick the last row; sample the full range instead.
    init_indx = np.random.choice(X.shape[0], k, replace=False)
    init_means = X[init_indx][:, 1:]
    centroids = init_means

    all_means, all_assign, all_dists = [], [], []
    # Indicator and distance matrices: one row per observation, one
    # column per cluster; a single nonzero entry per row marks membership.
    init_clust_assign = np.zeros(shape=(X.shape[0], k))
    init_clust_dists = np.zeros_like(init_clust_assign)
    all_means.append(init_means)
    all_assign.append(init_clust_assign)
    all_dists.append(init_clust_dists)

    for c in itertools.count(start=1):
        clust_assign = np.zeros(shape=(X.shape[0], k))
        clust_dists = np.zeros_like(clust_assign)

        for i in range(X.shape[0]):  # Step I (Assignment Step)
            iterobs = X[i, 1:]
            iterdists = [distance.euclidean(j, iterobs) for j in centroids]
            min_indx = int(np.argmin(iterdists))
            clust_assign[i, min_indx] = 1
            clust_dists[i, min_indx] = iterdists[min_indx]

        # Test for convergence: distances unchanged since the last cycle.
        converged = np.allclose(clust_dists, all_dists[-1], atol=1e-09)
        # Record this cycle's state *before* testing for termination so
        # the returned final_* entries reflect the last assignment
        # actually computed (the original discarded the final cycle).
        all_assign.append(clust_assign)
        all_dists.append(clust_dists)
        if converged or c >= max_cycles:
            break

        new_centroids = np.zeros_like(centroids)
        for iter_k in range(k):  # Step II (Update Step)
            # Use indices from clust_assign to retrieve members from X
            # in order to calculate centroid updates.
            members = np.nonzero(clust_assign[:, iter_k])[0]
            if members.size:
                new_centroids[iter_k] = X[members][:, 1:].mean(axis=0)
            else:
                # Empty cluster: keep the previous centroid rather than
                # taking the mean of zero rows, which yields NaN and
                # poisons every subsequent distance computation.
                new_centroids[iter_k] = centroids[iter_k]
        centroids = new_centroids
        all_means.append(centroids)

    # Cost function: total within-cluster sum of squared distances.
    total_cost = (all_dists[-1]**2).sum()

    result = {
        'k'           : k,
        'cycles'      : c,
        'all_means'   : all_means,
        'all_assign'  : all_assign,
        'all_dists'   : all_dists,
        'final_means' : all_means[-1],
        'final_assign': all_assign[-1],
        'final_dists' : all_dists[-1],
        'total_cost'  : total_cost
        }
    return result
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment