Last active
February 11, 2018 18:12
-
-
Save jaganadhg/ddbf0956a7921b83ceef90b8a81dfaee to your computer and use it in GitHub Desktop.
Implementation of Gap Statistic from Tibshirani, Walther, Hastie to determine the inherent number of clusters in a dataset with k-means clustering.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Author : Jaganadh Gopinadhan | |
Licence : Apahce 2 | |
e-mail jaganadhg at gmail dot com | |
""" | |
import scipy | |
from sklearn.cluster import KMeans | |
from sklearn.datasets import load_iris | |
import pandas as pd | |
class TWHGapStat(object): | |
""" | |
Implementation of Gap Statistic from Tibshirani, Walther, Hastie to determine the | |
inherent number of clusters in a dataset with k-means clustering. | |
Ref Paper : https://web.stanford.edu/~hastie/Papers/gap.pdf | |
""" | |
def generate_random_data(self, X): | |
""" | |
Populate reference data. | |
Parameters | |
---------- | |
X : Numpy Array | |
The base data from which random sample has to be generated | |
Returns | |
------- | |
reference : Numpy Array | |
Reference data generated using the Numpy/Scipy random utiity . | |
NUmber of diamensions in the data will be as same as the base | |
dataset. | |
""" | |
reference = scipy.random.random_sample(size=(X.shape[0], X.shape[1])) | |
return reference | |
def _fit_cluster(self,X, n_cluster, n_iter=5): | |
""" | |
Fit cluster on reference data and return inertia mean. | |
Parameters | |
---------- | |
X : numpy array | |
The base data | |
n_cluster : int | |
The number of clusters to form | |
n_iter : int, default = 5 | |
number iterative lustering experiments has to be perfromed in the data. | |
If the data is large keep it less than 5, so that the run time will be less. | |
Returns | |
------- | |
mean_nertia : float | |
Returns the mean intertia value. | |
""" | |
iterations = range(1, n_iter + 1) | |
ref_inertias = pd.Series(index=iterations) | |
for iteration in iterations: | |
clusterer = KMeans(n_clusters=n_cluster, n_init=3, n_jobs=-1) | |
# If you are using Windows server n_jobs = -1 will be dangerous. So the | |
# value should be set to max cores - 3 . If we use all the cores available | |
# in Windows server sklearn tends to throw memory error | |
clusterer.fit(X) | |
ref_inertias[iteration] = clusterer.inertia_ | |
mean_nertia = ref_inertias.mean() | |
return mean_nertia | |
def fit(self,X,max_k): | |
""" | |
Compute Gap Statistics | |
Parameters | |
---------- | |
X : numpy array | |
The base data | |
max_k :int | |
Maximum value to which we are going to test the 'k' in k-means algorithmn | |
Returns | |
------- | |
gap_stat : Pandas Series | |
For eack k in max_k range gap stat value is returned as a Pandas Sereies. | |
Index is K and valuess correspondes to gap stattistics for each K | |
""" | |
k_range = range(1,max_k + 1) | |
gap_stat = pd.Series(index=k_range) | |
ref_data = self.generate_random_data(X) | |
for k in k_range: | |
base_clusterer = KMeans(n_clusters=k,n_init = 3, n_jobs = -1) | |
base_clusterer.fit(X) | |
ref_intertia = self._fit_cluster(ref_data,k) | |
cur_gap = scipy.log(ref_intertia - base_clusterer.inertia_) | |
gap_stat[k] = cur_gap | |
return gap_stat | |
if __name__ == "__main__": | |
iris = load_iris() | |
X = iris.data | |
gap_stat = TWHGapStat() | |
gs = gap_stat.fit(X,5) | |
print gs |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment