harinathselvaraj/kmeans-clustering.ipynb

## kmeans-clustering.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              kmeans-clustering.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## kmeansplots.py
import numpy as np
import matplotlib.cm as cm
import matplotlib.pyplot as plt

from matplotlib.colors import colorConverter

__license__ = 'MIT'
__author__ = 'clintval'

def darken_rgb(rgb, p):
    """
    Scale every channel of ``rgb`` towards zero by the proportion ``p``.

    Each channel ``x`` becomes ``int(x * (1 - p))``: ``p = 0`` keeps the
    (integer-truncated) channel values and ``p = 1`` yields all zeros.
    ``p`` is a proportion in [0, 1], not a percentage; a value outside that
    range trips the assertion.  A new list of ints is returned.
    """
    assert 0 <= p <= 1, "Proportion must be [0, 1]"
    remaining = 1 - p
    darkened = []
    for channel in rgb:
        # int() truncates towards zero instead of rounding.
        darkened.append(int(channel * remaining))
    return darkened


def lighten_rgb(rgb, p):
    """
    Move every channel of ``rgb`` towards 255 by the proportion ``p``.

    Each channel ``x`` becomes ``int((255 - x) * p + x)``: ``p = 0`` keeps
    the (integer-truncated) channel values and ``p = 1`` yields 255 for every
    channel.  ``p`` is a proportion in [0, 1], not a percentage; a value
    outside that range trips the assertion.  A new list of ints is returned.
    """
    assert 0 <= p <= 1, "Proportion must be [0, 1]"

    def _towards_white(channel):
        # Close the fraction ``p`` of the gap between the channel and 255.
        return int((255 - channel) * p + channel)

    return list(map(_towards_white, rgb))


def is_luminous(rgb):
    """
    Report whether the weighted luminance of ``rgb`` is below 0.179.

    Every component ``c`` is first linearised (``c / 12.92`` when
    ``c <= 0.03928``, otherwise ``((c + 0.055) / 1.055) ** 2.4``); the first
    three linearised components are then combined with the weights
    0.2126, 0.7152 and 0.0722.

    NOTE: despite the name, ``True`` is returned when the luminance is
    *below* the threshold.  The result is always a plain ``bool``.
    """
    def _linearise(component):
        if component <= 0.03928:
            return component / 12.92
        return ((component + 0.055) / 1.055) ** 2.4

    # Linearise everything up front; zip() below only consumes as many
    # values as there are weights.
    linear = [_linearise(component) for component in rgb]
    weights = (0.2126, 0.7152, 0.0722)
    luminance = sum(weight * value for weight, value in zip(weights, linear))

    return bool(luminance < 0.179)


def kmeans_plot(X, y, cluster_centers, ax=None):
    """
    Scatter the samples coloured by cluster and number each cluster centre.

    Samples are coloured with the ``nipy_spectral`` colormap evaluated at
    ``label / len(cluster_centers)``.  Every centre gets a 1-based numeric
    label, shifted right by 20% of the largest centre x-coordinate and drawn
    with a white outline in the colour of the sample closest to that centre
    (darkened by 35% unless ``is_luminous`` returns ``True`` for it).  Both
    axes are finally set to ``(0, limit)``, where ``limit`` is the largest
    current x or y axis bound.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Sample coordinates; the first two columns are plotted.
    y : array-like, shape (n_samples,)
        Cluster label of every sample.
    cluster_centers : array-like, shape (n_clusters, n_features)
        Coordinates of the cluster centres.
    ax : matplotlib axes, optional
        Axes to draw on; defaults to ``plt.gca()``.

    Returns
    -------
    The axes that were drawn on.
    """
    import matplotlib.patheffects as path_effects
    from sklearn.metrics.pairwise import pairwise_distances_argmin_min

    if ax is None:
        ax = plt.gca()

    X = np.asarray(X)
    cluster_centers = np.asarray(cluster_centers)

    # ``cm.spectral`` no longer exists in current matplotlib releases;
    # ``nipy_spectral`` is the name the same colormap is available under.
    labels = np.asarray(y, dtype=float)
    colors = cm.nipy_spectral(labels / len(cluster_centers))

    # Index the two plotted columns explicitly so inputs with more than two
    # features do not spill extra positional arguments into scatter().
    ax.scatter(X[:, 0], X[:, 1], lw=0, c=colors, s=30)

    offset = cluster_centers[:, 0].max() * 0.2

    for i, cluster in enumerate(cluster_centers):
        # Index of the sample nearest to this centre; its colour is reused
        # for the label.
        index, _ = pairwise_distances_argmin_min(cluster.reshape(1, -1), Y=X)
        cluster_color = colorConverter.to_rgb(colors[index[0]])

        # is_luminous() returns True for colours whose luminance is *below*
        # its threshold, so only the remaining (lighter) colours are darkened.
        if not is_luminous(cluster_color):
            # to_rgb() yields floats in [0, 1].  darken_rgb() truncates each
            # channel with int(), which would turn all of them into 0, so
            # the 35% darkening is applied directly on the float scale.
            cluster_color = tuple(c * (1 - 0.35) for c in cluster_color)

        label = ax.text(x=cluster[0] + offset,
                        y=cluster[1],
                        s='{:d}'.format(i + 1),
                        color=cluster_color)
        # White outline keeps the number legible on top of the scatter.
        label.set_path_effects([
            path_effects.Stroke(linewidth=2, foreground='white'),
            path_effects.Normal()])

    # Largest bound over BOTH axes (the original read the x limits twice).
    limit = max(*ax.get_xlim(), *ax.get_ylim())

    ax.set_xlim(0, limit)
    ax.set_ylim(0, limit)

    ax.set_xlabel("Feature space for the 1st feature")
    ax.set_ylabel("Feature space for the 2nd feature")
    return ax


def silhouette_plot(X, y, n_clusters, ax=None):
    """
    Draw one horizontal silhouette band per cluster plus the average score.

    For every ``i`` in ``range(n_clusters)`` the silhouette values of the
    samples with ``y == i`` are sorted and drawn as a filled band, stacked
    vertically with a gap of 2 between bands and labelled ``i + 1``.  A red
    vertical line marks the overall silhouette score, captioned 'Average'
    just above the last band.

    Parameters
    ----------
    X : array-like
        Sample data, passed unchanged to scikit-learn's ``silhouette_score``
        and ``silhouette_samples``.
    y : array-like, shape (n_samples,)
        Cluster label of every sample; bands are looked up by the integer
        labels ``0 .. n_clusters - 1``.
    n_clusters : int
        Number of bands to draw.
    ax : matplotlib axes, optional
        Axes to draw on; defaults to ``plt.gca()``.

    Returns
    -------
    The axes that were drawn on.
    """
    from sklearn.metrics import silhouette_samples, silhouette_score

    if ax is None:
        ax = plt.gca()

    # Compute the average score and the silhouette value of each sample
    silhouette_avg = silhouette_score(X, y)
    sample_silhouette_values = silhouette_samples(X, y)

    # As an array so that ``labels == i`` is an element-wise mask even when
    # the caller passes a plain list.
    labels = np.asarray(y)

    y_lower = padding = 2
    # Pre-bind so the final set_ylim() also works when the loop never runs.
    y_upper = y_lower
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = np.sort(
            sample_silhouette_values[labels == i])

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # ``cm.spectral`` no longer exists in current matplotlib releases;
        # ``nipy_spectral`` is the name the same colormap is available under.
        color = cm.nipy_spectral(float(i) / n_clusters)
        ax.fill_betweenx(np.arange(y_lower, y_upper),
                         0,
                         ith_cluster_silhouette_values,
                         facecolor=color,
                         edgecolor=color,
                         alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i + 1))

        # Compute the new y_lower for next plot
        y_lower = y_upper + padding

    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax.axvline(x=silhouette_avg, c='r', alpha=0.8, lw=0.8, ls='-')
    # No arrowprops are given, so only the caption at ``xytext`` is drawn.
    ax.annotate('Average',
                xytext=(silhouette_avg, y_lower * 1.025),
                xy=(0, 0),
                ha='center',
                alpha=0.8,
                color='r')

    ax.set_yticks([])  # Clear the yaxis labels / ticks
    ax.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1])
    ax.set_ylim(0, y_upper + 1)
    ax.set_xlim(-0.075, 1.0)
    return ax
	import numpy as np
	import matplotlib.cm as cm
	import matplotlib.pyplot as plt

	from matplotlib.colors import colorConverter

	__license__ = 'MIT'
	__author__ = 'clintval'

	def darken_rgb(rgb, p):
	    """
	    Scale every channel of ``rgb`` towards zero by the proportion ``p``.

	    Each channel ``x`` becomes ``int(x * (1 - p))``: ``p = 0`` keeps the
	    (integer-truncated) channel values and ``p = 1`` yields all zeros.
	    ``p`` is a proportion in [0, 1], not a percentage; a value outside
	    that range trips the assertion.  A new list of ints is returned.
	    """
	    assert 0 <= p <= 1, "Proportion must be [0, 1]"
	    remaining = 1 - p
	    # int() truncates towards zero instead of rounding.
	    return [int(channel * remaining) for channel in rgb]


	def lighten_rgb(rgb, p):
	    """
	    Move every channel of ``rgb`` towards 255 by the proportion ``p``.

	    Each channel ``x`` becomes ``int((255 - x) * p + x)``: ``p = 0`` keeps
	    the (integer-truncated) channel values and ``p = 1`` yields 255 for
	    every channel.  ``p`` is a proportion in [0, 1], not a percentage; a
	    value outside that range trips the assertion.  A new list of ints is
	    returned.
	    """
	    assert 0 <= p <= 1, "Proportion must be [0, 1]"
	    # Close the fraction ``p`` of the gap between each channel and 255.
	    return [int((255 - channel) * p + channel) for channel in rgb]


	def is_luminous(rgb):
	    """
	    Report whether the weighted luminance of ``rgb`` is below 0.179.

	    Every component ``c`` is first linearised (``c / 12.92`` when
	    ``c <= 0.03928``, otherwise ``((c + 0.055) / 1.055) ** 2.4``); the
	    first three linearised components are then combined with the weights
	    0.2126, 0.7152 and 0.0722.

	    NOTE: despite the name, ``True`` is returned when the luminance is
	    *below* the threshold.  The result is always a plain ``bool``.
	    """
	    new_color = []

	    for c in rgb:
	        if c <= 0.03928:
	            new_color.append(c / 12.92)
	        else:
	            new_color.append(((c + 0.055) / 1.055) ** 2.4)

	    # zip() only consumes as many values as there are weights.
	    weights = (0.2126, 0.7152, 0.0722)
	    luminance = sum(w * v for w, v in zip(weights, new_color))

	    return bool(luminance < 0.179)


	def kmeans_plot(X, y, cluster_centers, ax=None):
	    """
	    Scatter the samples coloured by cluster and number each cluster centre.

	    Samples are coloured with the ``nipy_spectral`` colormap evaluated at
	    ``label / len(cluster_centers)``.  Every centre gets a 1-based numeric
	    label, shifted right by 20% of the largest centre x-coordinate and
	    drawn with a white outline in the colour of the sample closest to that
	    centre (darkened by 35% unless ``is_luminous`` returns ``True`` for
	    it).  Both axes are finally set to ``(0, limit)``, where ``limit`` is
	    the largest current x or y axis bound.

	    Parameters
	    ----------
	    X : array-like, shape (n_samples, n_features)
	        Sample coordinates; the first two columns are plotted.
	    y : array-like, shape (n_samples,)
	        Cluster label of every sample.
	    cluster_centers : array-like, shape (n_clusters, n_features)
	        Coordinates of the cluster centres.
	    ax : matplotlib axes, optional
	        Axes to draw on; defaults to ``plt.gca()``.

	    Returns
	    -------
	    The axes that were drawn on.
	    """
	    import matplotlib.patheffects as path_effects
	    from sklearn.metrics.pairwise import pairwise_distances_argmin_min

	    if ax is None:
	        ax = plt.gca()

	    X = np.asarray(X)
	    cluster_centers = np.asarray(cluster_centers)

	    # ``cm.spectral`` no longer exists in current matplotlib releases;
	    # ``nipy_spectral`` is the name the same colormap is available under.
	    labels = np.asarray(y, dtype=float)
	    colors = cm.nipy_spectral(labels / len(cluster_centers))

	    # Index the two plotted columns explicitly so inputs with more than
	    # two features do not spill extra positional arguments into scatter().
	    ax.scatter(X[:, 0], X[:, 1], lw=0, c=colors, s=30)

	    offset = cluster_centers[:, 0].max() * 0.2

	    for i, cluster in enumerate(cluster_centers):
	        # Index of the sample nearest to this centre; its colour is reused
	        # for the label.
	        index, _ = pairwise_distances_argmin_min(cluster.reshape(1, -1), Y=X)
	        cluster_color = colorConverter.to_rgb(colors[index[0]])

	        # is_luminous() returns True for colours whose luminance is *below*
	        # its threshold, so only the remaining (lighter) colours are
	        # darkened.
	        if not is_luminous(cluster_color):
	            # to_rgb() yields floats in [0, 1].  darken_rgb() truncates
	            # each channel with int(), which would turn all of them into
	            # 0, so the 35% darkening is applied on the float scale.
	            cluster_color = tuple(c * (1 - 0.35) for c in cluster_color)

	        label = ax.text(x=cluster[0] + offset,
	                        y=cluster[1],
	                        s='{:d}'.format(i + 1),
	                        color=cluster_color)
	        # White outline keeps the number legible on top of the scatter.
	        label.set_path_effects([
	            path_effects.Stroke(linewidth=2, foreground='white'),
	            path_effects.Normal()])

	    # Largest bound over both the x and the y axis.
	    limit = max(*ax.get_xlim(), *ax.get_ylim())

	    ax.set_xlim(0, limit)
	    ax.set_ylim(0, limit)

	    ax.set_xlabel("Feature space for the 1st feature")
	    ax.set_ylabel("Feature space for the 2nd feature")
	    return ax


	def silhouette_plot(X, y, n_clusters, ax=None):
	    """
	    Draw one horizontal silhouette band per cluster plus the average score.

	    For every ``i`` in ``range(n_clusters)`` the silhouette values of the
	    samples with ``y == i`` are sorted and drawn as a filled band, stacked
	    vertically with a gap of 2 between bands and labelled ``i + 1``.  A
	    red vertical line marks the overall silhouette score, captioned
	    'Average' just above the last band.

	    Parameters
	    ----------
	    X : array-like
	        Sample data, passed unchanged to scikit-learn's
	        ``silhouette_score`` and ``silhouette_samples``.
	    y : array-like, shape (n_samples,)
	        Cluster label of every sample; bands are looked up by the integer
	        labels ``0 .. n_clusters - 1``.
	    n_clusters : int
	        Number of bands to draw.
	    ax : matplotlib axes, optional
	        Axes to draw on; defaults to ``plt.gca()``.

	    Returns
	    -------
	    The axes that were drawn on.
	    """
	    from sklearn.metrics import silhouette_samples, silhouette_score

	    if ax is None:
	        ax = plt.gca()

	    # Compute the average score and the silhouette value of each sample
	    silhouette_avg = silhouette_score(X, y)
	    sample_silhouette_values = silhouette_samples(X, y)

	    # As an array so that ``labels == i`` is an element-wise mask even
	    # when the caller passes a plain list.
	    labels = np.asarray(y)

	    y_lower = padding = 2
	    # Pre-bind so the final set_ylim() also works when the loop never runs.
	    y_upper = y_lower
	    for i in range(n_clusters):
	        # Aggregate the silhouette scores for samples belonging to
	        # cluster i, and sort them
	        ith_cluster_silhouette_values = np.sort(
	            sample_silhouette_values[labels == i])

	        size_cluster_i = ith_cluster_silhouette_values.shape[0]
	        y_upper = y_lower + size_cluster_i

	        # ``cm.spectral`` no longer exists in current matplotlib releases;
	        # ``nipy_spectral`` is the name the same colormap is available
	        # under.
	        color = cm.nipy_spectral(float(i) / n_clusters)
	        ax.fill_betweenx(np.arange(y_lower, y_upper),
	                         0,
	                         ith_cluster_silhouette_values,
	                         facecolor=color,
	                         edgecolor=color,
	                         alpha=0.7)

	        # Label the silhouette plots with their cluster numbers at the
	        # middle
	        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i + 1))

	        # Compute the new y_lower for next plot
	        y_lower = y_upper + padding

	    ax.set_xlabel("The silhouette coefficient values")
	    ax.set_ylabel("Cluster label")

	    # The vertical line for average silhouette score of all the values
	    ax.axvline(x=silhouette_avg, c='r', alpha=0.8, lw=0.8, ls='-')
	    # No arrowprops are given, so only the caption at ``xytext`` is drawn.
	    ax.annotate('Average',
	                xytext=(silhouette_avg, y_lower * 1.025),
	                xy=(0, 0),
	                ha='center',
	                alpha=0.8,
	                color='r')

	    ax.set_yticks([])  # Clear the yaxis labels / ticks
	    ax.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1])
	    ax.set_ylim(0, y_upper + 1)
	    ax.set_xlim(-0.075, 1.0)
	    return ax