Skip to content

Instantly share code, notes, and snippets.

@keizerzilla
Created December 2, 2018 01:18
Show Gist options
  • Save keizerzilla/a72056c0905e57a036e57b03b557c896 to your computer and use it in GitHub Desktop.
Save keizerzilla/a72056c0905e57a036e57b03b557c896 to your computer and use it in GitHub Desktop.
Dunn index for sklearn-generated clusters
import numpy as np
def normalize_to_smallest_integers(labels):
    """Remap cluster labels onto the consecutive integers 0..k-1.

    The k distinct values in *labels* are replaced by their rank in sorted
    order, so the smallest label becomes 0, the next becomes 1, and so on.
    The relative order of the labels is preserved.

    :param labels: list or 1-D numpy.array of cluster labels, one per element
    :returns: numpy.array of dtype int32 with values in 0..k-1

    Fixes over the original:
      * plain Python lists were silently mishandled (``labels == clust`` on a
        list is a scalar bool, not an element-wise mask) — the input is now
        converted with ``np.asarray`` first;
      * a ``-1`` (noise) label made the old index arithmetic raise IndexError;
        every distinct value, including -1, now simply maps to its sorted rank.
    """
    labels = np.asarray(labels)
    # np.unique returns the sorted distinct values and, with return_inverse,
    # each element's index into that sorted array — exactly the 0..k-1 map.
    _, inverse = np.unique(labels, return_inverse=True)
    return inverse.astype(np.int32)
def dunn(labels, distances):
    """Compute the Dunn index for a clustering (the bigger, the better).

    .. math:: D = \\min_{i = 1 \\ldots n_c; j = i + 1\\ldots n_c} \\left\\lbrace \\frac{d \\left( c_i,c_j \\right)}{\\max_{k = 1 \\ldots n_c} \\left(diam \\left(c_k \\right) \\right)} \\right\\rbrace

    Here :math:`d(c_i, c_j)` is the separation between clusters — the
    distance between their two closest members — and :math:`diam(c_k)` is a
    cluster's diameter, the distance between its two farthest members.
    Larger values are better: they indicate compact, well-separated clusters.

    :param labels: a list containing cluster labels for each of the n elements
    :param distances: an n x n numpy.array containing the pairwise distances between elements

    .. [Kovacs2005] Kovács, F., Legány, C., & Babos, A. (2005). Cluster
       validity measurement techniques. 6th International Symposium of
       Hungarian Researchers on Computational Intelligence.
    """
    labels = normalize_to_smallest_integers(labels)
    separations = np.unique(min_cluster_distances(labels, distances))
    widest = max(diameter(labels, distances))
    # The separation matrix carries zeros on its diagonal, which sort first
    # in the unique array; skip that zero whenever any other value exists.
    pick = 1 if np.size(separations) > 1 else 0
    return separations[pick] / widest
def min_cluster_distances(labels, distances):
    """Compute the single-linkage separation between every pair of clusters.

    Entry [a, b] of the returned matrix is the distance between the two
    closest points drawn from clusters a and b; diagonal entries are 0.

    :param labels: a list containing cluster labels for each of the n elements
    :param distances: an n x n numpy.array containing the pairwise distances between elements
    :returns: a k x k symmetric numpy.array (k = number of clusters)

    Bug fix: the original initialized the accumulator to zeros and used a
    plain ``<`` comparison, so no non-negative distance was ever recorded and
    the matrix stayed all zeros (making the Dunn index constantly 0).
    Off-diagonal cells now start at ``inf`` so the minimum is tracked
    correctly; the diagonal stays 0, matching what `dunn` expects.
    """
    labels = normalize_to_smallest_integers(labels)
    n_unique_labels = len(np.unique(labels))
    min_distances = np.full((n_unique_labels, n_unique_labels), np.inf)
    np.fill_diagonal(min_distances, 0.0)
    for i in np.arange(0, len(labels) - 1):
        for ii in np.arange(i + 1, len(labels)):
            a, b = labels[i], labels[ii]
            if a != b and distances[i, ii] < min_distances[a, b]:
                # keep the matrix symmetric as we go
                min_distances[a, b] = min_distances[b, a] = distances[i, ii]
    return min_distances
def diameter(labels, distances):
    """Return each cluster's diameter: the distance between its two
    farthest members (0 for singleton clusters).

    :param labels: a list containing cluster labels for each of the n elements
    :param distances: an n x n numpy.array containing the pairwise distances between elements
    :returns: a numpy.array with one diameter per cluster, indexed by label
    """
    labels = normalize_to_smallest_integers(labels)
    diameters = np.zeros(len(np.unique(labels)))
    n = len(labels)
    for a in range(n - 1):
        for b in range(a + 1, n):
            # only pairs inside the same cluster contribute to its diameter
            if labels[a] != labels[b]:
                continue
            cluster = labels[a]
            if distances[a, b] > diameters[cluster]:
                diameters[cluster] = distances[a, b]
    return diameters
if __name__ == '__main__':
    from sklearn.metrics.pairwise import euclidean_distances
    from sklearn.datasets import load_iris
    from sklearn.cluster import KMeans

    # Demo: score the iris ground-truth partition and a fresh 3-cluster
    # k-means partition with the Dunn index.
    iris = load_iris()
    true_labels = iris['target']
    features = iris['data']
    kmeans_labels = KMeans(n_clusters=3).fit_predict(features)

    pairwise = euclidean_distances(features)
    dunn_true = dunn(true_labels, pairwise)
    dunn_kmeans = dunn(kmeans_labels, pairwise)
    print(features, true_labels, dunn_true, dunn_kmeans)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment