Created
December 2, 2018 01:18
-
-
Save keizerzilla/a72056c0905e57a036e57b03b557c896 to your computer and use it in GitHub Desktop.
Dunn index for sklearn-generated clusters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
def normalize_to_smallest_integers(labels): | |
"""Normalizes a list of integers so that each number is reduced to the minimum possible integer, maintaining the order of elements. | |
:param labels: the list to be normalized | |
:returns: a numpy.array with the values normalized as the minimum integers between 0 and the maximum possible value. | |
""" | |
max_v = len(set(labels)) if -1 not in labels else len(set(labels)) - 1 | |
sorted_labels = np.sort(np.unique(labels)) | |
unique_labels = range(max_v) | |
new_c = np.zeros(len(labels), dtype=np.int32) | |
for i, clust in enumerate(sorted_labels): | |
new_c[labels == clust] = unique_labels[i] | |
return new_c | |
def dunn(labels, distances): | |
""" | |
Dunn index for cluster validation (the bigger, the better) | |
.. math:: D = \\min_{i = 1 \\ldots n_c; j = i + 1\ldots n_c} \\left\\lbrace \\frac{d \\left( c_i,c_j \\right)}{\\max_{k = 1 \\ldots n_c} \\left(diam \\left(c_k \\right) \\right)} \\right\\rbrace | |
where :math:`d(c_i,c_j)` represents the distance between | |
clusters :math:`c_i` and :math:`c_j`, given by the distances between its | |
two closest data points, and :math:`diam(c_k)` is the diameter of cluster | |
:math:`c_k`, given by the distance between its two farthest data points. | |
The bigger the value of the resulting Dunn index, the better the clustering | |
result is considered, since higher values indicate that clusters are | |
compact (small :math:`diam(c_k)`) and far apart. | |
:param labels: a list containing cluster labels for each of the n elements | |
:param distances: an n x n numpy.array containing the pairwise distances between elements | |
.. [Kovacs2005] Kovács, F., Legány, C., & Babos, A. (2005). Cluster validity measurement techniques. 6th International Symposium of Hungarian Researchers on Computational Intelligence. | |
""" | |
labels = normalize_to_smallest_integers(labels) | |
unique_cluster_distances = np.unique(min_cluster_distances(labels, distances)) | |
max_diameter = max(diameter(labels, distances)) | |
if np.size(unique_cluster_distances) > 1: | |
return unique_cluster_distances[1] / max_diameter | |
else: | |
return unique_cluster_distances[0] / max_diameter | |
def min_cluster_distances(labels, distances): | |
"""Calculates the distances between the two nearest points of each cluster. | |
:param labels: a list containing cluster labels for each of the n elements | |
:param distances: an n x n numpy.array containing the pairwise distances between elements | |
""" | |
labels = normalize_to_smallest_integers(labels) | |
n_unique_labels = len(np.unique(labels)) | |
min_distances = np.zeros((n_unique_labels, n_unique_labels)) | |
for i in np.arange(0, len(labels) - 1): | |
for ii in np.arange(i + 1, len(labels)): | |
if labels[i] != labels[ii] and distances[i, ii] < min_distances[labels[i], labels[ii]]: | |
min_distances[labels[i], labels[ii]] = min_distances[labels[ii], labels[i]] = distances[i, ii] | |
return min_distances | |
def diameter(labels, distances): | |
"""Calculates cluster diameters (the distance between the two farthest data points in a cluster) | |
:param labels: a list containing cluster labels for each of the n elements | |
:param distances: an n x n numpy.array containing the pairwise distances between elements | |
:returns: | |
""" | |
labels = normalize_to_smallest_integers(labels) | |
n_clusters = len(np.unique(labels)) | |
diameters = np.zeros(n_clusters) | |
for i in np.arange(0, len(labels) - 1): | |
for ii in np.arange(i + 1, len(labels)): | |
if labels[i] == labels[ii] and distances[i, ii] > diameters[labels[i]]: | |
diameters[labels[i]] = distances[i, ii] | |
return diameters | |
if __name__ == '__main__': | |
from sklearn.metrics.pairwise import euclidean_distances | |
from sklearn.datasets import load_iris | |
from sklearn.cluster import KMeans | |
data = load_iris() | |
c = data['target'] | |
x = data['data'] | |
k = KMeans(n_clusters=3).fit_predict(x) | |
dund = dunn(c, euclidean_distances(x)) | |
dunk = dunn(k, euclidean_distances(x)) | |
print(x, c, dund, dunk) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment