Instantly share code, notes, and snippets.

# StuartGordonReid/ClusteringObjectiveFunctions.py

Created June 15, 2015 14:43
Star You must be signed in to star a gist
Clustering Objective Functions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 class ClusteringQuality: """ Instances of this class implement the two measures of clustering quality discussed in the article, namely the davies bouldin index and the silhouette index. It also implements a number of useful helper methods. :param solution: the clustering solution of type Clustering :param minimum: the minimum distance allowable """ def __init__(self, solution, minimum): """ Initializes a ClusteringQuality object with a given Clustering solution and a minimum distance :param solution: this is an object of type Clustering :param minimum: this is the minimum distance allowed between two points """ assert isinstance(solution, Clustering) self.solution = solution self.e = minimum def cluster_totals(self): """ This method calculates the total distance from every centroid to every pattern assigned to it. It also records the number of patterns in each cluster which are used to compute average distances in cluster_averages() :return: a two dimensional list of [total cluster distance, total patterns in cluster] for each centroid """ s = Similarity(self.e) # create array (will be 2d) to store total internal cluster distances and cluster counts for each centroid cluster_distances_counts = [] for i in range(self.solution.num_clusters): ith_cluster_count = 0.0 ith_cluster_distance = 0.0 for z in range(len(self.solution.solution)): # update the count and the total distance for the centroid z[i] belongs to (whichever one that is) if self.solution.solution[z] == i: ith_cluster_count += 1 ith_cluster_distance += s.fractional_distance(self.solution.patterns[z], self.solution.centroids[i]) # add the result to the 2d list cluster_distances_counts.append([ith_cluster_distance, max(ith_cluster_count, 1.0)]) return np.array(cluster_distances_counts) def cluster_averages(self): """ Receives output from cluster_totals() and computes the average distance per centroid :return: average distance from each centroid to the patterns assigned to it """ # create list to store averages in cluster_averages = [] # get the total internal cluster distances plus the counts for each centroid / cluster cluster_distances_counts = self.cluster_totals() for i in range(len(cluster_distances_counts)): # calculate the averages and add it to the list cluster_averages.append(cluster_distances_counts[i] / cluster_distances_counts[i]) return np.array(cluster_averages) def davies_bouldin(self): """ This method computes the davies-bouldin (db) of a given clustering. :return: the davies bouldin value of the clustering """ # get the average internal cluster distances cluster_averages = self.cluster_averages() # create variable for db davies_bouldin = 0.0 s = Similarity(self.e) # for each cluster / centroid i for i in range(self.solution.num_clusters): # for each cluster / centroid j for j in range(self.solution.num_clusters): # when i and j are not the same cluster / centroid if j != i: # calculate the distance between the two centroids of i and j d_ij = s.fractional_distance(self.solution.centroids[i], self.solution.centroids[j]) # update the variable to equal to sum of internal cluster distances of clusters i and j divided by # the previously computer value i.e. the distance between centroid i and centroid j d_ij = (cluster_averages[i] + cluster_averages[j]) / d_ij # update db is this is larger than any db seen before davies_bouldin = max(d_ij, davies_bouldin) return davies_bouldin def silhouette_index(self, index): """ This method computes the silhouette index (si) for any given pattern between -1 and 1 :param index: the pattern we are looking at now :return: the silhouette index for that pattern """ # store the total distance to each cluster silhouette_totals = [] # store the number of patterns in each cluster silhouette_counts = [] # initialize the variables for i in range(self.solution.num_clusters): silhouette_totals.append(0.0) silhouette_counts.append(0.0) s = Similarity(self.e) for i in range(len(self.solution.patterns)): # for every pattern other than the one we are calculating now if i != index: # get the distance between pattern[index] and that pattern distance = s.fractional_distance(self.solution.patterns[i], self.solution.patterns[index]) # add that distance to the silhouette totals for the correct cluster silhouette_totals[self.solution.solution[i]] += distance # update the number of patterns in that cluster silhouette_counts[self.solution.solution[i]] += 1 # setup variable to find the cluster (not equal to the pattern[index]'s cluster) with the smallest distance smallest_silhouette = silhouette_totals / max(1.0, silhouette_counts) for i in range(len(silhouette_totals)): # calculate the average distance of each pattern in that cluster from pattern[index] silhouette = silhouette_totals[i] / max(1.0, silhouette_counts[i]) # if the average distance is lower and it isn't pattern[index] cluster update the value if silhouette < smallest_silhouette and i != self.solution.solution[index]: smallest_silhouette = silhouette # calculate the internal cluster distances for pattern[index] index_cluster = self.solution.solution[index] index_silhouette = self.e + silhouette_totals[index_cluster] / max(1.0, silhouette_counts[index_cluster]) # return the ratio between the smallest distance from pattern[index] to another cluster's patterns and # the patterns belong to the same cluster as pattern[index] return (smallest_silhouette - index_silhouette) / max(smallest_silhouette, index_silhouette) def silhouette_index_zero_one(self, index): """ Returns the silhouette index between 0 and 1 and makes it a minimization objective (easier) :param index: the pattern we are looking at now :return: the silhouette index for that pattern """ return 1 - ((1 + self.silhouette_index(index)) / 2.0) def average_silhouette_index(self, scaled_zero_one=True): """ This method computes the average silhouette index value every pattern in the data set. :param scaled_zero_one: allows you to scale the result between 0 and 1 and reverse the order :return: the silhouette index of the given clustering """ silhouette_sum = 0.0 for i in range(len(self.solution.patterns)): if scaled_zero_one: silhouette_sum += self.silhouette_index_zero_one(i) else: silhouette_sum += self.silhouette_index(i) return silhouette_sum / len(self.solution.patterns) def quantization_error(self): """ This method calculates the quantization error of the given clustering :return: the quantization error """ total_distance = 0.0 s = Similarity(self.e) for i in range(len(self.solution.patterns)): total_distance += math.pow(s.fractional_distance(self.solution.patterns[i], self.solution.centroids[self.solution.solution[i]]), 2.0) return total_distance / len(self.solution.patterns)

### ihsansatriawan commented May 9, 2016

Thx for the code 👍 , but i think it's will be awesome if you can give example :)