Last active
January 18, 2023 07:37
-
-
Save IshankGulati/85e0051fb4c67e48c414 to your computer and use it in GitHub Desktop.
Clustering using Genetic Algorithm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Clustering using Genetic Algorithm""" | |
# Author: Ishank Gulati <gulati.ishank@gmail.com> | |
from __future__ import print_function | |
from __future__ import division | |
import numpy as np | |
from scipy.spatial.distance import euclidean | |
import scipy.misc as spm | |
from matplotlib import pyplot | |
from sklearn.base import BaseEstimator, ClusterMixin | |
from sklearn.utils import check_random_state | |
from sklearn.utils import check_array | |
class GAclustering(BaseEstimator, ClusterMixin): | |
""" | |
Genetic Algorithm based Clustering | |
Reference | |
--------- | |
Genetic algorithm-based clustering technique. | |
Ujjwal Maulik, Sanghamitra Bandyoypadhyay. | |
2000. | |
""" | |
def __init__(self, n_clusters=3, population_size=10, max_iter=100, | |
prob_crossover=0.8, prob_mutation=0.001, tol=1e-3, | |
verbose=0, random_state=None): | |
self.n_clusters = n_clusters | |
self.population_size = population_size | |
self.max_iter = max_iter | |
self.prob_crossover = prob_crossover | |
self.prob_mutation = prob_mutation | |
self.tol = tol | |
self.verbose = verbose | |
self.random_state = random_state | |
def _check_fit_data(self, X): | |
X = check_array(X, dtype=np.float64) | |
n_samples, n_features = X.shape | |
if n_samples < self.n_clusters: | |
raise ValueError("Number of samples="+str(n_samples)+" should be\ | |
greater than number of clusters="+str(self.n_clusters)) | |
return X | |
def _check_test_data(self, X): | |
X = check_array(X, dtype=np.float64) | |
n_samples, n_features = X.shape | |
expected_n_features = self.centers_.shape[1] | |
if not expected_n_features == n_features: | |
raise ValueError("Incorrect number of features") | |
return X | |
def _compute_fitness(self, X, population, pop_labels, dist): | |
n_samples, n_features = X.shape | |
metric = np.zeros((self.population_size, n_features)) | |
for crm in xrange(self.population_size): | |
dist.fill(0) | |
centers = np.resize(population[crm], (self.n_clusters, n_features)) | |
for i in xrange(n_samples): | |
for j in xrange(self.n_clusters): | |
cost = euclidean(X[i], centers[j]) | |
dist[i, j] = cost | |
pop_labels[:, crm] = dist.argmin(axis=1) | |
for i in xrange(self.n_clusters): | |
mask = pop_labels[:, crm] == i | |
if np.sum(mask) == 0: | |
raise ValueError("Empty Cluster") | |
temp = X[mask] | |
count = np.shape(temp)[0] | |
centers[i] = np.sum(temp, axis=0)/count | |
metric_temp = np.zeros((count, n_features)) | |
for j in xrange(count): | |
metric_temp[j] = euclidean(temp[j], centers[i]) | |
metric[crm] += np.sum(metric_temp, axis=0) | |
centers = np.reshape(centers, (1, self.n_clusters*n_features)) | |
population[crm] = centers | |
self.fitness_[:, 0] = np.sum(1/metric, axis=1) | |
def _selection(self, population, pop_labels, n_remove): | |
for i in xrange(n_remove): | |
min_idx = np.argmin(self.fitness_, axis=0) | |
population = np.delete(population, min_idx, axis=0) | |
pop_labels = np.delete(pop_labels, min_idx, axis=1) | |
self.fitness_ = np.delete(self.fitness_, min_idx, axis=0) | |
self.population_size -= n_remove | |
def _crossover(self, population, n_features): | |
n_crossovers = int(self.prob_crossover*self.population_size//2) | |
children = np.zeros((n_crossovers*2, self.n_clusters*n_features)) | |
for cross in xrange(n_crossovers): | |
rs = check_random_state(self.random_state) | |
parents_idx = rs.randint(self.population_size, size=2) | |
children[cross*2:cross*2 + 2, :] = population[parents_idx] | |
cross_point = n_features * rs.randint(1, self.n_clusters) | |
temp = children[cross*2, cross_point:] | |
children[cross*2, cross_point:]=children[cross*2 + 1, cross_point:] | |
children[cross*2 + 1, cross_point:] = temp | |
return children | |
def _mutation(self, cross_children, n_features): | |
n_cross_child = cross_children.shape[0] | |
n_mutations = int(self.prob_mutation * n_cross_child * self.n_clusters | |
* n_features) | |
children = np.zeros((n_mutations, self.n_clusters*n_features)) | |
for mut in xrange(n_mutations): | |
rs = check_random_state(self.random_state) | |
idx = rs.randint(n_cross_child) | |
delta = rs.uniform(0, 1) | |
gene = rs.randint(self.n_clusters*n_features) | |
sign = rs.randint(2) | |
children[mut] = cross_children[idx] | |
if sign == 0: | |
if children[mut, gene] != 0: | |
children[mut, gene] += 2*delta*children[mut, gene] | |
else: | |
children[mut, gene] = 2*delta | |
else: | |
if children[mut, gene] != 0: | |
children[mut, gene] -= 2*delta*children[mut, gene] | |
else: | |
children[mut, gene] = -2*delta | |
return children | |
def fit(self, X, y=None): | |
X = self._check_fit_data(X) | |
if self.max_iter <= 0: | |
raise ValueError("Maximum number of iterations must be greater \ | |
than zero") | |
n_samples, n_features = X.shape | |
if self.population_size*self.n_clusters > n_samples: | |
raise ValueError("Population size is very large") | |
rs = check_random_state(self.random_state) | |
self.labels_ = np.zeros((n_samples)) | |
pop_labels = np.zeros((n_samples, self.population_size)) | |
population = np.zeros((self.population_size, self.n_clusters * | |
n_features)) | |
for crm in xrange(self.population_size): | |
population_idx = rs.randint(n_samples, size=self.n_clusters) | |
population[crm, :] = X[population_idx].ravel() | |
self.centers_ = np.zeros((self.n_clusters, n_features)) | |
dist = np.zeros((n_samples, self.n_clusters)) | |
self.fitness_ = np.zeros((self.population_size, 1)) | |
old_population_size = self.population_size | |
for itr in xrange(self.max_iter): | |
self._compute_fitness(X, population, pop_labels, dist) | |
fittest = np.argmax(self.fitness_, axis=0) | |
self.centers_ = np.reshape(population[fittest, :], | |
(self.n_clusters, n_features)) | |
labels_old = self.labels_ | |
self.labels_ = pop_labels[:, fittest].ravel() | |
n_same = np.sum(labels_old == self.labels_) | |
if 1-n_same/n_samples < self.tol: | |
if self.verbose: | |
print("Converged at iteration " + str(itr+1)) | |
break | |
n_remove = self.population_size-old_population_size | |
self._selection(population, pop_labels, n_remove) | |
self.population_size = old_population_size | |
cross_children = self._crossover(population, n_features) | |
mut_children = self._mutation(cross_children, n_features) | |
np.append(population, cross_children, axis=0) | |
np.append(population, mut_children, axis=0) | |
old_population_size = self.population_size | |
self.population_size = population.shape[0] | |
new_labels = rs.randint(self.n_clusters, size=(n_samples, | |
self.population_size-old_population_size)) | |
np.append(pop_labels, new_labels, axis=1) | |
self.X_fit_ = X | |
return self | |
def predict(self, X): | |
X = self._check_test_data(X) | |
n_samples = X.shape[0] | |
dist = np.zeros((n_samples, self.n_clusters)) | |
for i in xrange(n_samples): | |
for j in xrange(self.n_clusters): | |
cost = euclidean(X[i], self.centers_[j]) | |
dist[i, j] = cost | |
return dist.argmin(axis=1) | |
def segmentImage(): | |
img = spm.lena() | |
#img = spm.imread("orig.png") | |
img = spm.imresize(img, (300, 300)) | |
height, width = img.shape | |
clf = GAclustering(n_clusters=11, max_iter=100, population_size=5, | |
random_state=23, verbose=1) | |
img_list = np.reshape(img, (height*width, 1)) | |
clf.fit(img_list) | |
index = np.copy(clf.labels_) | |
index = np.reshape(index, (height, width)) | |
axes = pyplot.gca() | |
axes.imshow(index) | |
pyplot.show(block=True) | |
if __name__ == '__main__': | |
segmentImage() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment