Skip to content

Instantly share code, notes, and snippets.

@IshankGulati
Last active January 18, 2023 07:37
Show Gist options
  • Save IshankGulati/85e0051fb4c67e48c414 to your computer and use it in GitHub Desktop.
Save IshankGulati/85e0051fb4c67e48c414 to your computer and use it in GitHub Desktop.
Clustering using Genetic Algorithm
"""Clustering using Genetic Algorithm"""
# Author: Ishank Gulati <gulati.ishank@gmail.com>
from __future__ import print_function
from __future__ import division
import numpy as np
from scipy.spatial.distance import euclidean
import scipy.misc as spm
from matplotlib import pyplot
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state
from sklearn.utils import check_array
class GAclustering(BaseEstimator, ClusterMixin):
"""
Genetic Algorithm based Clustering
Reference
---------
Genetic algorithm-based clustering technique.
Ujjwal Maulik, Sanghamitra Bandyoypadhyay.
2000.
"""
def __init__(self, n_clusters=3, population_size=10, max_iter=100,
prob_crossover=0.8, prob_mutation=0.001, tol=1e-3,
verbose=0, random_state=None):
self.n_clusters = n_clusters
self.population_size = population_size
self.max_iter = max_iter
self.prob_crossover = prob_crossover
self.prob_mutation = prob_mutation
self.tol = tol
self.verbose = verbose
self.random_state = random_state
def _check_fit_data(self, X):
X = check_array(X, dtype=np.float64)
n_samples, n_features = X.shape
if n_samples < self.n_clusters:
raise ValueError("Number of samples="+str(n_samples)+" should be\
greater than number of clusters="+str(self.n_clusters))
return X
def _check_test_data(self, X):
X = check_array(X, dtype=np.float64)
n_samples, n_features = X.shape
expected_n_features = self.centers_.shape[1]
if not expected_n_features == n_features:
raise ValueError("Incorrect number of features")
return X
def _compute_fitness(self, X, population, pop_labels, dist):
n_samples, n_features = X.shape
metric = np.zeros((self.population_size, n_features))
for crm in xrange(self.population_size):
dist.fill(0)
centers = np.resize(population[crm], (self.n_clusters, n_features))
for i in xrange(n_samples):
for j in xrange(self.n_clusters):
cost = euclidean(X[i], centers[j])
dist[i, j] = cost
pop_labels[:, crm] = dist.argmin(axis=1)
for i in xrange(self.n_clusters):
mask = pop_labels[:, crm] == i
if np.sum(mask) == 0:
raise ValueError("Empty Cluster")
temp = X[mask]
count = np.shape(temp)[0]
centers[i] = np.sum(temp, axis=0)/count
metric_temp = np.zeros((count, n_features))
for j in xrange(count):
metric_temp[j] = euclidean(temp[j], centers[i])
metric[crm] += np.sum(metric_temp, axis=0)
centers = np.reshape(centers, (1, self.n_clusters*n_features))
population[crm] = centers
self.fitness_[:, 0] = np.sum(1/metric, axis=1)
def _selection(self, population, pop_labels, n_remove):
for i in xrange(n_remove):
min_idx = np.argmin(self.fitness_, axis=0)
population = np.delete(population, min_idx, axis=0)
pop_labels = np.delete(pop_labels, min_idx, axis=1)
self.fitness_ = np.delete(self.fitness_, min_idx, axis=0)
self.population_size -= n_remove
def _crossover(self, population, n_features):
n_crossovers = int(self.prob_crossover*self.population_size//2)
children = np.zeros((n_crossovers*2, self.n_clusters*n_features))
for cross in xrange(n_crossovers):
rs = check_random_state(self.random_state)
parents_idx = rs.randint(self.population_size, size=2)
children[cross*2:cross*2 + 2, :] = population[parents_idx]
cross_point = n_features * rs.randint(1, self.n_clusters)
temp = children[cross*2, cross_point:]
children[cross*2, cross_point:]=children[cross*2 + 1, cross_point:]
children[cross*2 + 1, cross_point:] = temp
return children
def _mutation(self, cross_children, n_features):
n_cross_child = cross_children.shape[0]
n_mutations = int(self.prob_mutation * n_cross_child * self.n_clusters
* n_features)
children = np.zeros((n_mutations, self.n_clusters*n_features))
for mut in xrange(n_mutations):
rs = check_random_state(self.random_state)
idx = rs.randint(n_cross_child)
delta = rs.uniform(0, 1)
gene = rs.randint(self.n_clusters*n_features)
sign = rs.randint(2)
children[mut] = cross_children[idx]
if sign == 0:
if children[mut, gene] != 0:
children[mut, gene] += 2*delta*children[mut, gene]
else:
children[mut, gene] = 2*delta
else:
if children[mut, gene] != 0:
children[mut, gene] -= 2*delta*children[mut, gene]
else:
children[mut, gene] = -2*delta
return children
def fit(self, X, y=None):
X = self._check_fit_data(X)
if self.max_iter <= 0:
raise ValueError("Maximum number of iterations must be greater \
than zero")
n_samples, n_features = X.shape
if self.population_size*self.n_clusters > n_samples:
raise ValueError("Population size is very large")
rs = check_random_state(self.random_state)
self.labels_ = np.zeros((n_samples))
pop_labels = np.zeros((n_samples, self.population_size))
population = np.zeros((self.population_size, self.n_clusters *
n_features))
for crm in xrange(self.population_size):
population_idx = rs.randint(n_samples, size=self.n_clusters)
population[crm, :] = X[population_idx].ravel()
self.centers_ = np.zeros((self.n_clusters, n_features))
dist = np.zeros((n_samples, self.n_clusters))
self.fitness_ = np.zeros((self.population_size, 1))
old_population_size = self.population_size
for itr in xrange(self.max_iter):
self._compute_fitness(X, population, pop_labels, dist)
fittest = np.argmax(self.fitness_, axis=0)
self.centers_ = np.reshape(population[fittest, :],
(self.n_clusters, n_features))
labels_old = self.labels_
self.labels_ = pop_labels[:, fittest].ravel()
n_same = np.sum(labels_old == self.labels_)
if 1-n_same/n_samples < self.tol:
if self.verbose:
print("Converged at iteration " + str(itr+1))
break
n_remove = self.population_size-old_population_size
self._selection(population, pop_labels, n_remove)
self.population_size = old_population_size
cross_children = self._crossover(population, n_features)
mut_children = self._mutation(cross_children, n_features)
np.append(population, cross_children, axis=0)
np.append(population, mut_children, axis=0)
old_population_size = self.population_size
self.population_size = population.shape[0]
new_labels = rs.randint(self.n_clusters, size=(n_samples,
self.population_size-old_population_size))
np.append(pop_labels, new_labels, axis=1)
self.X_fit_ = X
return self
def predict(self, X):
X = self._check_test_data(X)
n_samples = X.shape[0]
dist = np.zeros((n_samples, self.n_clusters))
for i in xrange(n_samples):
for j in xrange(self.n_clusters):
cost = euclidean(X[i], self.centers_[j])
dist[i, j] = cost
return dist.argmin(axis=1)
def segmentImage():
img = spm.lena()
#img = spm.imread("orig.png")
img = spm.imresize(img, (300, 300))
height, width = img.shape
clf = GAclustering(n_clusters=11, max_iter=100, population_size=5,
random_state=23, verbose=1)
img_list = np.reshape(img, (height*width, 1))
clf.fit(img_list)
index = np.copy(clf.labels_)
index = np.reshape(index, (height, width))
axes = pyplot.gca()
axes.imshow(index)
pyplot.show(block=True)
if __name__ == '__main__':
segmentImage()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment