Last active
April 9, 2024 12:11
-
-
Save bitsnaps/12415200fc62539fff852a1b46168d0a to your computer and use it in GitHub Desktop.
Clustering using AgglomerativeClustering and silhouette scoring
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create a file "dataset.csv" with the following contents
# (remove the leading "# " from each line):
# ID,Height,time_of_day,resolution
# 272,1.567925,1.375000,0.594089
# 562,1.807508,1.458333,0.594089
# 585,2.693542,0.416667,0.594089
# 610,1.036305,1.458333,0.594089
# 633,1.117111,0.416667,0.594089
# 658,1.542407,1.458333,0.594089
# 681,1.930844,0.416667,0.594089
# 802,1.505548,1.458333,0.594089
# 808,1.009369,1.708333,0.594089
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- registers the "3d" projection on older matplotlib
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.neighbors import NearestCentroid
def _plot_silhouette_figure(X, cluster_labels, n_clusters, silhouette_avg):
    """Show a silhouette plot (left) and a 3D scatter (right) for one clustering.

    Args:
        X: 2-D array of samples; the scatter reads columns 1, 2 and 0.
        cluster_labels: Integer cluster label of each row of ``X``.
        n_clusters: Number of clusters the labels were produced with.
        silhouette_avg: Mean silhouette score, drawn as a vertical dashed line.
    """
    fig = plt.figure()
    fig.set_size_inches(15, 5)
    ax1 = fig.add_subplot(1, 2, 1)
    # Create the 3D axes through the figure so it is actually attached to it;
    # a bare ``Axes3D(fig)`` is no longer added to the figure automatically on
    # current matplotlib releases.
    ax2 = fig.add_subplot(1, 2, 2, projection='3d')

    ax1.set_xlim([-0.1, 1])
    # Leave a 10-row gap between the silhouette bands of consecutive clusters.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    sample_silhouette_values = silhouette_samples(X, cluster_labels)
    y_lower = 10
    for i in range(n_clusters):
        # Sorted copy of the silhouette values of the samples in cluster i.
        ith_cluster_silhouette_values = np.sort(
            sample_silhouette_values[cluster_labels == i])
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)
        # Label the band with its cluster number, vertically centred.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        y_lower = y_upper + 10  # start of the next band

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(X[:, 1], X[:, 2], X[:, 0], marker='o', s=20, lw=0, alpha=0.7,
                c=colors, edgecolor='k')

    plt.suptitle(("Silhouette analysis for HAC-ward clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')
    plt.show()


def clustering(df1, range_n_clusters=(2, 3, 4), min_silhouette=0.4):
    """Run Ward agglomerative clustering and report the best cluster count.

    For every candidate cluster count the data is clustered, the cluster
    centroids are printed, and the average silhouette score is computed.
    Candidates whose score exceeds ``min_silhouette`` are printed, plotted and
    kept; the kept candidate with the highest score is reported at the end.

    Args:
        df1: pandas DataFrame; every column is used as a feature, and the
            scatter plot needs at least three columns.
        range_n_clusters: Iterable of cluster counts to try.
        min_silhouette: A candidate is only plotted and considered when its
            average silhouette score is strictly greater than this value.

    Returns:
        None. Results are printed and shown as matplotlib figures.
    """
    X = df1.iloc[:].values
    silhouette_values = {}

    for n_clusters in range_n_clusters:
        clusterer = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
        cluster_labels = clusterer.fit_predict(X)

        # NearestCentroid is used only to obtain the per-cluster centroids.
        clf = NearestCentroid()
        clf.fit(X, cluster_labels)
        print("Centroids:")
        print(clf.centroids_)

        silhouette_avg = silhouette_score(X, cluster_labels)
        if silhouette_avg > min_silhouette:
            print("For n_clusters =", n_clusters,
                  "The average silhouette_score is :", silhouette_avg)
            silhouette_values[n_clusters] = silhouette_avg
            _plot_silhouette_figure(X, cluster_labels, n_clusters, silhouette_avg)

    if not silhouette_values:
        # max() on an empty dict would raise ValueError; report it instead.
        print("No number of clusters reached a silhouette score above %.2f"
              % min_silhouette)
        return

    optimal_nbr_clusters = max(silhouette_values, key=silhouette_values.get)
    print("Best Nbr of Clusters = %d, according to silhouette score: %.2f"
          % (optimal_nbr_clusters, silhouette_values[optimal_nbr_clusters]))
    return
# Run the demo only when executed as a script, so importing this module does
# not read "dataset.csv" or open plot windows as a side effect.
if __name__ == "__main__":
    df1 = pd.read_csv('dataset.csv')
    clustering(df1)
# You can add these lines to the end of the clustering() function if you want to plot a dendrogram:
# from scipy.cluster.hierarchy import centroid, fcluster
# from scipy.spatial.distance import pdist
# import scipy.cluster.hierarchy as sch
# dendrogram = sch.dendrogram(sch.linkage(X, method='ward'))
# plt.title("Dendrogram")
# plt.xlabel("X")
# plt.ylabel("Euclidean Distances")
# plt.show()
# (Note: newer scikit-learn releases rename the `affinity` argument below to `metric`.)
# clusterer = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='ward')
# y_predict = clusterer.fit_predict(X)
# cluster_labels = clusterer.labels_
# clf = NearestCentroid(metric='euclidean')
# clf.fit(X, y_predict)
# print("Centroids:")
# print(clf.centroids_)
# y = pdist(df1)
# print(y)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment