# K-Means
# Gist by @smzn, created January 26, 2024 00:24.
# Full Python code to perform clustering and calculate statistics
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
# Assuming 'data' is the original DataFrame and 'features' are the feature columns
# Scaling the data
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
# Performing PCA
pca = PCA()
principal_components = pca.fit_transform(scaled_features)
# Performing KMeans clustering with 5 clusters
kmeans = KMeans(n_clusters=5, random_state=0).fit(principal_components)
labels = kmeans.labels_
# Adding the cluster labels to the original DataFrame
data_with_clusters = data.copy()
data_with_clusters['Cluster'] = labels
# Calculating mean values of each feature for each cluster
cluster_means = data_with_clusters.groupby('Cluster').mean()
# Calculating the percentage of diabetes cases in each cluster
cluster_diabetes_percentage = data_with_clusters.groupby('Cluster')['Diabetes_binary'].mean() * 100
# Adding the diabetes rate to the cluster means DataFrame
cluster_means_with_diabetes_rate = cluster_means.copy()
cluster_means_with_diabetes_rate['Diabetes Rate'] = cluster_diabetes_percentage
# Renaming the index to 'Cluster' and resetting index
cluster_means_with_diabetes_rate.index.name = 'Cluster'
cluster_means_with_diabetes_rate.reset_index(inplace=True)
# The final DataFrame
cluster_means_with_diabetes_rate
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment