# K-Means
# Created January 26, 2024 00:24
# GitHub notice: This file contains bidirectional Unicode text that may be
# interpreted or compiled differently than what appears below. To review, open
# the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters.
# Full Python code to perform clustering and calculate statistics
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data' is the original DataFrame and 'features' are the feature
# columns; both must be defined before this script runs.

# Scaling the data: standardize each feature so no single column dominates
# the distance computations used by PCA and KMeans.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Performing PCA. No n_components is given, so every component is kept.
pca = PCA()
principal_components = pca.fit_transform(scaled_features)

# Performing KMeans clustering with 5 clusters.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in 1.4, so leaving it implicit gives version-dependent clusterings
# (and a FutureWarning on 1.2/1.3).
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0).fit(principal_components)
labels = kmeans.labels_

# Adding the cluster labels to a copy so the original DataFrame is untouched.
data_with_clusters = data.copy()
data_with_clusters['Cluster'] = labels

# Group once and reuse the grouped object for both statistics below.
grouped_by_cluster = data_with_clusters.groupby('Cluster')

# Calculating mean values of each feature for each cluster.
# numeric_only=True keeps this working on pandas >= 2.0 when 'data' carries
# non-numeric columns (which would otherwise raise TypeError).
cluster_means = grouped_by_cluster.mean(numeric_only=True)

# Calculating the percentage of diabetes cases in each cluster.
cluster_diabetes_percentage = grouped_by_cluster['Diabetes_binary'].mean() * 100

# Adding the diabetes rate to the cluster means DataFrame; the Series is
# aligned on the 'Cluster' index.
cluster_means_with_diabetes_rate = cluster_means.copy()
cluster_means_with_diabetes_rate['Diabetes Rate'] = cluster_diabetes_percentage

# The groupby index is already named 'Cluster'; turn it into a regular column.
cluster_means_with_diabetes_rate = cluster_means_with_diabetes_rate.reset_index()

# The final DataFrame (a bare expression: displayed when run as the last line
# of a notebook cell, a no-op when run as a plain script).
cluster_means_with_diabetes_rate
# Sign up for free to join this conversation on GitHub. Already have an
# account? Sign in to comment.