Skip to content

Instantly share code, notes, and snippets.

@BioSciEconomist
Created March 17, 2021 18:54
Show Gist options
  • Save BioSciEconomist/6a1b642a5250461fe6bc4a75c1dc8efd to your computer and use it in GitHub Desktop.
Save BioSciEconomist/6a1b642a5250461fe6bc4a75c1dc8efd to your computer and use it in GitHub Desktop.
Example k-means clustering
## *-----------------------------------------------------------------
# | PROGRAM NAME: ex kmeans.py
# | DATE: 3/17/21
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: example code based on: https://realpython.com/k-means-clustering-python/
# *----------------------------------------------------------------
import pandas as pd  # required by pd.read_csv below

# Load the data set and drop the "code" column.
df = pd.read_csv('/Users/mattbogard/Google Drive/Python Scripts/german_reunification.csv')
df = df.drop(columns="code")  # `columns=` already selects the axis
# Interactive inspection of the raw data (these expressions display output
# only in a REPL / notebook session).
df.head()
df.tail()
df.columns
df.describe()
# reduce to just numerics
df1 = df[["gdp", "infrate", "trade", "year"]]
# drop NAs: remove every row that has a missing value in any kept column
df1 = df1.dropna(axis=0, how='any')
df1.isnull().sum()  # total missing per column
df1.shape
# get only latest year for clustering (i.e. 1992)
# .copy() makes df1 an independent frame, so adding columns to it later does
# not trigger pandas' SettingWithCopyWarning.
df1 = df1[df1.year == 1992].copy()
df1.head()
# Import KMeans
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Standardize only the indicators used for clustering. `year` is left out:
# df1 has been filtered to a single year, so that column is constant and
# contributes nothing to the distances k-means works with.
feature_columns = ["gdp", "infrate", "trade"]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df1[feature_columns])
#-----------------------------
# find optimal number of clusters
#-----------------------------
# Settings shared by every KMeans fit in the search below.
kmeans_kwargs = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 40,
}

# Fit one model for each candidate k in 1..10 and record its inertia_;
# the resulting list is what the elbow plot is drawn from.
sse = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(scaled_features)
    sse.append(kmeans.inertia_)
import matplotlib.pyplot as plt

# Elbow plot: one SSE value per candidate cluster count (1..10).
cluster_counts = range(1, 11)

plt.style.use("fivethirtyeight")
plt.plot(cluster_counts, sse)
plt.xticks(cluster_counts)  # one tick per candidate k
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()
# looks like 5 clusters at most
kmeans = KMeans(init="random", n_clusters=5, n_init=10, max_iter=300, random_state=42)
kmeans.fit(scaled_features)
# labels_ holds one cluster index per row of scaled_features
labels = kmeans.labels_
# add cluster ID to data frame
# (copy first: df1 is the result of filtering another frame, and assigning a
# column to such a result can raise pandas' SettingWithCopyWarning)
df1 = df1.copy()
df1['cluster'] = labels
df1.head()
df1.describe()
df1.shape
# descriptives: number of rows per cluster, then mean infrate per cluster
df1.groupby(['cluster']).size().reset_index(name='count')
df1.groupby('cluster')['infrate'].mean()
# Create a KMeans instance with 3 clusters: model
# n_init and random_state are given explicitly so repeated runs produce the
# same labels.
model = KMeans(n_clusters=3, n_init=10, random_state=42)
# `points` / `new_points` are not defined anywhere else in this script; bind
# them to the scaled data so the example runs.
# NOTE(review): predicting on the training rows is a placeholder - replace
# `new_points` with genuinely new, identically scaled observations if needed.
points = scaled_features
new_points = scaled_features
# Fit model to points
model.fit(points)
# Determine the cluster labels of new_points: labels
labels = model.predict(new_points)
# Print cluster labels of new_points
print(labels)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment