Skip to content

Instantly share code, notes, and snippets.

@fmarthoz
Last active July 7, 2021 05:00
Show Gist options
  • Save fmarthoz/7c936a0dfeb51cbc47e8757fc27d5f38 to your computer and use it in GitHub Desktop.
Save fmarthoz/7c936a0dfeb51cbc47e8757fc27d5f38 to your computer and use it in GitHub Desktop.
For a Medium article
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import seaborn as sns
from sklearn.preprocessing import StandardScaler
# We define two helper functions: one returning the centroid of a set of points, and one returning the Euclidean distance between two points
def centroid(df):
    """Return the centroid of a set of points.

    Args:
        df: pandas DataFrame with one row per point and one column per
            coordinate.

    Returns:
        A 1-D numpy array containing the mean of each column of ``df``.
    """
    # DataFrame.mean() averages down each column, giving one value per
    # coordinate; to_numpy() drops the column labels.
    return df.mean().to_numpy()
def distance(p1, p2):
    """Return the Euclidean distance between two points.

    Args:
        p1: array-like of numeric coordinates (list, tuple, numpy array,
            pandas Series, ...).
        p2: array-like of numeric coordinates, same length as ``p1``.

    Returns:
        The square root of the sum of squared coordinate differences, as a
        numpy float.
    """
    # Cast to float first: with unsigned or narrow integer dtypes the
    # subtraction and the squaring would otherwise wrap around silently.
    diff = np.asarray(p1, dtype=float) - np.asarray(p2, dtype=float)
    return np.sqrt(np.sum(diff ** 2))
# STEP 0: Scale the data
# NOTE(review): X is not defined in this snippet; it is assumed to be an
# all-numeric pandas DataFrame created earlier - confirm against the caller.
# Every column is overwritten in place (the original hard-coded columns 0:2,
# which only worked when X had exactly two columns).
X.iloc[:, :] = StandardScaler().fit_transform(X)

# STEP 1: we choose a number k of clusters
k = 3

# STEP 2: Select k random points from the data as centroids
# The explicit copy gives the centroids their own storage, which avoids
# pandas' SettingWithCopyWarning when they are updated in place below.
centroids = X.sample(k).copy()

# Using a very simple stop criterion: a fixed number of iterations
n_iterations = 10  # previously named `iter`, which shadowed the builtin

# X does not change inside the loop, so extract its rows only once.
points = X.to_numpy()

for _ in range(n_iterations):
    # For each centroid we calculate the distance from every row to it:
    # d[i][j] is the distance between centroid i and row j.
    centers = centroids.to_numpy()
    d = np.asarray([[distance(p, c) for p in points] for c in centers])

    # We assign each point to the nearest centroid
    clusters = list(np.argmin(d, axis=0))

    # We recalculate the centroids
    # A numpy boolean mask selects rows by position, so this works whatever
    # index X carries (a boolean pd.Series would be aligned on index labels).
    labels = np.asarray(clusters)
    for m in range(k):
        members = X[labels == m]
        # An empty cluster has no mean: it would come out as NaN, and
        # np.argmin would then pick that NaN centroid for every point.
        # Leave such a centroid where it was instead.
        if len(members) > 0:
            centroids.iloc[m, :] = centroid(members)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment