Skip to content

Instantly share code, notes, and snippets.

@lmyyao
Last active August 24, 2023 08:20
Show Gist options
  • Save lmyyao/cda93ca77eb537aef8581d4f7f0f2a82 to your computer and use it in GitHub Desktop.
Save lmyyao/cda93ca77eb537aef8581d4f7f0f2a82 to your computer and use it in GitHub Desktop.
# Simple k-means demo.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Dataset: https://drive.google.com/file/d/1ZzEouo7lRJvajxK6jLM2K_p9xAwGw1tS/view
df = pd.read_csv("clustering.csv")
k = 3  # number of clusters
epoch = 10  # number of assign/update rounds to run
# Take an explicit copy: a cluster-label column "c" is written into X later,
# and assigning into a column slice of df triggers SettingWithCopyWarning.
X = df[["LoanAmount", "ApplicantIncome"]].copy()
# Initial centres: k rows drawn at random (unseeded, so each run may differ).
samples = X.sample(k)
def L2_dis(P1, P2):
    """Return the Euclidean (L2) distance between two loan records.

    Only the ``LoanAmount`` and ``ApplicantIncome`` attributes of each
    argument are read. Either argument may be a single row (scalar
    attributes) or a whole table (column attributes); the subtraction
    broadcasts, so a table and a row give one distance per table row.
    """
    d_loan = P1.LoanAmount - P2.LoanAmount
    d_income = P1.ApplicantIncome - P2.ApplicantIncome
    # Element-wise sqrt(d_loan**2 + d_income**2).
    return np.hypot(d_loan, d_income)
# k-means iteration: assign every row to its nearest centre, then move each
# centre to the mean of the rows assigned to it.
features = ["LoanAmount", "ApplicantIncome"]
# Re-index the centres 0..k-1 so their labels line up with the argmin results
# below; cast to float so old and new centres can be compared exactly.
centers = samples[features].astype(float).reset_index(drop=True).rename_axis("c")
for _ in range(epoch):
    # One distance column per centre, computed for all rows at once instead of
    # calling L2_dis separately for every (row, centre) pair.
    dists = np.column_stack([L2_dis(X, c) for _, c in centers.iterrows()])
    X["c"] = dists.argmin(axis=1)
    means = X.groupby("c")[features].mean()
    # A centre that attracted no rows is missing from `means`; keep its previous
    # position rather than ending up with fewer than k centres.
    updated = means.reindex(centers.index).fillna(centers)
    converged = updated.equals(centers)
    centers = updated
    if converged:
        # Centres did not move, so further epochs would change nothing.
        break
samples = centers
print(samples)
plt.scatter(X["ApplicantIncome"], X["LoanAmount"], c='black')
plt.scatter(samples["ApplicantIncome"], samples["LoanAmount"], c='red')
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment