Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
from numpy import genfromtxt
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
## Data preprocessing
# Load csv dataset into a numpy array
X = genfromtxt('/path/to/dataset', delimiter=',')
# Use MinMaxScaler to scale the data, so that no single feature dominates
# the distance computations performed by k-Means
sc = MinMaxScaler()
X = sc.fit_transform(X)

## Validation
# Candidate cluster counts (2 through 14). Defined once so the fitting loop
# and the x-axis of the plot below can never get out of sync.
k_values = range(2, 15)
squared_distances = []
for k in k_values:
    # Initialize and fit k-Means to our dataset
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X)
    # Keep the inertia of current model; only this scalar is retained,
    # the fitted model itself is overwritten on the next iteration
    squared_distances.append(kmeans.inertia_)

# Use matplotlib to plot inertia against k (done once, after all models
# have been fitted)
fig = plt.figure(figsize=(15, 5), edgecolor='red')
plt.plot(k_values, squared_distances)
plt.title('Elbow curve')
plt.show()

## Training
## Although we've trained models for values in k 2 through 14 for validation,
## we didn't save them due to memory concerns. Thus:
# Fit a 5-means model to our dataset.
# NOTE(review): k=5 is presumably the elbow read off the curve above for the
# author's dataset -- re-check it against the plot for any other dataset.
kmeans = KMeans(n_clusters=5, random_state=42)
# pred holds the value returned by fit_predict on X
pred = kmeans.fit_predict(X)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment