Skip to content

Instantly share code, notes, and snippets.

@ayoungprogrammer
Last active August 29, 2015 14:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ayoungprogrammer/279182b8b69485ef6e6d to your computer and use it in GitHub Desktop.
Save ayoungprogrammer/279182b8b69485ef6e6d to your computer and use it in GitHub Desktop.
%matplotlib inline
import csv
import numpy as np
import scipy.cluster.vq as vq
import matplotlib.pyplot as plt
# Load the tab-separated measurements, keeping only complete rows.
# The file is opened in a `with` block so the handle is always closed, and
# with newline='' as the csv module recommends. The former 'rU' mode is
# rejected by open() on Python 3.11+, where the 'U' flag was removed.
with open('seeds_dataset.txt', newline='') as seeds_file:
    bank_csv = csv.reader(seeds_file, delimiter="\t")
    data = np.array([
        # Convert each cell to float
        [float(cell) for cell in row]
        for row in bank_csv
        # Skip blank lines and any row that has an empty (missing) cell, so
        # every retained row has the same number of columns.
        if row and all(row)
    ])

# Normalize vectors: whiten() rescales each column to unit variance so no
# single column dominates the distances used by k-means.
whitened = vq.whiten(data)

# Perform k-means on all columns to classify into 3 groups
# (the second return value, the distortion, is not needed here).
centroids, _ = vq.kmeans(whitened, 3)

# Classify data by distance to centroids: cls[i] is the index of the
# centroid closest to observation i.
cls, _ = vq.vq(whitened, centroids)

# Plot first two features (area vs perimeter in this case) using the
# original, unscaled values, one colour per cluster.
plt.plot(data[cls == 0, 0], data[cls == 0, 1], 'ob',
         data[cls == 1, 0], data[cls == 1, 1], 'or',
         data[cls == 2, 0], data[cls == 2, 1], 'og')
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment