Skip to content

Instantly share code, notes, and snippets.

@spyhi
Created October 24, 2017 05:49
Show Gist options
  • Save spyhi/ec8e60419d90aefc8537eb557ef35826 to your computer and use it in GitHub Desktop.
Save spyhi/ec8e60419d90aefc8537eb557ef35826 to your computer and use it in GitHub Desktop.
Python SKLearn KMeans Cluster Analysis on UW Breast Cancer Data
import pandas as pd #Using Pandas for DataFrame
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D #Create 3D plot
from sklearn.cluster import KMeans #Import learning algorithm
# Simple KMeans cluster analysis on breast cancer data using Python, SKLearn, Numpy, and Pandas
# Created for ICS 491 (Big Data) at University of Hawaii at Manoa, Fall 2017
# Questions? Tweet me at https://twitter.com/spyhi
# Load the breast cancer data with Pandas. The relative path is resolved against
# the current working directory, so run the script from the folder holding the CSV.
# Total about 570 samples.
# I got my data from Kaggle at the following URL:
# https://www.kaggle.com/uciml/breast-cancer-wisconsin-data
data = pd.read_csv('breast_cancer_data.csv')
# Wider candidate feature set. It is not used below: the small set gave results
# the author was happy with and is easy to visualize in 3D.
# Each column is listed exactly once; a repeated name would duplicate (and so
# double-weight) that column if this list were ever used to select features.
feat_cols = ["radius_mean", "texture_mean", "perimeter_mean", "area_mean",
             "smoothness_mean", "compactness_mean", "concavity_mean",
             "symmetry_mean", "fractal_dimension_mean"]
# Actual features used, based on previous knowledge of skin cancer diagnosis factors
feat_cols_sm = ["radius_mean", "concavity_mean", "symmetry_mean"]
# Select the chosen columns from the DataFrame and convert them to a Numpy array
# with one row per sample and one column per feature.
features = np.array(data[feat_cols_sm])
# Initialize the KMeans cluster module. Setting it to find two clusters, hoping to
# find malignant vs benign.
# random_state pins the centroid initialization so repeated runs give the same
# clusters and the same label ids; n_init is stated explicitly because the
# library default has changed between scikit-learn releases.
clusters = KMeans(n_clusters=2, max_iter=300, n_init=10, random_state=0)
# Fit model to our selected features.
clusters.fit(features)
# Put centroids and per-sample cluster ids into variables.
centroids = clusters.cluster_centers_
labels = clusters.labels_
# Sanity check
print(centroids)
# Create new MatPlotLib figure
fig = plt.figure()
# Add 3rd dimension to figure
ax = fig.add_subplot(111, projection='3d')
# This means "red" and "blue"
colors = ["r", "b"]
# Map every sample's cluster id to its color, then draw all samples with a single
# scatter call instead of one call (and one artist) per sample.
point_colors = [colors[label] for label in labels]
ax.scatter(xs=features[:, 0], ys=features[:, 1], zs=features[:, 2],
           c=point_colors, zdir='z')
# Plot centroids, though you can't really see them.
ax.scatter(xs=centroids[:, 0], ys=centroids[:, 1], zs=centroids[:, 2],
           marker="x", s=150, c="c")
# Build the ground-truth vector from the diagnosis column, which should be the
# same length as labels. "B" becomes 0 and "M" becomes 1; any other value is
# kept as-is so it can never count as a match.
diagnosis_codes = {"B": 0, "M": 1}
diag = np.array([diagnosis_codes.get(value, value) for value in data['diagnosis']],
                dtype=object)
# KMeans numbers its clusters arbitrarily, so cluster 0 is not guaranteed to be
# the benign one. Count agreement under both possible assignments and keep the
# better one; otherwise the score could come out as 100 minus the real accuracy.
direct = sum(1 for truth, label in zip(diag, labels) if truth == label)
flipped = sum(1 for truth, label in zip(diag, labels) if truth == 1 - label)
matches = max(direct, flipped)
# Calculate percentage matches and print (0.0 when there are no samples at all).
percentMatch = (matches / len(diag)) * 100 if len(diag) else 0.0
print("Percent matched between benign and malignant ", percentMatch)
# Name each axis after the feature plotted on it, then open the 3D scatter plot
# so the data and clusters can be inspected.
for set_axis_label, axis_text in (
    (ax.set_xlabel, "Radius Mean"),
    (ax.set_ylabel, "Concavity Mean"),
    (ax.set_zlabel, "Symmetry Mean"),
):
    set_axis_label(axis_text)
plt.show()
#Finis
@up2612
Copy link

up2612 commented Apr 22, 2020

Sir, your program's accuracy looks good, but the plot is not being displayed on the console. Any suggestions?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment