# spyhi/BCclusterAnalysis.py

## BCclusterAnalysis.py
import pandas as pd #Using Pandas for DataFrame
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D #Create 3D plot
from sklearn.cluster import KMeans #Import learning algorithm

# Simple KMeans cluster analysis on breast cancer data using Python, SKLearn, Numpy, and Pandas
# Created for ICS 491 (Big Data) at University of Hawaii at Manoa, Fall 2017
# Questions? Tweet me at https://twitter.com/spyhi

# Load the Wisconsin breast cancer data with Pandas. The path is relative, so
# the CSV has to sit in the directory the script is run from.
# Total about 570 samples.
# Data source (Kaggle):
# https://www.kaggle.com/uciml/breast-cancer-wisconsin-data
data = pd.read_csv('breast_cancer_data.csv')

# Wider candidate feature set that was originally planned. It is kept for
# reference only; nothing below reads it. The third entry used to repeat
# "texture_mean"; per the Kaggle column list the column in that position is
# "perimeter_mean".
feat_cols = [
    "radius_mean", "texture_mean", "perimeter_mean", "area_mean",
    "smoothness_mean", "compactness_mean", "concavity_mean",
    "symmetry_mean", "fractal_dimension_mean",
]

# Features actually used, chosen from prior knowledge of skin cancer diagnosis
# factors. Exactly three columns, so the samples can be drawn in a 3D plot.
feat_cols_sm = ["radius_mean", "concavity_mean", "symmetry_mean"]

# Feature matrix: one row per sample, one column per entry of feat_cols_sm.
features = np.array(data[feat_cols_sm])

# Set up k-means with two clusters, hoping they line up with malignant vs
# benign. n_init is pinned because its default differs between scikit-learn
# releases, and random_state is fixed so that repeated runs produce the same
# centroids and the same cluster numbering.
clusters = KMeans(n_clusters=2, max_iter=300, n_init=10, random_state=0)

# Fit model to our selected features.
clusters.fit(features)

# Keep the fitted cluster centres and the per-sample cluster index (0 or 1).
centroids = clusters.cluster_centers_
labels = clusters.labels_

# Sanity check
print(centroids)

# Create a new Matplotlib figure holding a single 3D subplot.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# Matplotlib colour codes: "r" (red) for cluster 0, "b" (blue) for cluster 1.
colors = ["r", "b"]

# Draw all samples with a single scatter call, coloured by cluster label.
# Calling scatter once per sample would create a separate artist for each of
# the several hundred points and makes the figure slow to build and rotate.
point_colors = [colors[label] for label in labels]
ax.scatter(xs=features[:, 0], ys=features[:, 1], zs=features[:, 2],
           c=point_colors, zdir='z')

# Plot the centroids as large cyan crosses, though you can't really see them.
ax.scatter(xs=centroids[:, 0], ys=centroids[:, 1], zs=centroids[:, 2],
           marker="x", s=150, c="c")

# Diagnosis column ("B" or "M" per sample), which should be the same length
# and row order as the cluster labels.
diagnosis = np.array(data['diagnosis'])
# Encode as integers: "M" -> 1, anything else (i.e. "B") -> 0.
diag = (diagnosis == "M").astype(int)

# K-means numbers its clusters arbitrarily, so cluster 1 is not guaranteed to
# be the malignant one. Count agreement for the direct mapping, derive the
# count for the swapped mapping, and keep whichever is larger; otherwise a
# good clustering could be reported as, say, 10% instead of 90%.
direct_matches = int(np.count_nonzero(diag == labels))
matches = max(direct_matches, len(diag) - direct_matches)

# Calculate percentage matches and print.
percentMatch = (matches/len(diag))*100
print("Percent matched between benign and malignant ", percentMatch)

# Name the three axes after the plotted features, then display the 3D scatter
# plot of the samples and their clusters.
ax.set(xlabel="Radius Mean", ylabel="Concavity Mean", zlabel="Symmetry Mean")
plt.show()

#Finis
	import pandas as pd #Using Pandas for DataFrame
	import numpy as np
	import matplotlib.pyplot as plt
	from mpl_toolkits.mplot3d import Axes3D #Create 3D plot
	from sklearn.cluster import KMeans #Import learning algorithm

	# Simple KMeans cluster analysis on breast cancer data using Python, SKLearn, Numpy, and Pandas
	# Created for ICS 491 (Big Data) at University of Hawaii at Manoa, Fall 2017
	# Questions? Tweet me at https://twitter.com/spyhi

	# Load the Wisconsin breast cancer data with Pandas. The path is relative, so
	# the CSV has to sit in the directory the script is run from.
	# Total about 570 samples.
	# Data source (Kaggle):
	# https://www.kaggle.com/uciml/breast-cancer-wisconsin-data
	data = pd.read_csv('breast_cancer_data.csv')

	# Wider candidate feature set that was originally planned. It is kept for
	# reference only; nothing below reads it. The third entry used to repeat
	# "texture_mean"; per the Kaggle column list the column in that position is
	# "perimeter_mean".
	feat_cols = [
	    "radius_mean", "texture_mean", "perimeter_mean", "area_mean",
	    "smoothness_mean", "compactness_mean", "concavity_mean",
	    "symmetry_mean", "fractal_dimension_mean",
	]

	# Features actually used, chosen from prior knowledge of skin cancer diagnosis
	# factors. Exactly three columns, so the samples can be drawn in a 3D plot.
	feat_cols_sm = ["radius_mean", "concavity_mean", "symmetry_mean"]

	# Feature matrix: one row per sample, one column per entry of feat_cols_sm.
	features = np.array(data[feat_cols_sm])

	# Set up k-means with two clusters, hoping they line up with malignant vs
	# benign. n_init is pinned because its default differs between scikit-learn
	# releases, and random_state is fixed so that repeated runs produce the same
	# centroids and the same cluster numbering.
	clusters = KMeans(n_clusters=2, max_iter=300, n_init=10, random_state=0)

	# Fit model to our selected features.
	clusters.fit(features)

	# Keep the fitted cluster centres and the per-sample cluster index (0 or 1).
	centroids = clusters.cluster_centers_
	labels = clusters.labels_

	# Sanity check
	print(centroids)

	# Create a new Matplotlib figure holding a single 3D subplot.
	fig = plt.figure()
	ax = fig.add_subplot(111, projection='3d')
	# Matplotlib colour codes: "r" (red) for cluster 0, "b" (blue) for cluster 1.
	colors = ["r", "b"]

	# Draw all samples with a single scatter call, coloured by cluster label.
	# This replaces a per-sample loop whose body had lost its indentation, and
	# avoids creating a separate artist for each of the several hundred points.
	point_colors = [colors[label] for label in labels]
	ax.scatter(xs=features[:, 0], ys=features[:, 1], zs=features[:, 2],
	           c=point_colors, zdir='z')

	# Plot the centroids as large cyan crosses, though you can't really see them.
	ax.scatter(xs=centroids[:, 0], ys=centroids[:, 1], zs=centroids[:, 2],
	           marker="x", s=150, c="c")

	# Diagnosis column ("B" or "M" per sample), which should be the same length
	# and row order as the cluster labels.
	diagnosis = np.array(data['diagnosis'])
	# Encode as integers: "M" -> 1, anything else (i.e. "B") -> 0.
	diag = (diagnosis == "M").astype(int)

	# K-means numbers its clusters arbitrarily, so cluster 1 is not guaranteed to
	# be the malignant one. Count agreement for the direct mapping, derive the
	# count for the swapped mapping, and keep whichever is larger; otherwise a
	# good clustering could be reported as, say, 10% instead of 90%.
	direct_matches = int(np.count_nonzero(diag == labels))
	matches = max(direct_matches, len(diag) - direct_matches)

	# Calculate percentage matches and print.
	percentMatch = (matches/len(diag))*100
	print("Percent matched between benign and malignant ", percentMatch)

	# Name the three axes after the plotted features, then display the 3D scatter
	# plot of the samples and their clusters.
	ax.set(xlabel="Radius Mean", ylabel="Concavity Mean", zlabel="Symmetry Mean")
	plt.show()

	#Finis