# Created April 1, 2018 14:18
# Major Project B.Tech
# coding: utf-8
# # Breast Cancer Diagnosis
# ### Classify cells as benign or malignant based on cell nucleus features
# Ask IPython to render matplotlib figures inline via the 'matplotlib' line magic.
# NOTE(review): get_ipython is not imported in this file, so this line presumably
# only works when the script is executed inside an IPython/Jupyter session.
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydotplus
import seaborn as sb
from IPython.display import Image
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC

# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module no longer exists in current scikit-learn releases, so it is only
# kept as a fallback for legacy installs.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV
# Load the dataset; the literal string 'NA' is parsed as a missing value.
data = pd.read_csv('data/data.csv', na_values=['NA'])
# # Preview data
# Drop the "id" and "Unnamed: 32" columns. The axis is passed by keyword
# because recent pandas versions reject it as a positional argument.
data = data.drop(["id", "Unnamed: 32"], axis=1)
# ### Select "mean" measures as the features
# Columns 1..10 of the remaining frame are used as features
# (column 0 is presumably the diagnosis label — verify against the CSV).
features = list(data.columns[1:11])
print("{} features:".format(len(features)))
# # Explore features
# Scatterplot matrix of every selected feature against every other one,
# coloured by diagnosis. `height` is the current seaborn name for the
# per-facet size argument (it was previously called `size`).
sb.pairplot(data, vars=features, hue="diagnosis", height=4)
# ### Three of the features look strongly correlated with each other
area_features = [
    'radius_mean',
    'perimeter_mean',
    'area_mean',
]
# Pairwise scatterplots restricted to those three columns, coloured by diagnosis.
sb.pairplot(data=data, hue="diagnosis", vars=area_features)
# #### Perimeter and area are both determined by the radius
# #### In particular,
# - perimeter ~ radius_mean
# - area ~ radius_mean^2
# #### The correlation is therefore expected, so we move on
# Split the rows by diagnosis so both groups can be overlaid for each feature.
m = data[data["diagnosis"] == "M"]
b = data[data["diagnosis"] == "B"]
# 5 x 2 grid of subplots, flattened so that feature i is drawn on axes[i].
fig, ax = plt.subplots(5, 2, figsize=(8,12))
axes = ax.ravel()
# Overlay the malignant (red) and benign distributions of every feature.
# The two distplot calls form the loop body and must be indented under `for`.
# NOTE(review): distplot is deprecated in recent seaborn releases — confirm the
# installed version before relying on it.
for i, feature in enumerate(features):
    sb.distplot(m[feature], label="malignant", ax=axes[i], color='r')
    sb.distplot(b[feature], label="benign", ax=axes[i])
# #### The data appears normally distributed, with visible differences between the distributions by diagnosis — a good sign for classification!
# # Parameter optimization and cross-validation
# Diagnosis labels and the selected feature columns, extracted as arrays.
diagnoses = data["diagnosis"].values
inputs = data[features].values
# #### This function takes in the model and parameter grid and returns the best model using 10-fold cross-validation
def optimize_model(model, param_grid):
    """Grid-search ``model`` over ``param_grid`` using 10-fold cross-validation.

    The search is fitted on the module-level ``inputs`` (feature matrix) and
    ``diagnoses`` (labels). The best cross-validated score and the parameter
    combination that produced it are printed.

    Args:
        model: Estimator instance to tune.
        param_grid: Mapping of parameter name to the values to try.

    Returns:
        The best estimator found by the search (``best_estimator_``).
    """
    grid_search = GridSearchCV(model, param_grid=param_grid, cv=10)
    grid_search.fit(inputs, diagnoses)
    print("Best Score: {}".format(grid_search.best_score_))
    print("Best params: {}".format(grid_search.best_params_))
    return grid_search.best_estimator_
# # Logistic Regression
# Accuracy per classifier, keyed by display name; the later sections add their own entries.
results = dict()
# Try 50 log-spaced values of C between 1e1 and 1e5.
parameter_grid = dict(C=np.logspace(1, 5, num=50))
lr = optimize_model(model=LogisticRegression(), param_grid=parameter_grid)
# NOTE(review): hard-coded value, presumably copied from an earlier run's printed best score — verify.
results.update({"Logistic Regression": 0.9279})
# # Support Vector Machine
# Try 10 log-spaced values of C between 1e-5 and 1e5 with both kernels.
parameter_grid = dict(
    C=np.logspace(-5, 5, num=10),
    kernel=["rbf", "linear"],
)
svc = optimize_model(model=SVC(), param_grid=parameter_grid)
# NOTE(review): hard-coded value, presumably copied from an earlier run's printed best score — verify.
results.update({"SVM": 0.92091})
# # Decision Trees
# Try every depth from 1 to 29 combined with every feature count from 1 to 10.
parameter_grid = dict(
    max_depth=list(range(1, 30)),
    max_features=list(range(1, 11)),
)
dtc = optimize_model(model=tree.DecisionTreeClassifier(), param_grid=parameter_grid)
# NOTE(review): hard-coded value, presumably copied from an earlier run's printed best score — verify.
results.update({"Decision Tree": 0.931458})
# ### Visualize decision tree
# Write the fitted tree to disk in Graphviz DOT format.
# NOTE(review): the original path was an empty string, which cannot be opened;
# "tree.dot" is a placeholder — confirm the intended output location.
with open("tree.dot", 'w') as f:
    # export_graphviz writes straight into the handle; its return value is not needed.
    tree.export_graphviz(dtc, out_file=f)
# Build the DOT source in memory (out_file=None returns it as a string),
# labelling the nodes with the feature column names, and parse it with pydotplus.
dot_data = tree.export_graphviz(dtc, out_file=None, feature_names=features)
graph = pydotplus.graph_from_dot_data(dot_data)
# # Random Forest
# max_depth has to be an integer, but np.logspace produces floats
# (e.g. 1.67, 2.78, ...). Truncate the 10 log-spaced values in [1, 100] to
# ints and drop the duplicates that truncation creates.
parameter_grid = {'n_estimators': [20, 25, 30],
                  'max_depth': np.unique(np.logspace(0, 2, 10).astype(int)).tolist(),
                  'max_features': list(range(1, 11)),
                  'warm_start': [True, False]}
rf = optimize_model(RandomForestClassifier(), parameter_grid)
# NOTE(review): hard-coded value, presumably copied from an earlier run's printed best score — verify.
results["Random Forest"] = 0.954305
# # Summary
# Bar chart of the recorded accuracy per classifier. x and y are passed by
# keyword because recent seaborn versions treat the first positional argument
# as `data` and reject positional x/y.
ax = sb.barplot(x=list(results.keys()), y=list(results.values()))
# The y-axis is limited to 0.8-1 so the differences between classifiers are visible.
ax.set(xlabel='Classifier', ylabel='Accuracy', title="Performance of optimized classifiers on testing set", ylim=(0.8,1))
# # Conclusion
# ### Based on this broad analysis of different classifiers, I find that the Random Forest classifier performs the best on the test data, with accuracy of ~95%. More test data will be required to validate the robustness of the model.
# # Looking ahead
# - Obtain/mine a larger dataset- more data > better model
# - Try neural networks and deep learning algorithms
# - Build a semi-automated image detection tool that can extract features from cell image to diagnose cells and add to our dataset
