# coding: utf-8
# # Breast Cancer Diagnosis
#
# ### Classify cells as benign or malignant based on cell nucleus features
# In[1]:
get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
import pydotplus
from IPython.display import Image
from sklearn import tree
from sklearn.model_selection import StratifiedKFold
# GridSearchCV moved from the removed sklearn.grid_search module to sklearn.model_selection
from sklearn.model_selection import GridSearchCV
# In[2]:
data = pd.read_csv('data/data.csv', na_values=['NA'])
# # Preview data
# In[3]:
data.head()
# In[4]:
data = data.drop(["id", "Unnamed: 32"], axis=1)
data.describe()
# In[5]:
data.columns
# ### Select "mean" measures as the features
# In[6]:
features = list(data.columns[1:11])
print("{} features:".format(len(features)))
print(features)
# # Explore features
# In[7]:
# Show pairwise scatterplots of the ten mean features, colored by diagnosis
sb.pairplot(data, vars=features, hue="diagnosis", height=4)
# ### There appear to be three features that are strongly correlated
# In[8]:
area_features = ['radius_mean', 'perimeter_mean', 'area_mean']
sb.pairplot(data, vars=area_features, hue="diagnosis")
# #### The perimeter and area are both functions of the radius
# #### Specifically,
# - perimeter ~ radius_mean
# - area ~ radius_mean^2
#
# #### This is as expected, so let's carry on
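# #### A quick ratio check makes the geometry concrete: for a roughly circular nucleus, perimeter/radius should sit near 2*pi and area/radius^2 near pi (a minimal sanity check; the expected constants assume near-circular cross-sections):
# In[ ]:
# Mean ratios across all samples; values near 6.28 and 3.14 confirm the geometry
print((data['perimeter_mean'] / data['radius_mean']).mean())
print((data['area_mean'] / data['radius_mean'] ** 2).mean())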
# In[10]:
m = data[data["diagnosis"] == "M"]
b = data[data["diagnosis"] == "B"]
# In[11]:
fig, ax = plt.subplots(5, 2, figsize=(8,12))
axes = ax.ravel()
for i, feature in enumerate(features):
    # distplot was removed from recent seaborn releases; histplot with a KDE overlay is the equivalent
    sb.histplot(m[feature], label="malignant", ax=axes[i], color='r', stat="density", kde=True)
    sb.histplot(b[feature], label="benign", ax=axes[i], stat="density", kde=True)
    axes[i].legend()
plt.tight_layout()
# #### The features appear roughly normally distributed, with visible differences between the two diagnosis classes, a good sign for classification!
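# #### To put a number on that visual impression, a Shapiro-Wilk test per feature and class is one quick check (a minimal sketch using scipy.stats.shapiro; low p-values would argue against strict normality):
# In[ ]:
from scipy import stats
for feature in features[:3]:  # first three features shown as an illustration
    _, p_m = stats.shapiro(m[feature])
    _, p_b = stats.shapiro(b[feature])
    print("{}: p(malignant)={:.3f}, p(benign)={:.3f}".format(feature, p_m, p_b))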
# # Parameter optimization and cross-validation
# In[12]:
inputs = data[features].values
diagnoses = data["diagnosis"].values
# #### This function takes a model and a parameter grid and returns the best estimator found with 10-fold cross-validation
# In[16]:
def optimize_model(model, param_grid):
    grid_search = GridSearchCV(model,
                               param_grid=param_grid,
                               cv=10)
    grid_search.fit(inputs, diagnoses)
    print("Best Score: {}".format(grid_search.best_score_))
    print("Best params: {}".format(grid_search.best_params_))
    return grid_search.best_estimator_
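# #### Note that best_score_ is a cross-validated score over the search data, not a held-out test score. A held-out evaluation could look like this minimal sketch (the 80/20 stratified split and the LogisticRegression choice are illustrative assumptions):
# In[ ]:
X_train, X_test, y_train, y_test = train_test_split(
    inputs, diagnoses, test_size=0.2, stratify=diagnoses, random_state=0)
clf = LogisticRegression()        # any classifier from this notebook would do
clf.fit(X_train, y_train)         # fit on the training split only
print(clf.score(X_test, y_test))  # accuracy on data the model never saw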
# # Logistic Regression
# In[17]:
results = {}
# In[18]:
parameter_grid = {'C': np.logspace(1, 5, 50)}
lr = optimize_model(LogisticRegression(), parameter_grid)
# In[19]:
results["Logistic Regression"] = 0.9279
# # Support Vector Machine
# In[193]:
parameter_grid = {'C': np.logspace(-5, 5, 10),
                  'kernel': ["rbf", "linear"]}
svc = optimize_model(SVC(), parameter_grid)
# In[20]:
results["SVM"] = 0.92091
# # Decision Trees
# In[212]:
parameter_grid = {'max_depth': list(range(1, 30)),
                  'max_features': list(range(1, 11))}
dtc = optimize_model(tree.DecisionTreeClassifier(), parameter_grid)
# In[21]:
results["Decision Tree"] = 0.931458
# ### Visualize decision tree
# In[240]:
with open("dt.dot", 'w') as f:
    tree.export_graphviz(dtc, out_file=f)
dot_data = tree.export_graphviz(dtc, out_file=None,
                                feature_names=features,
                                class_names=["B", "M"],
                                filled=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
# # Random Forest
# In[217]:
parameter_grid = {'n_estimators': [20, 25, 30],
                  # max_depth must be an integer, so cast and deduplicate the logspace values
                  'max_depth': sorted(set(int(d) for d in np.logspace(0, 2, 10))),
                  'max_features': list(range(1, 11)),
                  'warm_start': [True, False]}
rf = optimize_model(RandomForestClassifier(), parameter_grid)
# In[22]:
results["Random Forest"] = 0.954305
# # Summary
# In[23]:
results
# In[24]:
ax = sb.barplot(x=list(results.keys()), y=list(results.values()))
ax.set(xlabel='Classifier', ylabel='Accuracy',
       title="Cross-validated accuracy of optimized classifiers", ylim=(0.8, 1))
plt.show()
# # Conclusion
# ### Based on this broad comparison of classifiers, I find that the Random Forest performs best, with cross-validated accuracy of ~95%. More data will be required to validate the robustness of the model.
# # Looking ahead
# - Obtain/mine a larger dataset; more data generally means a better model
# - Try neural networks and deep learning algorithms (a minimal sketch follows below)
# - Build a semi-automated image-analysis tool that extracts these features from cell images, diagnoses the cells, and adds them to our dataset
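# #### As a first step toward the neural-network idea, scikit-learn's MLPClassifier plugs straight into optimize_model above (a minimal sketch; the hidden-layer sizes and alpha grid are illustrative assumptions, not tuned values):
# In[ ]:
from sklearn.neural_network import MLPClassifier
parameter_grid = {'hidden_layer_sizes': [(10,), (30,), (30, 10)],
                  'alpha': [1e-4, 1e-3, 1e-2]}
mlp = optimize_model(MLPClassifier(max_iter=2000), parameter_grid)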