# coding: utf-8
# # Breast Cancer Diagnosis
#
# ### Classify cells as benign or malignant based on cell nucleus features
# In[1]:
get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
import pydotplus
from IPython.display import Image
from sklearn import tree
from sklearn.model_selection import StratifiedKFold
# GridSearchCV moved from the removed sklearn.grid_search module to sklearn.model_selection
from sklearn.model_selection import GridSearchCV
# In[2]:
data = pd.read_csv('data/data.csv', na_values=['NA'])
# # Preview data
# In[3]:
data.head()
# In[4]:
data = data.drop(["id", "Unnamed: 32"], axis=1)
data.describe()
# In[5]:
data.columns
# ### Select "mean" measures as the features
# In[6]:
features = list(data.columns[1:11])
print("{} features:".format(len(features)))
print(features)
# # Explore features
# In[7]:
# Show pairwise scatterplots of the ten mean features, colored by diagnosis
sb.pairplot(data, vars=features, hue="diagnosis", height=4)
# ### There appear to be three features that are strongly correlated
# In[8]:
area_features = ['radius_mean', 'perimeter_mean', 'area_mean']
sb.pairplot(data, vars=area_features, hue="diagnosis")
# #### The perimeter and area are both functions of the radius
# #### Specifically,
# - perimeter ~ radius_mean
# - area ~ radius_mean^2
#
# #### This is as expected, so let's carry on
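# #### A quick ratio check makes the geometry concrete: for a roughly circular nucleus, perimeter/radius should sit near 2*pi and area/radius^2 near pi (a minimal sanity check; the expected constants assume near-circular cross-sections):
# In[ ]:
# Mean ratios across all samples; values near 6.28 and 3.14 confirm the geometry
print((data['perimeter_mean'] / data['radius_mean']).mean())
print((data['area_mean'] / data['radius_mean'] ** 2).mean())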
# In[10]:
m = data[data["diagnosis"] == "M"]
b = data[data["diagnosis"] == "B"]
# In[11]:
fig, ax = plt.subplots(5, 2, figsize=(8,12))
axes = ax.ravel()
for i, feature in enumerate(features):
    # distplot was removed from recent seaborn releases; histplot with a KDE overlay is the equivalent
    sb.histplot(m[feature], label="malignant", ax=axes[i], color='r', stat="density", kde=True)
    sb.histplot(b[feature], label="benign", ax=axes[i], stat="density", kde=True)
    axes[i].legend()
plt.tight_layout()
# #### The features appear roughly normally distributed, with visible differences between the two diagnosis classes, a good sign for classification!
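# #### To put a number on that visual impression, a Shapiro-Wilk test per feature and class is one quick check (a minimal sketch using scipy.stats.shapiro; low p-values would argue against strict normality):
# In[ ]:
from scipy import stats
for feature in features[:3]:  # first three features shown as an illustration
    _, p_m = stats.shapiro(m[feature])
    _, p_b = stats.shapiro(b[feature])
    print("{}: p(malignant)={:.3f}, p(benign)={:.3f}".format(feature, p_m, p_b))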
# # Parameter optimization and cross-validation
# In[12]:
inputs = data[features].values
diagnoses = data["diagnosis"].values
# #### This function takes a model and a parameter grid and returns the best estimator found with 10-fold cross-validation
# In[16]:
def optimize_model(model, param_grid):
    grid_search = GridSearchCV(model,
                               param_grid=param_grid,
                               cv=10)
    grid_search.fit(inputs, diagnoses)
    print("Best Score: {}".format(grid_search.best_score_))
    print("Best params: {}".format(grid_search.best_params_))
    return grid_search.best_estimator_
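# #### Note that best_score_ is a cross-validated score over the search data, not a held-out test score. A held-out evaluation could look like this minimal sketch (the 80/20 stratified split and the LogisticRegression choice are illustrative assumptions):
# In[ ]:
X_train, X_test, y_train, y_test = train_test_split(
    inputs, diagnoses, test_size=0.2, stratify=diagnoses, random_state=0)
clf = LogisticRegression()        # any classifier from this notebook would do
clf.fit(X_train, y_train)         # fit on the training split only
print(clf.score(X_test, y_test))  # accuracy on data the model never saw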
# # Logistic Regression
# In[17]:
results = {}
# In[18]:
parameter_grid = {'C': np.logspace(1, 5, 50)}
lr = optimize_model(LogisticRegression(), parameter_grid)
# In[19]:
results["Logistic Regression"] = 0.9279
# # Support Vector Machine
# In[193]:
parameter_grid = {'C': np.logspace(-5, 5, 10),
                  'kernel': ["rbf", "linear"]}
svc = optimize_model(SVC(), parameter_grid)
# In[20]:
results["SVM"] = 0.92091
# # Decision Trees
# In[212]:
parameter_grid = {'max_depth': list(range(1, 30)),
                  'max_features': list(range(1, 11))}
dtc = optimize_model(tree.DecisionTreeClassifier(), parameter_grid)
# In[21]:
results["Decision Tree"] = 0.931458
# ### Visualize decision tree
# In[240]:
with open("dt.dot", 'w') as f:
    tree.export_graphviz(dtc, out_file=f)
dot_data = tree.export_graphviz(dtc, out_file=None,
                                feature_names=features,
                                class_names=["B", "M"],
                                filled=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
# # Random Forest
# In[217]:
parameter_grid = {'n_estimators': [20, 25, 30],
                  # max_depth must be an integer, so cast and deduplicate the logspace values
                  'max_depth': sorted(set(int(d) for d in np.logspace(0, 2, 10))),
                  'max_features': list(range(1, 11)),
                  'warm_start': [True, False]}
rf = optimize_model(RandomForestClassifier(), parameter_grid)
# In[22]:
results["Random Forest"] = 0.954305
# # Summary
# In[23]:
results
# In[24]:
ax = sb.barplot(x=list(results.keys()), y=list(results.values()))
ax.set(xlabel='Classifier', ylabel='Accuracy',
       title="Cross-validated accuracy of optimized classifiers", ylim=(0.8, 1))
plt.show()
# # Conclusion
# ### Based on this broad comparison of classifiers, I find that the Random Forest performs best, with cross-validated accuracy of ~95%. More data will be required to validate the robustness of the model.
# # Looking ahead
# - Obtain/mine a larger dataset; more data generally means a better model
# - Try neural networks and deep learning algorithms (a minimal sketch follows below)
# - Build a semi-automated image-analysis tool that extracts these features from cell images, diagnoses the cells, and adds them to our dataset
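# #### As a first step toward the neural-network idea, scikit-learn's MLPClassifier plugs straight into optimize_model above (a minimal sketch; the hidden-layer sizes and alpha grid are illustrative assumptions, not tuned values):
# In[ ]:
from sklearn.neural_network import MLPClassifier
parameter_grid = {'hidden_layer_sizes': [(10,), (30,), (30, 10)],
                  'alpha': [1e-4, 1e-3, 1e-2]}
mlp = optimize_model(MLPClassifier(max_iter=2000), parameter_grid)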