{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## #Part I"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Python Coding and Data Set"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# read csv and add header\n",
"header_file = open('./bc/field_names.txt', 'r')\n",
"header_columns = [line.rstrip() for line in header_file]\n",
"bc_data = pd.read_csv(\"./bc/breast-cancer.csv\", sep=',', names = header_columns)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/spencerjames/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
]
}
],
"source": [
"# subset data and target columns\n",
"target = bc_data['diagnosis']\n",
"data = bc_data.loc[:, bc_data.columns != 'diagnosis']\n",
"# data munging\n",
"data.drop('ID', inplace=True, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>diagnosis</th>\n",
" <th>radius_mean</th>\n",
" <th>radius_sd_error</th>\n",
" <th>radius_worst</th>\n",
" <th>texture_mean</th>\n",
" <th>texture_sd_error</th>\n",
" <th>texture_worst</th>\n",
" <th>perimeter_mean</th>\n",
" <th>perimeter_sd_error</th>\n",
" <th>...</th>\n",
" <th>concavity_worst</th>\n",
" <th>concave_points_mean</th>\n",
" <th>concave_points_sd_error</th>\n",
" <th>concave_points_worst</th>\n",
" <th>symmetry_mean</th>\n",
" <th>symmetry_sd_error</th>\n",
" <th>symmetry_worst</th>\n",
" <th>fractal_dimension_mean</th>\n",
" <th>fractal_dimension_sd_error</th>\n",
" <th>fractal_dimension_worst</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>842302</td>\n",
" <td>M</td>\n",
" <td>17.99</td>\n",
" <td>10.38</td>\n",
" <td>122.80</td>\n",
" <td>1001.0</td>\n",
" <td>0.11840</td>\n",
" <td>0.27760</td>\n",
" <td>0.3001</td>\n",
" <td>0.14710</td>\n",
" <td>...</td>\n",
" <td>25.38</td>\n",
" <td>17.33</td>\n",
" <td>184.60</td>\n",
" <td>2019.0</td>\n",
" <td>0.1622</td>\n",
" <td>0.6656</td>\n",
" <td>0.7119</td>\n",
" <td>0.2654</td>\n",
" <td>0.4601</td>\n",
" <td>0.11890</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>842517</td>\n",
" <td>M</td>\n",
" <td>20.57</td>\n",
" <td>17.77</td>\n",
" <td>132.90</td>\n",
" <td>1326.0</td>\n",
" <td>0.08474</td>\n",
" <td>0.07864</td>\n",
" <td>0.0869</td>\n",
" <td>0.07017</td>\n",
" <td>...</td>\n",
" <td>24.99</td>\n",
" <td>23.41</td>\n",
" <td>158.80</td>\n",
" <td>1956.0</td>\n",
" <td>0.1238</td>\n",
" <td>0.1866</td>\n",
" <td>0.2416</td>\n",
" <td>0.1860</td>\n",
" <td>0.2750</td>\n",
" <td>0.08902</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>84300903</td>\n",
" <td>M</td>\n",
" <td>19.69</td>\n",
" <td>21.25</td>\n",
" <td>130.00</td>\n",
" <td>1203.0</td>\n",
" <td>0.10960</td>\n",
" <td>0.15990</td>\n",
" <td>0.1974</td>\n",
" <td>0.12790</td>\n",
" <td>...</td>\n",
" <td>23.57</td>\n",
" <td>25.53</td>\n",
" <td>152.50</td>\n",
" <td>1709.0</td>\n",
" <td>0.1444</td>\n",
" <td>0.4245</td>\n",
" <td>0.4504</td>\n",
" <td>0.2430</td>\n",
" <td>0.3613</td>\n",
" <td>0.08758</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>84348301</td>\n",
" <td>M</td>\n",
" <td>11.42</td>\n",
" <td>20.38</td>\n",
" <td>77.58</td>\n",
" <td>386.1</td>\n",
" <td>0.14250</td>\n",
" <td>0.28390</td>\n",
" <td>0.2414</td>\n",
" <td>0.10520</td>\n",
" <td>...</td>\n",
" <td>14.91</td>\n",
" <td>26.50</td>\n",
" <td>98.87</td>\n",
" <td>567.7</td>\n",
" <td>0.2098</td>\n",
" <td>0.8663</td>\n",
" <td>0.6869</td>\n",
" <td>0.2575</td>\n",
" <td>0.6638</td>\n",
" <td>0.17300</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>84358402</td>\n",
" <td>M</td>\n",
" <td>20.29</td>\n",
" <td>14.34</td>\n",
" <td>135.10</td>\n",
" <td>1297.0</td>\n",
" <td>0.10030</td>\n",
" <td>0.13280</td>\n",
" <td>0.1980</td>\n",
" <td>0.10430</td>\n",
" <td>...</td>\n",
" <td>22.54</td>\n",
" <td>16.67</td>\n",
" <td>152.20</td>\n",
" <td>1575.0</td>\n",
" <td>0.1374</td>\n",
" <td>0.2050</td>\n",
" <td>0.4000</td>\n",
" <td>0.1625</td>\n",
" <td>0.2364</td>\n",
" <td>0.07678</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 32 columns</p>\n",
"</div>"
],
"text/plain": [
" ID diagnosis radius_mean radius_sd_error radius_worst \\\n",
"0 842302 M 17.99 10.38 122.80 \n",
"1 842517 M 20.57 17.77 132.90 \n",
"2 84300903 M 19.69 21.25 130.00 \n",
"3 84348301 M 11.42 20.38 77.58 \n",
"4 84358402 M 20.29 14.34 135.10 \n",
"\n",
" texture_mean texture_sd_error texture_worst perimeter_mean \\\n",
"0 1001.0 0.11840 0.27760 0.3001 \n",
"1 1326.0 0.08474 0.07864 0.0869 \n",
"2 1203.0 0.10960 0.15990 0.1974 \n",
"3 386.1 0.14250 0.28390 0.2414 \n",
"4 1297.0 0.10030 0.13280 0.1980 \n",
"\n",
" perimeter_sd_error ... concavity_worst \\\n",
"0 0.14710 ... 25.38 \n",
"1 0.07017 ... 24.99 \n",
"2 0.12790 ... 23.57 \n",
"3 0.10520 ... 14.91 \n",
"4 0.10430 ... 22.54 \n",
"\n",
" concave_points_mean concave_points_sd_error concave_points_worst \\\n",
"0 17.33 184.60 2019.0 \n",
"1 23.41 158.80 1956.0 \n",
"2 25.53 152.50 1709.0 \n",
"3 26.50 98.87 567.7 \n",
"4 16.67 152.20 1575.0 \n",
"\n",
" symmetry_mean symmetry_sd_error symmetry_worst fractal_dimension_mean \\\n",
"0 0.1622 0.6656 0.7119 0.2654 \n",
"1 0.1238 0.1866 0.2416 0.1860 \n",
"2 0.1444 0.4245 0.4504 0.2430 \n",
"3 0.2098 0.8663 0.6869 0.2575 \n",
"4 0.1374 0.2050 0.4000 0.1625 \n",
"\n",
" fractal_dimension_sd_error fractal_dimension_worst \n",
"0 0.4601 0.11890 \n",
"1 0.2750 0.08902 \n",
"2 0.3613 0.08758 \n",
"3 0.6638 0.17300 \n",
"4 0.2364 0.07678 \n",
"\n",
"[5 rows x 32 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bc_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>diagnosis</th>\n",
" <th colspan=\"2\" halign=\"left\">smoothness_mean</th>\n",
" <th colspan=\"2\" halign=\"left\">compactness_mean</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th></th>\n",
" <th>mean</th>\n",
" <th>median</th>\n",
" <th>mean</th>\n",
" <th>median</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>B</td>\n",
" <td>2.000321</td>\n",
" <td>1.8510</td>\n",
" <td>0.021438</td>\n",
" <td>0.01631</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>M</td>\n",
" <td>4.323929</td>\n",
" <td>3.6795</td>\n",
" <td>0.032281</td>\n",
" <td>0.02859</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" diagnosis smoothness_mean compactness_mean \n",
" mean median mean median\n",
"0 B 2.000321 1.8510 0.021438 0.01631\n",
"1 M 4.323929 3.6795 0.032281 0.02859"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get mean and median of compactness and smoothness mean for benign and malignant tumors\n",
"bc_data.groupby('diagnosis')[['smoothness_mean','compactness_mean']].agg(['mean', 'median']).reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Ttest_indResult(statistic=-15.934158019257902, pvalue=1.6519051758498057e-47)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from scipy.stats import ttest_ind\n",
"b = bc_data[bc_data['diagnosis']=='B']\n",
"m = bc_data[bc_data['diagnosis']=='M']\n",
"ttest_ind(b['smoothness_mean'], m['smoothness_mean'])"
]
},
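{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since the two groups need not share a variance, Welch's variant of the test is a safer default (a sketch reusing the `b` and `m` subsets from above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Welch's t-test: does not assume equal variances between the two groups\n",
"ttest_ind(b['smoothness_mean'], m['smoothness_mean'], equal_var=False)"
]
},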
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# function to generate bootstrap samples\n",
"from sklearn.utils import resample\n",
"def generate_bootstrap_samples(data, target_col, n_samples, n_bootstraps=10):\n",
" bootstraps = []\n",
" for x in range(n_bootstraps):\n",
" bootstrap = resample(data, data['{}'.format(target_col)], n_samples=n_samples)\n",
" bootstraps.append(bootstrap)\n",
" # return resampled bootstraps\n",
" return bootstraps"
]
},
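{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the helper above (a sketch; the `n_samples` and `n_bootstraps` values are arbitrary). Each element returned is a `(features, target)` pair resampled with replacement:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# example usage: draw 5 bootstrap samples of 100 rows each\n",
"samples = generate_bootstrap_samples(bc_data, 'diagnosis', n_samples=100, n_bootstraps=5)\n",
"X_boot, y_boot = samples[0]\n",
"print(len(samples), X_boot.shape, y_boot.shape)"
]
},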
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Exploratory Analysis"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<iframe id=\"igraph\" scrolling=\"no\" style=\"border:none;\" seamless=\"seamless\" src=\"https://plot.ly/~sjstebbins/45.embed\" height=\"525px\" width=\"100%\"></iframe>"
],
"text/plain": [
"<plotly.tools.PlotlyDisplay object>"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.feature_selection import SelectKBest\n",
"from sklearn.feature_selection import chi2\n",
"from plotly import __version__\n",
"from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot\n",
"import plotly.plotly as py\n",
"import plotly.graph_objs as go\n",
"\n",
"test = SelectKBest(score_func=chi2)\n",
"fit = test.fit(data, target)\n",
"features = sorted(zip(data.columns, fit.scores_))\n",
"features = pd.DataFrame(features, columns=['feature', 'score']).sort_values('score', ascending=False)\n",
"trace = go.Bar(x=features['feature'], y=features['score'],\n",
" marker=dict(color='red'),\n",
" error_y=dict(visible=True),\n",
" opacity=0.5\n",
" )\n",
"\n",
"layout = go.Layout(title=\"Feature Score\")\n",
"fig = go.Figure(data=[trace], layout=layout)\n",
"py.iplot(fig)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.98204467, 0.01617649, 0.00155751])"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# PCA indicates that first PC explains 98% of the variance in the data\n",
"from sklearn.decomposition import PCA\n",
"pca = PCA(n_components=3)\n",
"fit = pca.fit(data)\n",
"fit.explained_variance_ratio_"
]
},
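{
"cell_type": "markdown",
"metadata": {},
"source": [
"The 98% figure above is computed on unscaled features, so the first component is dominated by the columns with the largest numeric ranges. A sketch of the same decomposition on standardized data, where the variance spreads across more components:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.preprocessing import StandardScaler\n",
"# standardize each feature to zero mean and unit variance before PCA\n",
"scaled = StandardScaler().fit_transform(data)\n",
"PCA(n_components=3).fit(scaled).explained_variance_ratio_"
]
},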
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Modeling"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### model 1: random forest \n",
"The problem with using decision trees is that a small change in the data may result in very different splits and their individual predictive acurracy may not be very not the great. By using a random forest classifier however and ensembling many trees, we avoid overfitting through adding randomness in the feature selection and training set selection and taking a measured vote. "
]
},
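{
"cell_type": "markdown",
"metadata": {},
"source": [
"To illustrate the point, a sketch (untuned, default hyperparameters) comparing a single decision tree's cross-validated accuracy against a small forest's:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.model_selection import cross_val_score\n",
"\n",
"tree_acc = cross_val_score(DecisionTreeClassifier(), data, target, cv=3, scoring='accuracy').mean()\n",
"forest_acc = cross_val_score(RandomForestClassifier(n_estimators=50), data, target, cv=3, scoring='accuracy').mean()\n",
"print('single tree: {:.3f}, random forest: {:.3f}'.format(tree_acc, forest_acc))"
]
},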
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"GridSearchCV(cv=3, error_score='raise',\n",
" estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n",
" oob_score=False, random_state=None, verbose=0,\n",
" warm_start=False),\n",
" fit_params=None, iid=True, n_jobs=1,\n",
" param_grid={'n_estimators': [10, 50, 100], 'criterion': ['gini', 'entropy'], 'min_samples_leaf': range(1, 10)},\n",
" pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n",
" scoring='accuracy', verbose=0)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# fit model\n",
"from sklearn import ensemble\n",
"from sklearn.model_selection import GridSearchCV\n",
"randomForest = ensemble.RandomForestClassifier()\n",
"parameters = {\"n_estimators\": [10, 50, 100], \"criterion\": [\"gini\", \"entropy\"], \\\n",
" \"min_samples_leaf\": range(1, 10)}\n",
"grid_search_forest = GridSearchCV(randomForest, parameters, scoring='accuracy', cv=3)\n",
"grid_search_forest.fit(data, target) "
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.961335676625659"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# measure model accuracy\n",
"grid_search_forest.best_score_"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'criterion': 'gini', 'min_samples_leaf': 2, 'n_estimators': 50}"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# best params\n",
"grid_search_forest.best_params_"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<iframe id=\"igraph\" scrolling=\"no\" style=\"border:none;\" seamless=\"seamless\" src=\"https://plot.ly/~sjstebbins/47.embed\" height=\"525px\" width=\"100%\"></iframe>"
],
"text/plain": [
"<plotly.tools.PlotlyDisplay object>"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# feature importance\n",
"best_forest = ensemble.RandomForestClassifier(**grid_search_forest.best_params_)\n",
"best_forest.fit(data, target)\n",
"importances = sorted(zip(data.columns, best_forest.feature_importances_))\n",
"importances = pd.DataFrame(importances, columns=['feature', 'importance']).sort_values('importance', ascending=False)\n",
"trace = go.Bar(x=importances['feature'], y=importances['importance'],\n",
" marker=dict(color='red'),\n",
" error_y=dict(visible=True),\n",
" opacity=0.5\n",
" )\n",
"\n",
"layout = go.Layout(title=\"Feature importances\")\n",
"fig = go.Figure(data=[trace], layout=layout)\n",
"py.iplot(fig)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### model 2: support vector machine\n",
"The advantage of an SVM is there is no local minima, they can be more efficient because it uses a subset of training points and you can build in expert knowledge about the problem via engineering the kernel. One disadvantage is that although SVM will find the an often high accuracy, it is very computational intensive. "
]
},
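{
"cell_type": "markdown",
"metadata": {},
"source": [
"The \"subset of training points\" property can be verified directly: after fitting, `n_support_` reports how many support vectors define the decision boundary (a sketch with an arbitrary linear kernel and C):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn import svm\n",
"svc_demo = svm.SVC(kernel='linear', C=1).fit(data, target)\n",
"# support vectors per class vs. total training points\n",
"print(svc_demo.n_support_, 'support vectors per class, of', len(data), 'training points')"
]
},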
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"GridSearchCV(cv=3, error_score='raise',\n",
" estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
" decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',\n",
" max_iter=-1, probability=False, random_state=None, shrinking=True,\n",
" tol=0.001, verbose=False),\n",
" fit_params=None, iid=True, n_jobs=1,\n",
" param_grid={'C': [1, 10, 100, 1000], 'kernel': ('linear', 'rbf')},\n",
" pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n",
" scoring=None, verbose=0)"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# fit model - try some param tuning using grid search\n",
"from sklearn import svm\n",
"parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10, 100, 1000]}\n",
"svc = svm.SVC()\n",
"grid_search_svc = GridSearchCV(svc, parameters, cv=3)\n",
"grid_search_svc.fit(data, target)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.95782073813708257"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# measure model accuracy\n",
"grid_search_svc.best_score_"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'C': 100, 'kernel': 'linear'}"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# best params\n",
"grid_search_svc.best_params_"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<iframe id=\"igraph\" scrolling=\"no\" style=\"border:none;\" seamless=\"seamless\" src=\"https://plot.ly/~sjstebbins/49.embed\" height=\"525px\" width=\"100%\"></iframe>"
],
"text/plain": [
"<plotly.tools.PlotlyDisplay object>"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# feature importance\n",
"best_svc = svm.SVC(**grid_search_svc.best_params_)\n",
"best_svc.fit(data, target)\n",
"importances = sorted(zip(data, best_svc.coef_.ravel()))\n",
"importances = pd.DataFrame(importances, columns=['feature', 'importance']).sort_values('importance', ascending=False)\n",
"trace = go.Bar(x=importances['feature'], y=importances['importance'],\n",
" marker=dict(color='red'),\n",
" error_y=dict(visible=True),\n",
" opacity=0.5\n",
" )\n",
"\n",
"layout = go.Layout(title=\"Feature importances\")\n",
"fig = go.Figure(data=[trace], layout=layout)\n",
"py.iplot(fig)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Explanation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Non-Technical:\n",
"The model with the highest accuracy used to predict benign versus malignant tumors was the random forest classifier with a 96.5% accuracy. Random Forest work by using many random simple decision trees, which are weak learners individually, but when an average or voting majority of the resulting terminal nodes is taken across all the trees, the aggregate learner can be highly accurate. SVMs approach the problem of classifying data into two-classes in a direct way: they construct linear decision boundaries, by explicitly separating the data in two different classes as well as possible. The decision boundaries are called hyperplanes in the feature space which is the multidemensional space in which the model is fit and the data points live. What this means is that if you imagine a piece of paper has two dimensions (height and width) and you have a lot of points drawn on the paper at different heights and width, you can draw a line on the paper seperating any collection of dots into two groups. If the groups are in two classes (small, big) the model can seperate these two classes with extermely high sccuracy given teh flexibility to draw unique curvy lines etc.. (which is equivalent to having flexibility with the paramaters of the SVM model) If its a three dimensional box with points in space, you can divide up the the 3 dimensional feature space by having a sheet separate the 3D space into two similar classificaitons. This hyperplane at 2 and 3 dimensions can take place at any dimensional level and in the case of this model, we have built a dimensional space with as many dimensions as there are features in our the dataset (30 in all) and the model was able to seperate the benign and malignant tumors with a 95.8% accuracy given the flexibility in parameters we set(the flexibility to draw so many curves in your line etc.. ) The more accurate model in this case, the random forest model, also found that fractal_dimension_mean was the dimension or feature with the highest predicitve power towards whether a tumor would be malignant or benign. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Technical\n",
"The limitations from a technical standpoint with my model are fairly straightforward. With the SVM, I could 1) perform dimensionality reduction using the PCA results or feature selection to speed up the training time 2) I could widen the gridsearch param ranges and experiement with feature creation as well, 3) I could explore further models and then ensemble the most accurate, yet least correlated ones to yield a potentially higher accuracy model wihotu overfitting. "
]
},
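{
"cell_type": "markdown",
"metadata": {},
"source": [
"A sketch of point 3 (hypothetical, untuned estimator choices) using a hard-voting ensemble over dissimilar models:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier, VotingClassifier\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.model_selection import cross_val_score\n",
"from sklearn.svm import SVC\n",
"\n",
"# ensemble three dissimilar classifiers and take a majority vote\n",
"voter = VotingClassifier(estimators=[\n",
" ('rf', RandomForestClassifier(n_estimators=50)),\n",
" ('svc', SVC(kernel='linear', C=100)),\n",
" ('lr', LogisticRegression())\n",
"], voting='hard')\n",
"cross_val_score(voter, data, target, cv=3, scoring='accuracy').mean()"
]
},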
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## #Part II"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"### Student 1's code submission:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#!/usr/bin/env python\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.cross_validation import cross_val_score\n",
"\n",
"# Load data\n",
"d = pd.read_csv('./data/train.csv')\n",
"\n",
"\n",
"# Setup data for prediction\n",
"x1 = data.SalaryNormalized # using data variable here when it is hasn't been assigned yet - need to change d = above to data =\n",
"x2 = pd.get_dummies(data.ContractType)\n",
"\n",
"# Setup model\n",
"model = LinearRegression()\n",
"\n",
"# Evaluate model\n",
"from sklearn.cross_validation import cross_val_score # do not need to import this module a second time\n",
"from sklearn.cross_validation import train_test_split # this is an unecessary import as the module is never used\n",
"scores = cross_val_score(model, x2, x1, cv=1, scoring='mean_absolute_error') # using cv=1 is not performing true cross validaiton. \n",
"# Setting cv=1 uses only 1 Kfold, which partitions the original sample into 1 random sample of the original data that is equal in \n",
"# size to the original data. Thus, cv=1 is the same as using the original sample. \n",
"print(scores.mean())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Review \n",
"\n",
"#### Code\n",
"See inline comments above\n",
"#### Methodology\n",
"Methodology is fairly standard. However, student should be more mindful not to import unecessary or duplicate packages and should make sure to use tteh variable names they have assigned. \n",
"#### Conceptual Understanding\n",
"The student doesnt seem to grasp the concept that cross validation partitions the original data into subsections. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Student 2's code submission:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#!/usr/bin/env python\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.cross_validation import cross_val_score\n",
"\n",
"# Load data\n",
"data = pd.read_csv('./data/train.csv')\n",
"\n",
"\n",
"# Setup data for prediction\n",
"y = data.SalaryNormalized\n",
"X = pd.get_dummies(data.ContractType)\n",
"\n",
"# Setup model\n",
"model = LinearRegression()\n",
"\n",
"# Evaluate model\n",
"scores = cross_val_score(model, X, y, cv=5, scoring='mean_absolute_error') # variable assignment 'y' should match 'Y' above\n",
"print(scores.mean())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Review \n",
"\n",
"#### Code\n",
"See inline comments above\n",
"#### Methodology\n",
"The methodology here is pretty standard. Student should be maintain consistent variable assignment syntax - all caps or all lowercase etc..\n",
"#### Conceptual Understanding\n",
"Student seems to a have a succint understanding of LinearRegression function and cross validation"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [Root]",
"language": "python",
"name": "Python [Root]"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 1
}