Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Her Code Camp Notebooks
# Conda environment for the Her Code Camp notebooks (random-forest cancer classifier).
name: example-environment
channels:
  - conda-forge
dependencies:
  - python
  - numpy
  # Graphviz itself (the `dot` program); the notebook's pydotplus create_png() call relies on it.
  - graphviz
  # Everything below is installed with pip rather than conda.
  - pip:
    - nbgitpuller
    - sphinx-gallery
    - pandas
    - matplotlib
    # Install under the real distribution name: the PyPI package `sklearn` is a
    # deprecated alias and `pip install sklearn` now fails.
    - scikit-learn
    - pydotplus
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"<h3>Building a Cancer Classifier using Random Forest</h3>"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"<h4>1- Load The Required Packages</h4>"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"import pandas as pd #data manipulation\n",
"from sklearn.model_selection import train_test_split #splitting the data to train and test\n",
"from sklearn import tree #running a decision tree\n",
"from sklearn.ensemble import RandomForestClassifier #running a random forest\n",
"from sklearn import datasets #saved datasets\n",
"\n",
"from sklearn import metrics #assessing model performance\n",
"from sklearn.metrics import classification_report #assessing model performance\n",
"from sklearn.metrics import confusion_matrix #assessing model performance\n",
"import matplotlib.pyplot as plt #visualize model performance\n",
"\n",
"pd.set_option('display.max_columns', 30) #display all columns in your data"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"<h4>2- Load The Data</h4>"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>mean radius</th>\n",
" <th>mean texture</th>\n",
" <th>mean perimeter</th>\n",
" <th>mean area</th>\n",
" <th>mean smoothness</th>\n",
" <th>mean compactness</th>\n",
" <th>mean concavity</th>\n",
" <th>mean concave points</th>\n",
" <th>mean symmetry</th>\n",
" <th>mean fractal dimension</th>\n",
" <th>radius error</th>\n",
" <th>texture error</th>\n",
" <th>perimeter error</th>\n",
" <th>area error</th>\n",
" <th>smoothness error</th>\n",
" <th>compactness error</th>\n",
" <th>concavity error</th>\n",
" <th>concave points error</th>\n",
" <th>symmetry error</th>\n",
" <th>fractal dimension error</th>\n",
" <th>worst radius</th>\n",
" <th>worst texture</th>\n",
" <th>worst perimeter</th>\n",
" <th>worst area</th>\n",
" <th>worst smoothness</th>\n",
" <th>worst compactness</th>\n",
" <th>worst concavity</th>\n",
" <th>worst concave points</th>\n",
" <th>worst symmetry</th>\n",
" <th>worst fractal dimension</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>17.99</td>\n",
" <td>10.38</td>\n",
" <td>122.80</td>\n",
" <td>1001.0</td>\n",
" <td>0.11840</td>\n",
" <td>0.27760</td>\n",
" <td>0.3001</td>\n",
" <td>0.14710</td>\n",
" <td>0.2419</td>\n",
" <td>0.07871</td>\n",
" <td>1.0950</td>\n",
" <td>0.9053</td>\n",
" <td>8.589</td>\n",
" <td>153.40</td>\n",
" <td>0.006399</td>\n",
" <td>0.04904</td>\n",
" <td>0.05373</td>\n",
" <td>0.01587</td>\n",
" <td>0.03003</td>\n",
" <td>0.006193</td>\n",
" <td>25.38</td>\n",
" <td>17.33</td>\n",
" <td>184.60</td>\n",
" <td>2019.0</td>\n",
" <td>0.1622</td>\n",
" <td>0.6656</td>\n",
" <td>0.7119</td>\n",
" <td>0.2654</td>\n",
" <td>0.4601</td>\n",
" <td>0.11890</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>20.57</td>\n",
" <td>17.77</td>\n",
" <td>132.90</td>\n",
" <td>1326.0</td>\n",
" <td>0.08474</td>\n",
" <td>0.07864</td>\n",
" <td>0.0869</td>\n",
" <td>0.07017</td>\n",
" <td>0.1812</td>\n",
" <td>0.05667</td>\n",
" <td>0.5435</td>\n",
" <td>0.7339</td>\n",
" <td>3.398</td>\n",
" <td>74.08</td>\n",
" <td>0.005225</td>\n",
" <td>0.01308</td>\n",
" <td>0.01860</td>\n",
" <td>0.01340</td>\n",
" <td>0.01389</td>\n",
" <td>0.003532</td>\n",
" <td>24.99</td>\n",
" <td>23.41</td>\n",
" <td>158.80</td>\n",
" <td>1956.0</td>\n",
" <td>0.1238</td>\n",
" <td>0.1866</td>\n",
" <td>0.2416</td>\n",
" <td>0.1860</td>\n",
" <td>0.2750</td>\n",
" <td>0.08902</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>19.69</td>\n",
" <td>21.25</td>\n",
" <td>130.00</td>\n",
" <td>1203.0</td>\n",
" <td>0.10960</td>\n",
" <td>0.15990</td>\n",
" <td>0.1974</td>\n",
" <td>0.12790</td>\n",
" <td>0.2069</td>\n",
" <td>0.05999</td>\n",
" <td>0.7456</td>\n",
" <td>0.7869</td>\n",
" <td>4.585</td>\n",
" <td>94.03</td>\n",
" <td>0.006150</td>\n",
" <td>0.04006</td>\n",
" <td>0.03832</td>\n",
" <td>0.02058</td>\n",
" <td>0.02250</td>\n",
" <td>0.004571</td>\n",
" <td>23.57</td>\n",
" <td>25.53</td>\n",
" <td>152.50</td>\n",
" <td>1709.0</td>\n",
" <td>0.1444</td>\n",
" <td>0.4245</td>\n",
" <td>0.4504</td>\n",
" <td>0.2430</td>\n",
" <td>0.3613</td>\n",
" <td>0.08758</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>11.42</td>\n",
" <td>20.38</td>\n",
" <td>77.58</td>\n",
" <td>386.1</td>\n",
" <td>0.14250</td>\n",
" <td>0.28390</td>\n",
" <td>0.2414</td>\n",
" <td>0.10520</td>\n",
" <td>0.2597</td>\n",
" <td>0.09744</td>\n",
" <td>0.4956</td>\n",
" <td>1.1560</td>\n",
" <td>3.445</td>\n",
" <td>27.23</td>\n",
" <td>0.009110</td>\n",
" <td>0.07458</td>\n",
" <td>0.05661</td>\n",
" <td>0.01867</td>\n",
" <td>0.05963</td>\n",
" <td>0.009208</td>\n",
" <td>14.91</td>\n",
" <td>26.50</td>\n",
" <td>98.87</td>\n",
" <td>567.7</td>\n",
" <td>0.2098</td>\n",
" <td>0.8663</td>\n",
" <td>0.6869</td>\n",
" <td>0.2575</td>\n",
" <td>0.6638</td>\n",
" <td>0.17300</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>20.29</td>\n",
" <td>14.34</td>\n",
" <td>135.10</td>\n",
" <td>1297.0</td>\n",
" <td>0.10030</td>\n",
" <td>0.13280</td>\n",
" <td>0.1980</td>\n",
" <td>0.10430</td>\n",
" <td>0.1809</td>\n",
" <td>0.05883</td>\n",
" <td>0.7572</td>\n",
" <td>0.7813</td>\n",
" <td>5.438</td>\n",
" <td>94.44</td>\n",
" <td>0.011490</td>\n",
" <td>0.02461</td>\n",
" <td>0.05688</td>\n",
" <td>0.01885</td>\n",
" <td>0.01756</td>\n",
" <td>0.005115</td>\n",
" <td>22.54</td>\n",
" <td>16.67</td>\n",
" <td>152.20</td>\n",
" <td>1575.0</td>\n",
" <td>0.1374</td>\n",
" <td>0.2050</td>\n",
" <td>0.4000</td>\n",
" <td>0.1625</td>\n",
" <td>0.2364</td>\n",
" <td>0.07678</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" mean radius mean texture mean perimeter mean area mean smoothness \\\n",
"0 17.99 10.38 122.80 1001.0 0.11840 \n",
"1 20.57 17.77 132.90 1326.0 0.08474 \n",
"2 19.69 21.25 130.00 1203.0 0.10960 \n",
"3 11.42 20.38 77.58 386.1 0.14250 \n",
"4 20.29 14.34 135.10 1297.0 0.10030 \n",
"\n",
" mean compactness mean concavity mean concave points mean symmetry \\\n",
"0 0.27760 0.3001 0.14710 0.2419 \n",
"1 0.07864 0.0869 0.07017 0.1812 \n",
"2 0.15990 0.1974 0.12790 0.2069 \n",
"3 0.28390 0.2414 0.10520 0.2597 \n",
"4 0.13280 0.1980 0.10430 0.1809 \n",
"\n",
" mean fractal dimension radius error texture error perimeter error \\\n",
"0 0.07871 1.0950 0.9053 8.589 \n",
"1 0.05667 0.5435 0.7339 3.398 \n",
"2 0.05999 0.7456 0.7869 4.585 \n",
"3 0.09744 0.4956 1.1560 3.445 \n",
"4 0.05883 0.7572 0.7813 5.438 \n",
"\n",
" area error smoothness error compactness error concavity error \\\n",
"0 153.40 0.006399 0.04904 0.05373 \n",
"1 74.08 0.005225 0.01308 0.01860 \n",
"2 94.03 0.006150 0.04006 0.03832 \n",
"3 27.23 0.009110 0.07458 0.05661 \n",
"4 94.44 0.011490 0.02461 0.05688 \n",
"\n",
" concave points error symmetry error fractal dimension error worst radius \\\n",
"0 0.01587 0.03003 0.006193 25.38 \n",
"1 0.01340 0.01389 0.003532 24.99 \n",
"2 0.02058 0.02250 0.004571 23.57 \n",
"3 0.01867 0.05963 0.009208 14.91 \n",
"4 0.01885 0.01756 0.005115 22.54 \n",
"\n",
" worst texture worst perimeter worst area worst smoothness worst compactness \\\n",
"0 17.33 184.60 2019.0 0.1622 0.6656 \n",
"1 23.41 158.80 1956.0 0.1238 0.1866 \n",
"2 25.53 152.50 1709.0 0.1444 0.4245 \n",
"3 26.50 98.87 567.7 0.2098 0.8663 \n",
"4 16.67 152.20 1575.0 0.1374 0.2050 \n",
"\n",
" worst concavity worst concave points worst symmetry worst fractal dimension \n",
"0 0.7119 0.2654 0.4601 0.11890 \n",
"1 0.2416 0.1860 0.2750 0.08902 \n",
"2 0.4504 0.2430 0.3613 0.08758 \n",
"3 0.6869 0.2575 0.6638 0.17300 \n",
"4 0.4000 0.1625 0.2364 0.07678 "
]
},
"execution_count": 2,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"cancer =\n",
"X= #define your features\n",
"Y= #define the target variable\n",
"X.head() #view the first few rows from your features"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(569, 30)\n"
]
}
],
"source": [
"#print the dimensions of the dataset\n",
"print()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"MultiIndex([( 'mean radius',),\n",
" ( 'mean texture',),\n",
" ( 'mean perimeter',),\n",
" ( 'mean area',),\n",
" ( 'mean smoothness',),\n",
" ( 'mean compactness',),\n",
" ( 'mean concavity',),\n",
" ( 'mean concave points',),\n",
" ( 'mean symmetry',),\n",
" ( 'mean fractal dimension',),\n",
" ( 'radius error',),\n",
" ( 'texture error',),\n",
" ( 'perimeter error',),\n",
" ( 'area error',),\n",
" ( 'smoothness error',),\n",
" ( 'compactness error',),\n",
" ( 'concavity error',),\n",
" ( 'concave points error',),\n",
" ( 'symmetry error',),\n",
" ('fractal dimension error',),\n",
" ( 'worst radius',),\n",
" ( 'worst texture',),\n",
" ( 'worst perimeter',),\n",
" ( 'worst area',),\n",
" ( 'worst smoothness',),\n",
" ( 'worst compactness',),\n",
" ( 'worst concavity',),\n",
" ( 'worst concave points',),\n",
" ( 'worst symmetry',),\n",
" ('worst fractal dimension',)],\n",
" )"
]
},
"execution_count": 4,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#let's look at column names\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>mean radius</th>\n",
" <th>mean texture</th>\n",
" <th>mean perimeter</th>\n",
" <th>mean area</th>\n",
" <th>mean smoothness</th>\n",
" <th>mean compactness</th>\n",
" <th>mean concavity</th>\n",
" <th>mean concave points</th>\n",
" <th>mean symmetry</th>\n",
" <th>mean fractal dimension</th>\n",
" <th>radius error</th>\n",
" <th>texture error</th>\n",
" <th>perimeter error</th>\n",
" <th>area error</th>\n",
" <th>smoothness error</th>\n",
" <th>compactness error</th>\n",
" <th>concavity error</th>\n",
" <th>concave points error</th>\n",
" <th>symmetry error</th>\n",
" <th>fractal dimension error</th>\n",
" <th>worst radius</th>\n",
" <th>worst texture</th>\n",
" <th>worst perimeter</th>\n",
" <th>worst area</th>\n",
" <th>worst smoothness</th>\n",
" <th>worst compactness</th>\n",
" <th>worst concavity</th>\n",
" <th>worst concave points</th>\n",
" <th>worst symmetry</th>\n",
" <th>worst fractal dimension</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>count</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean</td>\n",
" <td>14.127292</td>\n",
" <td>19.289649</td>\n",
" <td>91.969033</td>\n",
" <td>654.889104</td>\n",
" <td>0.096360</td>\n",
" <td>0.104341</td>\n",
" <td>0.088799</td>\n",
" <td>0.048919</td>\n",
" <td>0.181162</td>\n",
" <td>0.062798</td>\n",
" <td>0.405172</td>\n",
" <td>1.216853</td>\n",
" <td>2.866059</td>\n",
" <td>40.337079</td>\n",
" <td>0.007041</td>\n",
" <td>0.025478</td>\n",
" <td>0.031894</td>\n",
" <td>0.011796</td>\n",
" <td>0.020542</td>\n",
" <td>0.003795</td>\n",
" <td>16.269190</td>\n",
" <td>25.677223</td>\n",
" <td>107.261213</td>\n",
" <td>880.583128</td>\n",
" <td>0.132369</td>\n",
" <td>0.254265</td>\n",
" <td>0.272188</td>\n",
" <td>0.114606</td>\n",
" <td>0.290076</td>\n",
" <td>0.083946</td>\n",
" </tr>\n",
" <tr>\n",
" <td>std</td>\n",
" <td>3.524049</td>\n",
" <td>4.301036</td>\n",
" <td>24.298981</td>\n",
" <td>351.914129</td>\n",
" <td>0.014064</td>\n",
" <td>0.052813</td>\n",
" <td>0.079720</td>\n",
" <td>0.038803</td>\n",
" <td>0.027414</td>\n",
" <td>0.007060</td>\n",
" <td>0.277313</td>\n",
" <td>0.551648</td>\n",
" <td>2.021855</td>\n",
" <td>45.491006</td>\n",
" <td>0.003003</td>\n",
" <td>0.017908</td>\n",
" <td>0.030186</td>\n",
" <td>0.006170</td>\n",
" <td>0.008266</td>\n",
" <td>0.002646</td>\n",
" <td>4.833242</td>\n",
" <td>6.146258</td>\n",
" <td>33.602542</td>\n",
" <td>569.356993</td>\n",
" <td>0.022832</td>\n",
" <td>0.157336</td>\n",
" <td>0.208624</td>\n",
" <td>0.065732</td>\n",
" <td>0.061867</td>\n",
" <td>0.018061</td>\n",
" </tr>\n",
" <tr>\n",
" <td>min</td>\n",
" <td>6.981000</td>\n",
" <td>9.710000</td>\n",
" <td>43.790000</td>\n",
" <td>143.500000</td>\n",
" <td>0.052630</td>\n",
" <td>0.019380</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.106000</td>\n",
" <td>0.049960</td>\n",
" <td>0.111500</td>\n",
" <td>0.360200</td>\n",
" <td>0.757000</td>\n",
" <td>6.802000</td>\n",
" <td>0.001713</td>\n",
" <td>0.002252</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.007882</td>\n",
" <td>0.000895</td>\n",
" <td>7.930000</td>\n",
" <td>12.020000</td>\n",
" <td>50.410000</td>\n",
" <td>185.200000</td>\n",
" <td>0.071170</td>\n",
" <td>0.027290</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.156500</td>\n",
" <td>0.055040</td>\n",
" </tr>\n",
" <tr>\n",
" <td>25%</td>\n",
" <td>11.700000</td>\n",
" <td>16.170000</td>\n",
" <td>75.170000</td>\n",
" <td>420.300000</td>\n",
" <td>0.086370</td>\n",
" <td>0.064920</td>\n",
" <td>0.029560</td>\n",
" <td>0.020310</td>\n",
" <td>0.161900</td>\n",
" <td>0.057700</td>\n",
" <td>0.232400</td>\n",
" <td>0.833900</td>\n",
" <td>1.606000</td>\n",
" <td>17.850000</td>\n",
" <td>0.005169</td>\n",
" <td>0.013080</td>\n",
" <td>0.015090</td>\n",
" <td>0.007638</td>\n",
" <td>0.015160</td>\n",
" <td>0.002248</td>\n",
" <td>13.010000</td>\n",
" <td>21.080000</td>\n",
" <td>84.110000</td>\n",
" <td>515.300000</td>\n",
" <td>0.116600</td>\n",
" <td>0.147200</td>\n",
" <td>0.114500</td>\n",
" <td>0.064930</td>\n",
" <td>0.250400</td>\n",
" <td>0.071460</td>\n",
" </tr>\n",
" <tr>\n",
" <td>50%</td>\n",
" <td>13.370000</td>\n",
" <td>18.840000</td>\n",
" <td>86.240000</td>\n",
" <td>551.100000</td>\n",
" <td>0.095870</td>\n",
" <td>0.092630</td>\n",
" <td>0.061540</td>\n",
" <td>0.033500</td>\n",
" <td>0.179200</td>\n",
" <td>0.061540</td>\n",
" <td>0.324200</td>\n",
" <td>1.108000</td>\n",
" <td>2.287000</td>\n",
" <td>24.530000</td>\n",
" <td>0.006380</td>\n",
" <td>0.020450</td>\n",
" <td>0.025890</td>\n",
" <td>0.010930</td>\n",
" <td>0.018730</td>\n",
" <td>0.003187</td>\n",
" <td>14.970000</td>\n",
" <td>25.410000</td>\n",
" <td>97.660000</td>\n",
" <td>686.500000</td>\n",
" <td>0.131300</td>\n",
" <td>0.211900</td>\n",
" <td>0.226700</td>\n",
" <td>0.099930</td>\n",
" <td>0.282200</td>\n",
" <td>0.080040</td>\n",
" </tr>\n",
" <tr>\n",
" <td>75%</td>\n",
" <td>15.780000</td>\n",
" <td>21.800000</td>\n",
" <td>104.100000</td>\n",
" <td>782.700000</td>\n",
" <td>0.105300</td>\n",
" <td>0.130400</td>\n",
" <td>0.130700</td>\n",
" <td>0.074000</td>\n",
" <td>0.195700</td>\n",
" <td>0.066120</td>\n",
" <td>0.478900</td>\n",
" <td>1.474000</td>\n",
" <td>3.357000</td>\n",
" <td>45.190000</td>\n",
" <td>0.008146</td>\n",
" <td>0.032450</td>\n",
" <td>0.042050</td>\n",
" <td>0.014710</td>\n",
" <td>0.023480</td>\n",
" <td>0.004558</td>\n",
" <td>18.790000</td>\n",
" <td>29.720000</td>\n",
" <td>125.400000</td>\n",
" <td>1084.000000</td>\n",
" <td>0.146000</td>\n",
" <td>0.339100</td>\n",
" <td>0.382900</td>\n",
" <td>0.161400</td>\n",
" <td>0.317900</td>\n",
" <td>0.092080</td>\n",
" </tr>\n",
" <tr>\n",
" <td>max</td>\n",
" <td>28.110000</td>\n",
" <td>39.280000</td>\n",
" <td>188.500000</td>\n",
" <td>2501.000000</td>\n",
" <td>0.163400</td>\n",
" <td>0.345400</td>\n",
" <td>0.426800</td>\n",
" <td>0.201200</td>\n",
" <td>0.304000</td>\n",
" <td>0.097440</td>\n",
" <td>2.873000</td>\n",
" <td>4.885000</td>\n",
" <td>21.980000</td>\n",
" <td>542.200000</td>\n",
" <td>0.031130</td>\n",
" <td>0.135400</td>\n",
" <td>0.396000</td>\n",
" <td>0.052790</td>\n",
" <td>0.078950</td>\n",
" <td>0.029840</td>\n",
" <td>36.040000</td>\n",
" <td>49.540000</td>\n",
" <td>251.200000</td>\n",
" <td>4254.000000</td>\n",
" <td>0.222600</td>\n",
" <td>1.058000</td>\n",
" <td>1.252000</td>\n",
" <td>0.291000</td>\n",
" <td>0.663800</td>\n",
" <td>0.207500</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" mean radius mean texture mean perimeter mean area mean smoothness \\\n",
"count 569.000000 569.000000 569.000000 569.000000 569.000000 \n",
"mean 14.127292 19.289649 91.969033 654.889104 0.096360 \n",
"std 3.524049 4.301036 24.298981 351.914129 0.014064 \n",
"min 6.981000 9.710000 43.790000 143.500000 0.052630 \n",
"25% 11.700000 16.170000 75.170000 420.300000 0.086370 \n",
"50% 13.370000 18.840000 86.240000 551.100000 0.095870 \n",
"75% 15.780000 21.800000 104.100000 782.700000 0.105300 \n",
"max 28.110000 39.280000 188.500000 2501.000000 0.163400 \n",
"\n",
" mean compactness mean concavity mean concave points mean symmetry \\\n",
"count 569.000000 569.000000 569.000000 569.000000 \n",
"mean 0.104341 0.088799 0.048919 0.181162 \n",
"std 0.052813 0.079720 0.038803 0.027414 \n",
"min 0.019380 0.000000 0.000000 0.106000 \n",
"25% 0.064920 0.029560 0.020310 0.161900 \n",
"50% 0.092630 0.061540 0.033500 0.179200 \n",
"75% 0.130400 0.130700 0.074000 0.195700 \n",
"max 0.345400 0.426800 0.201200 0.304000 \n",
"\n",
" mean fractal dimension radius error texture error perimeter error \\\n",
"count 569.000000 569.000000 569.000000 569.000000 \n",
"mean 0.062798 0.405172 1.216853 2.866059 \n",
"std 0.007060 0.277313 0.551648 2.021855 \n",
"min 0.049960 0.111500 0.360200 0.757000 \n",
"25% 0.057700 0.232400 0.833900 1.606000 \n",
"50% 0.061540 0.324200 1.108000 2.287000 \n",
"75% 0.066120 0.478900 1.474000 3.357000 \n",
"max 0.097440 2.873000 4.885000 21.980000 \n",
"\n",
" area error smoothness error compactness error concavity error \\\n",
"count 569.000000 569.000000 569.000000 569.000000 \n",
"mean 40.337079 0.007041 0.025478 0.031894 \n",
"std 45.491006 0.003003 0.017908 0.030186 \n",
"min 6.802000 0.001713 0.002252 0.000000 \n",
"25% 17.850000 0.005169 0.013080 0.015090 \n",
"50% 24.530000 0.006380 0.020450 0.025890 \n",
"75% 45.190000 0.008146 0.032450 0.042050 \n",
"max 542.200000 0.031130 0.135400 0.396000 \n",
"\n",
" concave points error symmetry error fractal dimension error \\\n",
"count 569.000000 569.000000 569.000000 \n",
"mean 0.011796 0.020542 0.003795 \n",
"std 0.006170 0.008266 0.002646 \n",
"min 0.000000 0.007882 0.000895 \n",
"25% 0.007638 0.015160 0.002248 \n",
"50% 0.010930 0.018730 0.003187 \n",
"75% 0.014710 0.023480 0.004558 \n",
"max 0.052790 0.078950 0.029840 \n",
"\n",
" worst radius worst texture worst perimeter worst area \\\n",
"count 569.000000 569.000000 569.000000 569.000000 \n",
"mean 16.269190 25.677223 107.261213 880.583128 \n",
"std 4.833242 6.146258 33.602542 569.356993 \n",
"min 7.930000 12.020000 50.410000 185.200000 \n",
"25% 13.010000 21.080000 84.110000 515.300000 \n",
"50% 14.970000 25.410000 97.660000 686.500000 \n",
"75% 18.790000 29.720000 125.400000 1084.000000 \n",
"max 36.040000 49.540000 251.200000 4254.000000 \n",
"\n",
" worst smoothness worst compactness worst concavity worst concave points \\\n",
"count 569.000000 569.000000 569.000000 569.000000 \n",
"mean 0.132369 0.254265 0.272188 0.114606 \n",
"std 0.022832 0.157336 0.208624 0.065732 \n",
"min 0.071170 0.027290 0.000000 0.000000 \n",
"25% 0.116600 0.147200 0.114500 0.064930 \n",
"50% 0.131300 0.211900 0.226700 0.099930 \n",
"75% 0.146000 0.339100 0.382900 0.161400 \n",
"max 0.222600 1.058000 1.252000 0.291000 \n",
"\n",
" worst symmetry worst fractal dimension \n",
"count 569.000000 569.000000 \n",
"mean 0.290076 0.083946 \n",
"std 0.061867 0.018061 \n",
"min 0.156500 0.055040 \n",
"25% 0.250400 0.071460 \n",
"50% 0.282200 0.080040 \n",
"75% 0.317900 0.092080 \n",
"max 0.663800 0.207500 "
]
},
"execution_count": 5,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#let's summarize the data\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"<h4>3- Split to Train and Test</h4>"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(398, 30)\n",
"(171, 30)\n",
"(398,)\n",
"(171,)\n"
]
}
],
"source": [
"#split the data to 70% train and 30% test\n",
"x_train,x_test,y_train,y_test = \n",
"\n",
"print(x_train.shape)\n",
"print(x_test.shape)\n",
"print(y_train.shape)\n",
"print(y_test.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"<h4>4- Train your model: Random Forest</h4>"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.9773869346733668"
]
},
"execution_count": 7,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"rf_model = #define the model\n",
" #fit the model (train)\n",
" #predict on new observations\n",
"\n",
"#what is the accuracy of this model?"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"Let's visualize one of the trees in our forest! (Reference: https://medium.com/@rnbrown/creating-and-visualizing-decision-trees-with-python-f8e8fa394176)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/sklearn/externals/six.py:31: DeprecationWarning: The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).\n",
" \"(https://pypi.org/project/six/).\", DeprecationWarning)\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<IPython.core.display.Image object>"
]
},
"execution_count": 8,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#select which tree you want to visualize (an index into rf_model.estimators_)\n",
"selected_tree=2\n",
"\n",
"#io.StringIO (standard library) replaces sklearn.externals.six.StringIO,\n",
"#which was deprecated in scikit-learn 0.21 and removed in 0.23\n",
"from io import StringIO\n",
"from IPython.display import Image\n",
"from sklearn.tree import export_graphviz\n",
"import pydotplus\n",
"\n",
"#write the chosen tree in Graphviz DOT format into an in-memory text buffer\n",
"dot_data2 = StringIO()\n",
"export_graphviz(rf_model.estimators_[selected_tree],\n",
"                out_file=dot_data2,\n",
"                filled=True,\n",
"                precision=2,\n",
"                feature_names=x_train.columns,\n",
"                rounded=True)\n",
"\n",
"#turn the DOT text into a PNG and show it as the cell output\n",
"graph = pydotplus.graph_from_dot_data(dot_data2.getvalue())\n",
"Image(graph.create_png())"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"<h4>5- Predict!</h4>"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>mean radius</th>\n",
" <th>mean texture</th>\n",
" <th>mean perimeter</th>\n",
" <th>mean area</th>\n",
" <th>mean smoothness</th>\n",
" <th>mean compactness</th>\n",
" <th>mean concavity</th>\n",
" <th>mean concave points</th>\n",
" <th>mean symmetry</th>\n",
" <th>mean fractal dimension</th>\n",
" <th>radius error</th>\n",
" <th>texture error</th>\n",
" <th>perimeter error</th>\n",
" <th>area error</th>\n",
" <th>smoothness error</th>\n",
" <th>compactness error</th>\n",
" <th>concavity error</th>\n",
" <th>concave points error</th>\n",
" <th>symmetry error</th>\n",
" <th>fractal dimension error</th>\n",
" <th>worst radius</th>\n",
" <th>worst texture</th>\n",
" <th>worst perimeter</th>\n",
" <th>worst area</th>\n",
" <th>worst smoothness</th>\n",
" <th>worst compactness</th>\n",
" <th>worst concavity</th>\n",
" <th>worst concave points</th>\n",
" <th>worst symmetry</th>\n",
" <th>worst fractal dimension</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>204</td>\n",
" <td>12.47</td>\n",
" <td>18.6</td>\n",
" <td>81.09</td>\n",
" <td>481.9</td>\n",
" <td>0.09965</td>\n",
" <td>0.1058</td>\n",
" <td>0.08005</td>\n",
" <td>0.03821</td>\n",
" <td>0.1925</td>\n",
" <td>0.06373</td>\n",
" <td>0.3961</td>\n",
" <td>1.044</td>\n",
" <td>2.497</td>\n",
" <td>30.29</td>\n",
" <td>0.006953</td>\n",
" <td>0.01911</td>\n",
" <td>0.02701</td>\n",
" <td>0.01037</td>\n",
" <td>0.01782</td>\n",
" <td>0.003586</td>\n",
" <td>14.97</td>\n",
" <td>24.64</td>\n",
" <td>96.05</td>\n",
" <td>677.9</td>\n",
" <td>0.1426</td>\n",
" <td>0.2378</td>\n",
" <td>0.2671</td>\n",
" <td>0.1015</td>\n",
" <td>0.3014</td>\n",
" <td>0.0875</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" mean radius mean texture mean perimeter mean area mean smoothness \\\n",
"204 12.47 18.6 81.09 481.9 0.09965 \n",
"\n",
" mean compactness mean concavity mean concave points mean symmetry \\\n",
"204 0.1058 0.08005 0.03821 0.1925 \n",
"\n",
" mean fractal dimension radius error texture error perimeter error \\\n",
"204 0.06373 0.3961 1.044 2.497 \n",
"\n",
" area error smoothness error compactness error concavity error \\\n",
"204 30.29 0.006953 0.01911 0.02701 \n",
"\n",
" concave points error symmetry error fractal dimension error worst radius \\\n",
"204 0.01037 0.01782 0.003586 14.97 \n",
"\n",
" worst texture worst perimeter worst area worst smoothness \\\n",
"204 24.64 96.05 677.9 0.1426 \n",
"\n",
" worst compactness worst concavity worst concave points worst symmetry \\\n",
"204 0.2378 0.2671 0.1015 0.3014 \n",
"\n",
" worst fractal dimension \n",
"204 0.0875 "
]
},
"execution_count": 9,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#let's pull information from one patient from the test set\n",
"patient1_test=\n",
"patient1_test"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([1])"
]
},
"execution_count": 10,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#what would our model predict? Malignant or Benign?\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[0.02191104, 0.97808896]])"
]
},
"execution_count": 11,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#can we predict the probability of a patient being malignant or benign?\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,\n",
" 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,\n",
" 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,\n",
" 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,\n",
" 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1,\n",
" 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1])"
]
},
"execution_count": 12,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#Can we predict multiple patients at once?\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[0.02191104, 0.97808896],\n",
" [0.98740914, 0.01259086],\n",
" [0.98740914, 0.01259086],\n",
" [0.01700678, 0.98299322],\n",
" [0.03095799, 0.96904201],\n",
" [0.99043945, 0.00956055],\n",
" [0.99043945, 0.00956055],\n",
" [0.97675077, 0.02324923],\n",
" [0.37698711, 0.62301289],\n",
" [0.01474356, 0.98525644]])"
]
},
"execution_count": 13,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#can we get the probability of each test case being malignant or benign? (display the first 10 lines)\n",
"\n",
"\n",
"#do you see how the 0 and 1 were generated in the previous command?"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"<h4>6- How well did we predict?</h4>"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.9649122807017544"
]
},
"execution_count": 14,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#what is the accuracy of the model on the test set?\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>predicted benign</th>\n",
" <th>predicted malignant</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>benign</td>\n",
" <td>58</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <td>malignant</td>\n",
" <td>1</td>\n",
" <td>107</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" predicted benign predicted malignant\n",
"benign 58 5\n",
"malignant 1 107"
]
},
"execution_count": 15,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#let's generate a confusion matrix!\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"<h4>7- Identifying the important questions!</h4>"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"#let's create a data frame that contains information about how important each question is in generating the correct prediction!\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>importance</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>worst perimeter</td>\n",
" <td>0.350610</td>\n",
" </tr>\n",
" <tr>\n",
" <td>worst concavity</td>\n",
" <td>0.136664</td>\n",
" </tr>\n",
" <tr>\n",
" <td>worst concave points</td>\n",
" <td>0.120838</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean concave points</td>\n",
" <td>0.088673</td>\n",
" </tr>\n",
" <tr>\n",
" <td>worst area</td>\n",
" <td>0.055339</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean perimeter</td>\n",
" <td>0.053454</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean area</td>\n",
" <td>0.046570</td>\n",
" </tr>\n",
" <tr>\n",
" <td>worst radius</td>\n",
" <td>0.031904</td>\n",
" </tr>\n",
" <tr>\n",
" <td>worst smoothness</td>\n",
" <td>0.020717</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean concavity</td>\n",
" <td>0.019964</td>\n",
" </tr>\n",
" <tr>\n",
" <td>area error</td>\n",
" <td>0.012255</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean smoothness</td>\n",
" <td>0.007980</td>\n",
" </tr>\n",
" <tr>\n",
" <td>fractal dimension error</td>\n",
" <td>0.007435</td>\n",
" </tr>\n",
" <tr>\n",
" <td>worst fractal dimension</td>\n",
" <td>0.007163</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean radius</td>\n",
" <td>0.006920</td>\n",
" </tr>\n",
" <tr>\n",
" <td>worst texture</td>\n",
" <td>0.006043</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean texture</td>\n",
" <td>0.004531</td>\n",
" </tr>\n",
" <tr>\n",
" <td>radius error</td>\n",
" <td>0.003888</td>\n",
" </tr>\n",
" <tr>\n",
" <td>worst symmetry</td>\n",
" <td>0.003170</td>\n",
" </tr>\n",
" <tr>\n",
" <td>symmetry error</td>\n",
" <td>0.002774</td>\n",
" </tr>\n",
" <tr>\n",
" <td>concavity error</td>\n",
" <td>0.002694</td>\n",
" </tr>\n",
" <tr>\n",
" <td>worst compactness</td>\n",
" <td>0.002545</td>\n",
" </tr>\n",
" <tr>\n",
" <td>compactness error</td>\n",
" <td>0.002138</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean fractal dimension</td>\n",
" <td>0.002114</td>\n",
" </tr>\n",
" <tr>\n",
" <td>perimeter error</td>\n",
" <td>0.001663</td>\n",
" </tr>\n",
" <tr>\n",
" <td>concave points error</td>\n",
" <td>0.001098</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean compactness</td>\n",
" <td>0.000856</td>\n",
" </tr>\n",
" <tr>\n",
" <td>smoothness error</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>texture error</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean symmetry</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" importance\n",
"worst perimeter 0.350610\n",
"worst concavity 0.136664\n",
"worst concave points 0.120838\n",
"mean concave points 0.088673\n",
"worst area 0.055339\n",
"mean perimeter 0.053454\n",
"mean area 0.046570\n",
"worst radius 0.031904\n",
"worst smoothness 0.020717\n",
"mean concavity 0.019964\n",
"area error 0.012255\n",
"mean smoothness 0.007980\n",
"fractal dimension error 0.007435\n",
"worst fractal dimension 0.007163\n",
"mean radius 0.006920\n",
"worst texture 0.006043\n",
"mean texture 0.004531\n",
"radius error 0.003888\n",
"worst symmetry 0.003170\n",
"symmetry error 0.002774\n",
"concavity error 0.002694\n",
"worst compactness 0.002545\n",
"compactness error 0.002138\n",
"mean fractal dimension 0.002114\n",
"perimeter error 0.001663\n",
"concave points error 0.001098\n",
"mean compactness 0.000856\n",
"smoothness error 0.000000\n",
"texture error 0.000000\n",
"mean symmetry 0.000000"
]
},
"execution_count": 17,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#display the dataframe you created in the previous cell (this cell expects it to be named feature_importances). Which questions do you think are important?\n",
"feature_importances"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"<h4>8- Let's build another model with fewer features!</h4>"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>worst perimeter</th>\n",
" <th>worst concave points</th>\n",
" <th>worst radius</th>\n",
" <th>mean concave points</th>\n",
" <th>worst concavity</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>184.60</td>\n",
" <td>0.2654</td>\n",
" <td>25.38</td>\n",
" <td>0.14710</td>\n",
" <td>0.7119</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>158.80</td>\n",
" <td>0.1860</td>\n",
" <td>24.99</td>\n",
" <td>0.07017</td>\n",
" <td>0.2416</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>152.50</td>\n",
" <td>0.2430</td>\n",
" <td>23.57</td>\n",
" <td>0.12790</td>\n",
" <td>0.4504</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>98.87</td>\n",
" <td>0.2575</td>\n",
" <td>14.91</td>\n",
" <td>0.10520</td>\n",
" <td>0.6869</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>152.20</td>\n",
" <td>0.1625</td>\n",
" <td>22.54</td>\n",
" <td>0.10430</td>\n",
" <td>0.4000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" worst perimeter worst concave points worst radius mean concave points \\\n",
"0 184.60 0.2654 25.38 0.14710 \n",
"1 158.80 0.1860 24.99 0.07017 \n",
"2 152.50 0.2430 23.57 0.12790 \n",
"3 98.87 0.2575 14.91 0.10520 \n",
"4 152.20 0.1625 22.54 0.10430 \n",
"\n",
" worst concavity \n",
"0 0.7119 \n",
"1 0.2416 \n",
"2 0.4504 \n",
"3 0.6869 \n",
"4 0.4000 "
]
},
"execution_count": 18,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#subset the questions we are interested in\n",
"X_reduced=X[['worst perimeter','worst concave points','worst radius','mean concave points','worst concavity']] #define your features\n",
"Y=pd.Series(cancer.target) #define the target\n",
"X_reduced.head()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"#split into train and test\n",
"x_train,x_test,y_train,y_test = train_test_split(X_reduced,Y,test_size=0.3,random_state=42)\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.9623115577889447"
]
},
"execution_count": 20,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#train a new model!\n",
"rf_model = RandomForestClassifier(max_depth=3,n_estimators=15) #define the model\n",
"rf_model.fit(x_train, y_train) #fit the model (train)\n",
"rf_model.score(x_train,y_train) #predict on new observations\n",
"\n",
"#what is the accuracy of this model?"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/sklearn/externals/joblib/__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n",
" warnings.warn(msg, category=DeprecationWarning)\n"
]
},
{
"data": {
"text/plain": [
"['cancer_classifier.pkl']"
]
},
"execution_count": 21,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#save the model!\n",
"from sklearn.externals import joblib\n",
"\n",
"joblib.dump(rf_model, \"cancer_classifier.pkl\") #save the whole model into a file to be used later\n",
"\n",
"#to load the model next time we just need to do:\n",
"#classifer = joblib.load(\"model.pkl\")\n",
"#classifer.predict(newobs)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"<center><h3>Congratulations! You have built your first classifier!</h3></center>\n",
"<center><h5>www.thecodinghive.com</h5></center>"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (system-wide)",
"language": "python",
"metadata": {
"cocalc": {
"description": "Python 3 programming language",
"priority": 100,
"url": "https://www.python.org/"
}
},
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment