lbourbon/SVM.ipynb

## SVM.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# importa a biblioteca pandas\n",
    "import pandas as pd\n",
    "from sklearn.datasets import load_breast_cancer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['feature_names', 'target_names', 'DESCR', 'data', 'target'])"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# cria um objeto df ao qual é atribuído o dataframe com os dados de câncer de mama disponível na própria biblioteca Sklearn\n",
    "cancer = load_breast_cancer()\n",
    "# o objeto criado é um dicionário, com as seguintes chaves:\n",
    "cancer.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# cria o dataframe, usando as chaves do dicionário para pegar os dados e os nomes das colunas\n",
    "df = pd.DataFrame(cancer['data'], columns=cancer['feature_names'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mean radius</th>\n",
       "      <th>mean texture</th>\n",
       "      <th>mean perimeter</th>\n",
       "      <th>mean area</th>\n",
       "      <th>mean smoothness</th>\n",
       "      <th>mean compactness</th>\n",
       "      <th>mean concavity</th>\n",
       "      <th>mean concave points</th>\n",
       "      <th>mean symmetry</th>\n",
       "      <th>mean fractal dimension</th>\n",
       "      <th>...</th>\n",
       "      <th>worst radius</th>\n",
       "      <th>worst texture</th>\n",
       "      <th>worst perimeter</th>\n",
       "      <th>worst area</th>\n",
       "      <th>worst smoothness</th>\n",
       "      <th>worst compactness</th>\n",
       "      <th>worst concavity</th>\n",
       "      <th>worst concave points</th>\n",
       "      <th>worst symmetry</th>\n",
       "      <th>worst fractal dimension</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>17.99</td>\n",
       "      <td>10.38</td>\n",
       "      <td>122.80</td>\n",
       "      <td>1001.0</td>\n",
       "      <td>0.11840</td>\n",
       "      <td>0.27760</td>\n",
       "      <td>0.3001</td>\n",
       "      <td>0.14710</td>\n",
       "      <td>0.2419</td>\n",
       "      <td>0.07871</td>\n",
       "      <td>...</td>\n",
       "      <td>25.38</td>\n",
       "      <td>17.33</td>\n",
       "      <td>184.60</td>\n",
       "      <td>2019.0</td>\n",
       "      <td>0.1622</td>\n",
       "      <td>0.6656</td>\n",
       "      <td>0.7119</td>\n",
       "      <td>0.2654</td>\n",
       "      <td>0.4601</td>\n",
       "      <td>0.11890</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>20.57</td>\n",
       "      <td>17.77</td>\n",
       "      <td>132.90</td>\n",
       "      <td>1326.0</td>\n",
       "      <td>0.08474</td>\n",
       "      <td>0.07864</td>\n",
       "      <td>0.0869</td>\n",
       "      <td>0.07017</td>\n",
       "      <td>0.1812</td>\n",
       "      <td>0.05667</td>\n",
       "      <td>...</td>\n",
       "      <td>24.99</td>\n",
       "      <td>23.41</td>\n",
       "      <td>158.80</td>\n",
       "      <td>1956.0</td>\n",
       "      <td>0.1238</td>\n",
       "      <td>0.1866</td>\n",
       "      <td>0.2416</td>\n",
       "      <td>0.1860</td>\n",
       "      <td>0.2750</td>\n",
       "      <td>0.08902</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>19.69</td>\n",
       "      <td>21.25</td>\n",
       "      <td>130.00</td>\n",
       "      <td>1203.0</td>\n",
       "      <td>0.10960</td>\n",
       "      <td>0.15990</td>\n",
       "      <td>0.1974</td>\n",
       "      <td>0.12790</td>\n",
       "      <td>0.2069</td>\n",
       "      <td>0.05999</td>\n",
       "      <td>...</td>\n",
       "      <td>23.57</td>\n",
       "      <td>25.53</td>\n",
       "      <td>152.50</td>\n",
       "      <td>1709.0</td>\n",
       "      <td>0.1444</td>\n",
       "      <td>0.4245</td>\n",
       "      <td>0.4504</td>\n",
       "      <td>0.2430</td>\n",
       "      <td>0.3613</td>\n",
       "      <td>0.08758</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>11.42</td>\n",
       "      <td>20.38</td>\n",
       "      <td>77.58</td>\n",
       "      <td>386.1</td>\n",
       "      <td>0.14250</td>\n",
       "      <td>0.28390</td>\n",
       "      <td>0.2414</td>\n",
       "      <td>0.10520</td>\n",
       "      <td>0.2597</td>\n",
       "      <td>0.09744</td>\n",
       "      <td>...</td>\n",
       "      <td>14.91</td>\n",
       "      <td>26.50</td>\n",
       "      <td>98.87</td>\n",
       "      <td>567.7</td>\n",
       "      <td>0.2098</td>\n",
       "      <td>0.8663</td>\n",
       "      <td>0.6869</td>\n",
       "      <td>0.2575</td>\n",
       "      <td>0.6638</td>\n",
       "      <td>0.17300</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>20.29</td>\n",
       "      <td>14.34</td>\n",
       "      <td>135.10</td>\n",
       "      <td>1297.0</td>\n",
       "      <td>0.10030</td>\n",
       "      <td>0.13280</td>\n",
       "      <td>0.1980</td>\n",
       "      <td>0.10430</td>\n",
       "      <td>0.1809</td>\n",
       "      <td>0.05883</td>\n",
       "      <td>...</td>\n",
       "      <td>22.54</td>\n",
       "      <td>16.67</td>\n",
       "      <td>152.20</td>\n",
       "      <td>1575.0</td>\n",
       "      <td>0.1374</td>\n",
       "      <td>0.2050</td>\n",
       "      <td>0.4000</td>\n",
       "      <td>0.1625</td>\n",
       "      <td>0.2364</td>\n",
       "      <td>0.07678</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 30 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \\\n",
       "0        17.99         10.38          122.80     1001.0          0.11840   \n",
       "1        20.57         17.77          132.90     1326.0          0.08474   \n",
       "2        19.69         21.25          130.00     1203.0          0.10960   \n",
       "3        11.42         20.38           77.58      386.1          0.14250   \n",
       "4        20.29         14.34          135.10     1297.0          0.10030   \n",
       "\n",
       "   mean compactness  mean concavity  mean concave points  mean symmetry  \\\n",
       "0           0.27760          0.3001              0.14710         0.2419   \n",
       "1           0.07864          0.0869              0.07017         0.1812   \n",
       "2           0.15990          0.1974              0.12790         0.2069   \n",
       "3           0.28390          0.2414              0.10520         0.2597   \n",
       "4           0.13280          0.1980              0.10430         0.1809   \n",
       "\n",
       "   mean fractal dimension           ...             worst radius  \\\n",
       "0                 0.07871           ...                    25.38   \n",
       "1                 0.05667           ...                    24.99   \n",
       "2                 0.05999           ...                    23.57   \n",
       "3                 0.09744           ...                    14.91   \n",
       "4                 0.05883           ...                    22.54   \n",
       "\n",
       "   worst texture  worst perimeter  worst area  worst smoothness  \\\n",
       "0          17.33           184.60      2019.0            0.1622   \n",
       "1          23.41           158.80      1956.0            0.1238   \n",
       "2          25.53           152.50      1709.0            0.1444   \n",
       "3          26.50            98.87       567.7            0.2098   \n",
       "4          16.67           152.20      1575.0            0.1374   \n",
       "\n",
       "   worst compactness  worst concavity  worst concave points  worst symmetry  \\\n",
       "0             0.6656           0.7119                0.2654          0.4601   \n",
       "1             0.1866           0.2416                0.1860          0.2750   \n",
       "2             0.4245           0.4504                0.2430          0.3613   \n",
       "3             0.8663           0.6869                0.2575          0.6638   \n",
       "4             0.2050           0.4000                0.1625          0.2364   \n",
       "\n",
       "   worst fractal dimension  \n",
       "0                  0.11890  \n",
       "1                  0.08902  \n",
       "2                  0.08758  \n",
       "3                  0.17300  \n",
       "4                  0.07678  \n",
       "\n",
       "[5 rows x 30 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# checa cabeçalho\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# importa o support vector classifier (SVC) do scikit-learn - que implementa o SVM\n",
    "from sklearn.svm import SVC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# divide o dataframe em X (atributos) e y (rótulo)\n",
    "X = df\n",
    "y = cancer['target']\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# divide os dados para treino (70%) e teste (30%)\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
       "  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',\n",
       "  max_iter=-1, probability=False, random_state=None, shrinking=True,\n",
       "  tol=0.001, verbose=False)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# treino\n",
    "model = SVC()\n",
    "model.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# cria um objeto prev com as previsões realizadas pelo modelo\n",
    "prev = model.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# importa as bibliotecas para avaliar a eficácia do modelo\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.metrics import confusion_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.00      0.00      0.00        63\n",
      "          1       0.63      1.00      0.77       108\n",
      "\n",
      "avg / total       0.40      0.63      0.49       171\n",
      "\n",
      "[[  0  63]\n",
      " [  0 108]]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\lbour\\Anaconda3\\envs\\k35\\lib\\site-packages\\sklearn\\metrics\\classification.py:1113: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    }
   ],
   "source": [
    "# imprime os resultados obtidos \n",
    "# aqui a gente observa que os resultados foram péssimos, isso aconteceu porque não escolhemos argumentos adequados\n",
    "# para serem passados no nosso SVM, mas temos uma forma fácil de resolver isso\n",
    "print(classification_report(y_test, prev))\n",
    "print(confusion_matrix(y_test, prev))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# importa o GridSearchCV cuja função é tentar uma combinação de argumentos e escolher a melhor\n",
    "from sklearn.model_selection import GridSearchCV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# criamos um dicionário com os argumentos a serem testados\n",
    "param_grid = {'C':[.1, 1, 10, 100, 1000], 'gamma':[1, .1, .01, .001, .0001]}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# criamos um novo modelo, como se fosse algoritmo do sklearn e passamos o nosso dicionário\n",
    "grid = GridSearchCV(SVC(), param_grid, verbose=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 3 folds for each of 25 candidates, totalling 75 fits\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    1.1s finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=None, error_score='raise',\n",
       "       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
       "  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',\n",
       "  max_iter=-1, probability=False, random_state=None, shrinking=True,\n",
       "  tol=0.001, verbose=False),\n",
       "       fit_params={}, iid=True, n_jobs=1,\n",
       "       param_grid={'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'C': [0.1, 1, 10, 100, 1000]},\n",
       "       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n",
       "       scoring=None, verbose=1)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# treinamos ele normalmente\n",
    "grid.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'C': 10, 'gamma': 0.0001}"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# melhores parâmetros:\n",
    "grid.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# fazemos as previsões e...\n",
    "grid_predictions = grid.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.98      0.89      0.93        63\n",
      "          1       0.94      0.99      0.96       108\n",
      "\n",
      "avg / total       0.95      0.95      0.95       171\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# 95% !!! Agora sim!\n",
    "print(classification_report(y_test, grid_predictions))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "###     Acabamos de criar, em poucos minutos, uma aplicação capaz de diagnosticar a malignidade de um câncer de mama com 95% de acurária!"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda env:k35]",
   "language": "python",
   "name": "conda-env-k35-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# importa a biblioteca pandas\n",
	"import pandas as pd\n",
	"from sklearn.datasets import load_breast_cancer"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"dict_keys(['feature_names', 'target_names', 'DESCR', 'data', 'target'])"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# cria um objeto df ao qual é atribuído o dataframe com os dados de câncer de mama disponível na própria biblioteca Sklearn\n",
	"cancer = load_breast_cancer()\n",
	"# o objeto criado é um dicionário, com as seguintes chaves:\n",
	"cancer.keys()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# cria o dataframe, usando as chaves do dicionário para pegar os dados e os nomes das colunas\n",
	"df = pd.DataFrame(cancer['data'], columns=cancer['feature_names'])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>mean radius</th>\n",
	" <th>mean texture</th>\n",
	" <th>mean perimeter</th>\n",
	" <th>mean area</th>\n",
	" <th>mean smoothness</th>\n",
	" <th>mean compactness</th>\n",
	" <th>mean concavity</th>\n",
	" <th>mean concave points</th>\n",
	" <th>mean symmetry</th>\n",
	" <th>mean fractal dimension</th>\n",
	" <th>...</th>\n",
	" <th>worst radius</th>\n",
	" <th>worst texture</th>\n",
	" <th>worst perimeter</th>\n",
	" <th>worst area</th>\n",
	" <th>worst smoothness</th>\n",
	" <th>worst compactness</th>\n",
	" <th>worst concavity</th>\n",
	" <th>worst concave points</th>\n",
	" <th>worst symmetry</th>\n",
	" <th>worst fractal dimension</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>17.99</td>\n",
	" <td>10.38</td>\n",
	" <td>122.80</td>\n",
	" <td>1001.0</td>\n",
	" <td>0.11840</td>\n",
	" <td>0.27760</td>\n",
	" <td>0.3001</td>\n",
	" <td>0.14710</td>\n",
	" <td>0.2419</td>\n",
	" <td>0.07871</td>\n",
	" <td>...</td>\n",
	" <td>25.38</td>\n",
	" <td>17.33</td>\n",
	" <td>184.60</td>\n",
	" <td>2019.0</td>\n",
	" <td>0.1622</td>\n",
	" <td>0.6656</td>\n",
	" <td>0.7119</td>\n",
	" <td>0.2654</td>\n",
	" <td>0.4601</td>\n",
	" <td>0.11890</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>20.57</td>\n",
	" <td>17.77</td>\n",
	" <td>132.90</td>\n",
	" <td>1326.0</td>\n",
	" <td>0.08474</td>\n",
	" <td>0.07864</td>\n",
	" <td>0.0869</td>\n",
	" <td>0.07017</td>\n",
	" <td>0.1812</td>\n",
	" <td>0.05667</td>\n",
	" <td>...</td>\n",
	" <td>24.99</td>\n",
	" <td>23.41</td>\n",
	" <td>158.80</td>\n",
	" <td>1956.0</td>\n",
	" <td>0.1238</td>\n",
	" <td>0.1866</td>\n",
	" <td>0.2416</td>\n",
	" <td>0.1860</td>\n",
	" <td>0.2750</td>\n",
	" <td>0.08902</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>19.69</td>\n",
	" <td>21.25</td>\n",
	" <td>130.00</td>\n",
	" <td>1203.0</td>\n",
	" <td>0.10960</td>\n",
	" <td>0.15990</td>\n",
	" <td>0.1974</td>\n",
	" <td>0.12790</td>\n",
	" <td>0.2069</td>\n",
	" <td>0.05999</td>\n",
	" <td>...</td>\n",
	" <td>23.57</td>\n",
	" <td>25.53</td>\n",
	" <td>152.50</td>\n",
	" <td>1709.0</td>\n",
	" <td>0.1444</td>\n",
	" <td>0.4245</td>\n",
	" <td>0.4504</td>\n",
	" <td>0.2430</td>\n",
	" <td>0.3613</td>\n",
	" <td>0.08758</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>11.42</td>\n",
	" <td>20.38</td>\n",
	" <td>77.58</td>\n",
	" <td>386.1</td>\n",
	" <td>0.14250</td>\n",
	" <td>0.28390</td>\n",
	" <td>0.2414</td>\n",
	" <td>0.10520</td>\n",
	" <td>0.2597</td>\n",
	" <td>0.09744</td>\n",
	" <td>...</td>\n",
	" <td>14.91</td>\n",
	" <td>26.50</td>\n",
	" <td>98.87</td>\n",
	" <td>567.7</td>\n",
	" <td>0.2098</td>\n",
	" <td>0.8663</td>\n",
	" <td>0.6869</td>\n",
	" <td>0.2575</td>\n",
	" <td>0.6638</td>\n",
	" <td>0.17300</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>20.29</td>\n",
	" <td>14.34</td>\n",
	" <td>135.10</td>\n",
	" <td>1297.0</td>\n",
	" <td>0.10030</td>\n",
	" <td>0.13280</td>\n",
	" <td>0.1980</td>\n",
	" <td>0.10430</td>\n",
	" <td>0.1809</td>\n",
	" <td>0.05883</td>\n",
	" <td>...</td>\n",
	" <td>22.54</td>\n",
	" <td>16.67</td>\n",
	" <td>152.20</td>\n",
	" <td>1575.0</td>\n",
	" <td>0.1374</td>\n",
	" <td>0.2050</td>\n",
	" <td>0.4000</td>\n",
	" <td>0.1625</td>\n",
	" <td>0.2364</td>\n",
	" <td>0.07678</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>5 rows × 30 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" mean radius mean texture mean perimeter mean area mean smoothness \\\n",
	"0 17.99 10.38 122.80 1001.0 0.11840 \n",
	"1 20.57 17.77 132.90 1326.0 0.08474 \n",
	"2 19.69 21.25 130.00 1203.0 0.10960 \n",
	"3 11.42 20.38 77.58 386.1 0.14250 \n",
	"4 20.29 14.34 135.10 1297.0 0.10030 \n",
	"\n",
	" mean compactness mean concavity mean concave points mean symmetry \\\n",
	"0 0.27760 0.3001 0.14710 0.2419 \n",
	"1 0.07864 0.0869 0.07017 0.1812 \n",
	"2 0.15990 0.1974 0.12790 0.2069 \n",
	"3 0.28390 0.2414 0.10520 0.2597 \n",
	"4 0.13280 0.1980 0.10430 0.1809 \n",
	"\n",
	" mean fractal dimension ... worst radius \\\n",
	"0 0.07871 ... 25.38 \n",
	"1 0.05667 ... 24.99 \n",
	"2 0.05999 ... 23.57 \n",
	"3 0.09744 ... 14.91 \n",
	"4 0.05883 ... 22.54 \n",
	"\n",
	" worst texture worst perimeter worst area worst smoothness \\\n",
	"0 17.33 184.60 2019.0 0.1622 \n",
	"1 23.41 158.80 1956.0 0.1238 \n",
	"2 25.53 152.50 1709.0 0.1444 \n",
	"3 26.50 98.87 567.7 0.2098 \n",
	"4 16.67 152.20 1575.0 0.1374 \n",
	"\n",
	" worst compactness worst concavity worst concave points worst symmetry \\\n",
	"0 0.6656 0.7119 0.2654 0.4601 \n",
	"1 0.1866 0.2416 0.1860 0.2750 \n",
	"2 0.4245 0.4504 0.2430 0.3613 \n",
	"3 0.8663 0.6869 0.2575 0.6638 \n",
	"4 0.2050 0.4000 0.1625 0.2364 \n",
	"\n",
	" worst fractal dimension \n",
	"0 0.11890 \n",
	"1 0.08902 \n",
	"2 0.08758 \n",
	"3 0.17300 \n",
	"4 0.07678 \n",
	"\n",
	"[5 rows x 30 columns]"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# checa cabeçalho\n",
	"df.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# importa o support vector classifier (SVC) do scikit-learn - que implementa o SVM\n",
	"from sklearn.svm import SVC"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# divide o dataframe em X (atributos) e y (rótulo)\n",
	"X = df\n",
	"y = cancer['target']\n",
	"\n",
	"from sklearn.model_selection import train_test_split\n",
	"\n",
	"# divide os dados para treino (70%) e teste (30%)\n",
	"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
	" decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',\n",
	" max_iter=-1, probability=False, random_state=None, shrinking=True,\n",
	" tol=0.001, verbose=False)"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# treino\n",
	"model = SVC()\n",
	"model.fit(X_train, y_train)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# cria um objeto prev com as previsões realizadas pelo modelo\n",
	"prev = model.predict(X_test)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# importa as bibliotecas para avaliar a eficácia do modelo\n",
	"from sklearn.metrics import classification_report\n",
	"from sklearn.metrics import confusion_matrix"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" precision recall f1-score support\n",
	"\n",
	" 0 0.00 0.00 0.00 63\n",
	" 1 0.63 1.00 0.77 108\n",
	"\n",
	"avg / total 0.40 0.63 0.49 171\n",
	"\n",
	"[[ 0 63]\n",
	" [ 0 108]]\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"C:\\Users\\lbour\\Anaconda3\\envs\\k35\\lib\\site-packages\\sklearn\\metrics\\classification.py:1113: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
	" 'precision', 'predicted', average, warn_for)\n"
	]
	}
	],
	"source": [
	"# imprime os resultados obtidos \n",
	"# aqui a gente observa que os resultados foram péssimos, isso aconteceu porque não escolhemos argumentos adequados\n",
	"# para serem passados no nosso SVM, mas temos uma forma fácil de resolver isso\n",
	"print(classification_report(y_test, prev))\n",
	"print(confusion_matrix(y_test, prev))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# importa o GridSearchCV cuja função é tentar uma combinação de argumentos e escolher a melhor\n",
	"from sklearn.model_selection import GridSearchCV"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# criamos um dicionário com os argumentos a serem testados\n",
	"param_grid = {'C':[.1, 1, 10, 100, 1000], 'gamma':[1, .1, .01, .001, .0001]}"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# criamos um novo modelo, como se fosse algoritmo do sklearn e passamos o nosso dicionário\n",
	"grid = GridSearchCV(SVC(), param_grid, verbose=1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Fitting 3 folds for each of 25 candidates, totalling 75 fits\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"[Parallel(n_jobs=1)]: Done 75 out of 75 \| elapsed: 1.1s finished\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"GridSearchCV(cv=None, error_score='raise',\n",
	" estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
	" decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',\n",
	" max_iter=-1, probability=False, random_state=None, shrinking=True,\n",
	" tol=0.001, verbose=False),\n",
	" fit_params={}, iid=True, n_jobs=1,\n",
	" param_grid={'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'C': [0.1, 1, 10, 100, 1000]},\n",
	" pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n",
	" scoring=None, verbose=1)"
	]
	},
	"execution_count": 20,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# treinamos ele normalmente\n",
	"grid.fit(X_train, y_train)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'C': 10, 'gamma': 0.0001}"
	]
	},
	"execution_count": 21,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# melhores parâmetros:\n",
	"grid.best_params_"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# fazemos as previsões e...\n",
	"grid_predictions = grid.predict(X_test)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {
	"collapsed": false,
	"scrolled": true
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" precision recall f1-score support\n",
	"\n",
	" 0 0.98 0.89 0.93 63\n",
	" 1 0.94 0.99 0.96 108\n",
	"\n",
	"avg / total 0.95 0.95 0.95 171\n",
	"\n"
	]
	}
	],
	"source": [
	"# 95% !!! Agora sim!\n",
	"print(classification_report(y_test, grid_predictions))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true
	},
	"source": [
	"### Acabamos de criar, em poucos minutos, uma aplicação capaz de diagnosticar a malignidade de um câncer de mama com 95% de acurária!"
	]
	}
	],
	"metadata": {
	"anaconda-cloud": {},
	"kernelspec": {
	"display_name": "Python [conda env:k35]",
	"language": "python",
	"name": "conda-env-k35-py"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 1
	}