Skip to content

Instantly share code, notes, and snippets.

@lbourbon
Last active November 26, 2018 18:31
Show Gist options
  • Save lbourbon/928905e70b4837f78af8fe20f5aa2693 to your computer and use it in GitHub Desktop.
Save lbourbon/928905e70b4837f78af8fe20f5aa2693 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# importa a biblioteca pandas\n",
"import pandas as pd\n",
"from sklearn.datasets import load_breast_cancer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['feature_names', 'target_names', 'DESCR', 'data', 'target'])"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# cria um objeto df ao qual é atribuído o dataframe com os dados de câncer de mama disponível na própria biblioteca Sklearn\n",
"cancer = load_breast_cancer()\n",
"# o objeto criado é um dicionário, com as seguintes chaves:\n",
"cancer.keys()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# cria o dataframe, usando as chaves do dicionário para pegar os dados e os nomes das colunas\n",
"df = pd.DataFrame(cancer['data'], columns=cancer['feature_names'])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>mean radius</th>\n",
" <th>mean texture</th>\n",
" <th>mean perimeter</th>\n",
" <th>mean area</th>\n",
" <th>mean smoothness</th>\n",
" <th>mean compactness</th>\n",
" <th>mean concavity</th>\n",
" <th>mean concave points</th>\n",
" <th>mean symmetry</th>\n",
" <th>mean fractal dimension</th>\n",
" <th>...</th>\n",
" <th>worst radius</th>\n",
" <th>worst texture</th>\n",
" <th>worst perimeter</th>\n",
" <th>worst area</th>\n",
" <th>worst smoothness</th>\n",
" <th>worst compactness</th>\n",
" <th>worst concavity</th>\n",
" <th>worst concave points</th>\n",
" <th>worst symmetry</th>\n",
" <th>worst fractal dimension</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>17.99</td>\n",
" <td>10.38</td>\n",
" <td>122.80</td>\n",
" <td>1001.0</td>\n",
" <td>0.11840</td>\n",
" <td>0.27760</td>\n",
" <td>0.3001</td>\n",
" <td>0.14710</td>\n",
" <td>0.2419</td>\n",
" <td>0.07871</td>\n",
" <td>...</td>\n",
" <td>25.38</td>\n",
" <td>17.33</td>\n",
" <td>184.60</td>\n",
" <td>2019.0</td>\n",
" <td>0.1622</td>\n",
" <td>0.6656</td>\n",
" <td>0.7119</td>\n",
" <td>0.2654</td>\n",
" <td>0.4601</td>\n",
" <td>0.11890</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>20.57</td>\n",
" <td>17.77</td>\n",
" <td>132.90</td>\n",
" <td>1326.0</td>\n",
" <td>0.08474</td>\n",
" <td>0.07864</td>\n",
" <td>0.0869</td>\n",
" <td>0.07017</td>\n",
" <td>0.1812</td>\n",
" <td>0.05667</td>\n",
" <td>...</td>\n",
" <td>24.99</td>\n",
" <td>23.41</td>\n",
" <td>158.80</td>\n",
" <td>1956.0</td>\n",
" <td>0.1238</td>\n",
" <td>0.1866</td>\n",
" <td>0.2416</td>\n",
" <td>0.1860</td>\n",
" <td>0.2750</td>\n",
" <td>0.08902</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>19.69</td>\n",
" <td>21.25</td>\n",
" <td>130.00</td>\n",
" <td>1203.0</td>\n",
" <td>0.10960</td>\n",
" <td>0.15990</td>\n",
" <td>0.1974</td>\n",
" <td>0.12790</td>\n",
" <td>0.2069</td>\n",
" <td>0.05999</td>\n",
" <td>...</td>\n",
" <td>23.57</td>\n",
" <td>25.53</td>\n",
" <td>152.50</td>\n",
" <td>1709.0</td>\n",
" <td>0.1444</td>\n",
" <td>0.4245</td>\n",
" <td>0.4504</td>\n",
" <td>0.2430</td>\n",
" <td>0.3613</td>\n",
" <td>0.08758</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>11.42</td>\n",
" <td>20.38</td>\n",
" <td>77.58</td>\n",
" <td>386.1</td>\n",
" <td>0.14250</td>\n",
" <td>0.28390</td>\n",
" <td>0.2414</td>\n",
" <td>0.10520</td>\n",
" <td>0.2597</td>\n",
" <td>0.09744</td>\n",
" <td>...</td>\n",
" <td>14.91</td>\n",
" <td>26.50</td>\n",
" <td>98.87</td>\n",
" <td>567.7</td>\n",
" <td>0.2098</td>\n",
" <td>0.8663</td>\n",
" <td>0.6869</td>\n",
" <td>0.2575</td>\n",
" <td>0.6638</td>\n",
" <td>0.17300</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>20.29</td>\n",
" <td>14.34</td>\n",
" <td>135.10</td>\n",
" <td>1297.0</td>\n",
" <td>0.10030</td>\n",
" <td>0.13280</td>\n",
" <td>0.1980</td>\n",
" <td>0.10430</td>\n",
" <td>0.1809</td>\n",
" <td>0.05883</td>\n",
" <td>...</td>\n",
" <td>22.54</td>\n",
" <td>16.67</td>\n",
" <td>152.20</td>\n",
" <td>1575.0</td>\n",
" <td>0.1374</td>\n",
" <td>0.2050</td>\n",
" <td>0.4000</td>\n",
" <td>0.1625</td>\n",
" <td>0.2364</td>\n",
" <td>0.07678</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 30 columns</p>\n",
"</div>"
],
"text/plain": [
" mean radius mean texture mean perimeter mean area mean smoothness \\\n",
"0 17.99 10.38 122.80 1001.0 0.11840 \n",
"1 20.57 17.77 132.90 1326.0 0.08474 \n",
"2 19.69 21.25 130.00 1203.0 0.10960 \n",
"3 11.42 20.38 77.58 386.1 0.14250 \n",
"4 20.29 14.34 135.10 1297.0 0.10030 \n",
"\n",
" mean compactness mean concavity mean concave points mean symmetry \\\n",
"0 0.27760 0.3001 0.14710 0.2419 \n",
"1 0.07864 0.0869 0.07017 0.1812 \n",
"2 0.15990 0.1974 0.12790 0.2069 \n",
"3 0.28390 0.2414 0.10520 0.2597 \n",
"4 0.13280 0.1980 0.10430 0.1809 \n",
"\n",
" mean fractal dimension ... worst radius \\\n",
"0 0.07871 ... 25.38 \n",
"1 0.05667 ... 24.99 \n",
"2 0.05999 ... 23.57 \n",
"3 0.09744 ... 14.91 \n",
"4 0.05883 ... 22.54 \n",
"\n",
" worst texture worst perimeter worst area worst smoothness \\\n",
"0 17.33 184.60 2019.0 0.1622 \n",
"1 23.41 158.80 1956.0 0.1238 \n",
"2 25.53 152.50 1709.0 0.1444 \n",
"3 26.50 98.87 567.7 0.2098 \n",
"4 16.67 152.20 1575.0 0.1374 \n",
"\n",
" worst compactness worst concavity worst concave points worst symmetry \\\n",
"0 0.6656 0.7119 0.2654 0.4601 \n",
"1 0.1866 0.2416 0.1860 0.2750 \n",
"2 0.4245 0.4504 0.2430 0.3613 \n",
"3 0.8663 0.6869 0.2575 0.6638 \n",
"4 0.2050 0.4000 0.1625 0.2364 \n",
"\n",
" worst fractal dimension \n",
"0 0.11890 \n",
"1 0.08902 \n",
"2 0.08758 \n",
"3 0.17300 \n",
"4 0.07678 \n",
"\n",
"[5 rows x 30 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# checa cabeçalho\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# importa o support vector classifier (SVC) do scikit-learn - que implementa o SVM\n",
"from sklearn.svm import SVC"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# divide o dataframe em X (atributos) e y (rótulo)\n",
"X = df\n",
"y = cancer['target']\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# divide os dados para treino (70%) e teste (30%)\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
" decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',\n",
" max_iter=-1, probability=False, random_state=None, shrinking=True,\n",
" tol=0.001, verbose=False)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# treino\n",
"model = SVC()\n",
"model.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# cria um objeto prev com as previsões realizadas pelo modelo\n",
"prev = model.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# importa as bibliotecas para avaliar a eficácia do modelo\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.metrics import confusion_matrix"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.00 0.00 0.00 63\n",
" 1 0.63 1.00 0.77 108\n",
"\n",
"avg / total 0.40 0.63 0.49 171\n",
"\n",
"[[ 0 63]\n",
" [ 0 108]]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\lbour\\Anaconda3\\envs\\k35\\lib\\site-packages\\sklearn\\metrics\\classification.py:1113: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
" 'precision', 'predicted', average, warn_for)\n"
]
}
],
"source": [
"# imprime os resultados obtidos \n",
"# aqui a gente observa que os resultados foram péssimos, isso aconteceu porque não escolhemos argumentos adequados\n",
"# para serem passados no nosso SVM, mas temos uma forma fácil de resolver isso\n",
"print(classification_report(y_test, prev))\n",
"print(confusion_matrix(y_test, prev))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# importa o GridSearchCV cuja função é tentar uma combinação de argumentos e escolher a melhor\n",
"from sklearn.model_selection import GridSearchCV"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# criamos um dicionário com os argumentos a serem testados\n",
"param_grid = {'C':[.1, 1, 10, 100, 1000], 'gamma':[1, .1, .01, .001, .0001]}"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# criamos um novo modelo, como se fosse algoritmo do sklearn e passamos o nosso dicionário\n",
"grid = GridSearchCV(SVC(), param_grid, verbose=1)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fitting 3 folds for each of 25 candidates, totalling 75 fits\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Parallel(n_jobs=1)]: Done 75 out of 75 | elapsed: 1.1s finished\n"
]
},
{
"data": {
"text/plain": [
"GridSearchCV(cv=None, error_score='raise',\n",
" estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
" decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',\n",
" max_iter=-1, probability=False, random_state=None, shrinking=True,\n",
" tol=0.001, verbose=False),\n",
" fit_params={}, iid=True, n_jobs=1,\n",
" param_grid={'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'C': [0.1, 1, 10, 100, 1000]},\n",
" pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n",
" scoring=None, verbose=1)"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# treinamos ele normalmente\n",
"grid.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'C': 10, 'gamma': 0.0001}"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# melhores parâmetros:\n",
"grid.best_params_"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# fazemos as previsões e...\n",
"grid_predictions = grid.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.98 0.89 0.93 63\n",
" 1 0.94 0.99 0.96 108\n",
"\n",
"avg / total 0.95 0.95 0.95 171\n",
"\n"
]
}
],
"source": [
"# 95% !!! Agora sim!\n",
"print(classification_report(y_test, grid_predictions))"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"### Acabamos de criar, em poucos minutos, uma aplicação capaz de diagnosticar a malignidade de um câncer de mama com 95% de acurária!"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [conda env:k35]",
"language": "python",
"name": "conda-env-k35-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment