Created
June 20, 2017 13:47
-
-
Save lbourbon/e05ad2a887ec520e3b427cc9c118f7d3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# importa nosso amigo Pandas\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# lê o arquivo csv com dados sobre empréstimo - essa é uma versão mais organizada e menor \n", | |
"# do dataset disponível em kaggle.com/wendykan/lending-club-loan-data\n", | |
"df = pd.read_csv('loan.csv')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>credit.policy</th>\n", | |
" <th>purpose</th>\n", | |
" <th>int.rate</th>\n", | |
" <th>installment</th>\n", | |
" <th>log.annual.inc</th>\n", | |
" <th>dti</th>\n", | |
" <th>fico</th>\n", | |
" <th>days.with.cr.line</th>\n", | |
" <th>revol.bal</th>\n", | |
" <th>revol.util</th>\n", | |
" <th>inq.last.6mths</th>\n", | |
" <th>delinq.2yrs</th>\n", | |
" <th>pub.rec</th>\n", | |
" <th>not.fully.paid</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>debt_consolidation</td>\n", | |
" <td>0.1189</td>\n", | |
" <td>829.10</td>\n", | |
" <td>11.350407</td>\n", | |
" <td>19.48</td>\n", | |
" <td>737</td>\n", | |
" <td>5639.958333</td>\n", | |
" <td>28854</td>\n", | |
" <td>52.1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>credit_card</td>\n", | |
" <td>0.1071</td>\n", | |
" <td>228.22</td>\n", | |
" <td>11.082143</td>\n", | |
" <td>14.29</td>\n", | |
" <td>707</td>\n", | |
" <td>2760.000000</td>\n", | |
" <td>33623</td>\n", | |
" <td>76.7</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>debt_consolidation</td>\n", | |
" <td>0.1357</td>\n", | |
" <td>366.86</td>\n", | |
" <td>10.373491</td>\n", | |
" <td>11.63</td>\n", | |
" <td>682</td>\n", | |
" <td>4710.000000</td>\n", | |
" <td>3511</td>\n", | |
" <td>25.6</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>debt_consolidation</td>\n", | |
" <td>0.1008</td>\n", | |
" <td>162.34</td>\n", | |
" <td>11.350407</td>\n", | |
" <td>8.10</td>\n", | |
" <td>712</td>\n", | |
" <td>2699.958333</td>\n", | |
" <td>33667</td>\n", | |
" <td>73.2</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>1</td>\n", | |
" <td>credit_card</td>\n", | |
" <td>0.1426</td>\n", | |
" <td>102.92</td>\n", | |
" <td>11.299732</td>\n", | |
" <td>14.97</td>\n", | |
" <td>667</td>\n", | |
" <td>4066.000000</td>\n", | |
" <td>4740</td>\n", | |
" <td>39.5</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" credit.policy purpose int.rate installment log.annual.inc \\\n", | |
"0 1 debt_consolidation 0.1189 829.10 11.350407 \n", | |
"1 1 credit_card 0.1071 228.22 11.082143 \n", | |
"2 1 debt_consolidation 0.1357 366.86 10.373491 \n", | |
"3 1 debt_consolidation 0.1008 162.34 11.350407 \n", | |
"4 1 credit_card 0.1426 102.92 11.299732 \n", | |
"\n", | |
" dti fico days.with.cr.line revol.bal revol.util inq.last.6mths \\\n", | |
"0 19.48 737 5639.958333 28854 52.1 0 \n", | |
"1 14.29 707 2760.000000 33623 76.7 0 \n", | |
"2 11.63 682 4710.000000 3511 25.6 1 \n", | |
"3 8.10 712 2699.958333 33667 73.2 1 \n", | |
"4 14.97 667 4066.000000 4740 39.5 0 \n", | |
"\n", | |
" delinq.2yrs pub.rec not.fully.paid \n", | |
"0 0 0 0 \n", | |
"1 0 0 0 \n", | |
"2 0 0 0 \n", | |
"3 0 0 0 \n", | |
"4 1 0 0 " | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# cabeçalho, perceba que a última coluna representa quem são os caloteiros (not.fully.paid == 1)\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"RangeIndex: 9578 entries, 0 to 9577\n", | |
"Data columns (total 19 columns):\n", | |
"credit.policy 9578 non-null int64\n", | |
"int.rate 9578 non-null float64\n", | |
"installment 9578 non-null float64\n", | |
"log.annual.inc 9578 non-null float64\n", | |
"dti 9578 non-null float64\n", | |
"fico 9578 non-null int64\n", | |
"days.with.cr.line 9578 non-null float64\n", | |
"revol.bal 9578 non-null int64\n", | |
"revol.util 9578 non-null float64\n", | |
"inq.last.6mths 9578 non-null int64\n", | |
"delinq.2yrs 9578 non-null int64\n", | |
"pub.rec 9578 non-null int64\n", | |
"not.fully.paid 9578 non-null int64\n", | |
"purpose_credit_card 9578 non-null uint8\n", | |
"purpose_debt_consolidation 9578 non-null uint8\n", | |
"purpose_educational 9578 non-null uint8\n", | |
"purpose_home_improvement 9578 non-null uint8\n", | |
"purpose_major_purchase 9578 non-null uint8\n", | |
"purpose_small_business 9578 non-null uint8\n", | |
"dtypes: float64(6), int64(7), uint8(6)\n", | |
"memory usage: 1.0 MB\n" | |
] | |
} | |
], | |
"source": [ | |
"# como a coluna 'purpose' (propósito) contém strings, precisamos convertê-la em colunas numéricas indicadoras (variáveis dummy) com a função get_dummies do pandas; drop_first=True descarta a primeira categoria\n", | |
"mudar_colunas = ['purpose']\n", | |
"final_df = pd.get_dummies(df, columns = mudar_colunas, drop_first=True)\n", | |
"final_df.info()\n", | |
"# perceba que novas colunas foram criadas contendo informação se cada linha pertence ou não a um determinado tipo de propósito" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.model_selection import train_test_split" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# divide o dataset em treino e teste\n", | |
"X = final_df.drop('not.fully.paid', axis=1)\n", | |
"y = final_df['not.fully.paid']\n", | |
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n", | |
" max_features=None, max_leaf_nodes=None,\n", | |
" min_impurity_split=1e-07, min_samples_leaf=1,\n", | |
" min_samples_split=2, min_weight_fraction_leaf=0.0,\n", | |
" presort=False, random_state=None, splitter='best')" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#treina o modelo usando o algoritmo Árvore de Decisão do sklearn\n", | |
"from sklearn.tree import DecisionTreeClassifier\n", | |
"dtc = DecisionTreeClassifier()\n", | |
"dtc.fit(X_train, y_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# previsões\n", | |
"prev = dtc.predict(X_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.metrics import accuracy_score" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.7299930410577592" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# conseguimos uma acurácia em torno de 73% em identificar quem vai dar calote ou não\n", | |
"accuracy_score(y_test, prev)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# vamos ver como a gente se sai usando Floresta Aleatória\n", | |
"from sklearn.ensemble import RandomForestClassifier" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# n_estimators é o número de árvores na floresta\n", | |
"rfc = RandomForestClassifier(n_estimators=250)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# mesmo processo de sempre\n", | |
"rfc.fit(X_train, y_train)\n", | |
"prev2 = rfc.predict(X_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.8458594293667363" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# dessa vez conseguimos cerca de 84% de acurácia\n", | |
"accuracy_score(y_test, prev2)" | |
] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "Python [conda env:k35]", | |
"language": "python", | |
"name": "conda-env-k35-py" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment