Skip to content

Instantly share code, notes, and snippets.

@lbourbon
Created June 20, 2017 13:47
Show Gist options
  • Save lbourbon/e05ad2a887ec520e3b427cc9c118f7d3 to your computer and use it in GitHub Desktop.
Save lbourbon/e05ad2a887ec520e3b427cc9c118f7d3 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# importa nosso amigo Pandas\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# read the loan CSV - a smaller, tidied-up version\n",
"# of the dataset available at kaggle.com/wendykan/lending-club-loan-data\n",
"df = pd.read_csv('loan.csv')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>credit.policy</th>\n",
" <th>purpose</th>\n",
" <th>int.rate</th>\n",
" <th>installment</th>\n",
" <th>log.annual.inc</th>\n",
" <th>dti</th>\n",
" <th>fico</th>\n",
" <th>days.with.cr.line</th>\n",
" <th>revol.bal</th>\n",
" <th>revol.util</th>\n",
" <th>inq.last.6mths</th>\n",
" <th>delinq.2yrs</th>\n",
" <th>pub.rec</th>\n",
" <th>not.fully.paid</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>debt_consolidation</td>\n",
" <td>0.1189</td>\n",
" <td>829.10</td>\n",
" <td>11.350407</td>\n",
" <td>19.48</td>\n",
" <td>737</td>\n",
" <td>5639.958333</td>\n",
" <td>28854</td>\n",
" <td>52.1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>credit_card</td>\n",
" <td>0.1071</td>\n",
" <td>228.22</td>\n",
" <td>11.082143</td>\n",
" <td>14.29</td>\n",
" <td>707</td>\n",
" <td>2760.000000</td>\n",
" <td>33623</td>\n",
" <td>76.7</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>debt_consolidation</td>\n",
" <td>0.1357</td>\n",
" <td>366.86</td>\n",
" <td>10.373491</td>\n",
" <td>11.63</td>\n",
" <td>682</td>\n",
" <td>4710.000000</td>\n",
" <td>3511</td>\n",
" <td>25.6</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>debt_consolidation</td>\n",
" <td>0.1008</td>\n",
" <td>162.34</td>\n",
" <td>11.350407</td>\n",
" <td>8.10</td>\n",
" <td>712</td>\n",
" <td>2699.958333</td>\n",
" <td>33667</td>\n",
" <td>73.2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>credit_card</td>\n",
" <td>0.1426</td>\n",
" <td>102.92</td>\n",
" <td>11.299732</td>\n",
" <td>14.97</td>\n",
" <td>667</td>\n",
" <td>4066.000000</td>\n",
" <td>4740</td>\n",
" <td>39.5</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" credit.policy purpose int.rate installment log.annual.inc \\\n",
"0 1 debt_consolidation 0.1189 829.10 11.350407 \n",
"1 1 credit_card 0.1071 228.22 11.082143 \n",
"2 1 debt_consolidation 0.1357 366.86 10.373491 \n",
"3 1 debt_consolidation 0.1008 162.34 11.350407 \n",
"4 1 credit_card 0.1426 102.92 11.299732 \n",
"\n",
" dti fico days.with.cr.line revol.bal revol.util inq.last.6mths \\\n",
"0 19.48 737 5639.958333 28854 52.1 0 \n",
"1 14.29 707 2760.000000 33623 76.7 0 \n",
"2 11.63 682 4710.000000 3511 25.6 1 \n",
"3 8.10 712 2699.958333 33667 73.2 1 \n",
"4 14.97 667 4066.000000 4740 39.5 0 \n",
"\n",
" delinq.2yrs pub.rec not.fully.paid \n",
"0 0 0 0 \n",
"1 0 0 0 \n",
"2 0 0 0 \n",
"3 0 0 0 \n",
"4 1 0 0 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# first five rows; the last column, not.fully.paid, is the target (1 = the loan was not fully paid back)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 9578 entries, 0 to 9577\n",
"Data columns (total 19 columns):\n",
"credit.policy 9578 non-null int64\n",
"int.rate 9578 non-null float64\n",
"installment 9578 non-null float64\n",
"log.annual.inc 9578 non-null float64\n",
"dti 9578 non-null float64\n",
"fico 9578 non-null int64\n",
"days.with.cr.line 9578 non-null float64\n",
"revol.bal 9578 non-null int64\n",
"revol.util 9578 non-null float64\n",
"inq.last.6mths 9578 non-null int64\n",
"delinq.2yrs 9578 non-null int64\n",
"pub.rec 9578 non-null int64\n",
"not.fully.paid 9578 non-null int64\n",
"purpose_credit_card 9578 non-null uint8\n",
"purpose_debt_consolidation 9578 non-null uint8\n",
"purpose_educational 9578 non-null uint8\n",
"purpose_home_improvement 9578 non-null uint8\n",
"purpose_major_purchase 9578 non-null uint8\n",
"purpose_small_business 9578 non-null uint8\n",
"dtypes: float64(6), int64(7), uint8(6)\n",
"memory usage: 1.0 MB\n"
]
}
],
"source": [
"# 'purpose' holds strings, so one-hot encode it into 0/1 indicator columns with pd.get_dummies;\n",
"# drop_first=True leaves out the first category, which is implied when every purpose_* column is 0\n",
"mudar_colunas = ['purpose']\n",
"final_df = pd.get_dummies(df, columns = mudar_colunas, drop_first=True)\n",
"final_df.info()\n",
"# the info() listing shows the new purpose_* columns, each flagging whether a row has that purpose"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# split into train and test sets: X holds the features, y the target column\n",
"X = final_df.drop('not.fully.paid', axis=1)\n",
"y = final_df['not.fully.paid']\n",
"# a fixed random_state makes the 70/30 split, and every score computed from it, repeatable across runs\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n",
" max_features=None, max_leaf_nodes=None,\n",
" min_impurity_split=1e-07, min_samples_leaf=1,\n",
" min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
" presort=False, random_state=None, splitter='best')"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# train a decision tree (scikit-learn); random_state pins the tree's internal\n",
"# randomness so that refitting on the same data yields the same model\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"dtc = DecisionTreeClassifier(random_state=42)\n",
"dtc.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# predict labels for the held-out test set\n",
"prev = dtc.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.metrics import accuracy_score"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.7299930410577592"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# share of test rows classified correctly (default or not): about 73% in the recorded run\n",
"accuracy_score(y_test, prev)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# vamos ver como a gente se sai usando Floresta Aleatória\n",
"from sklearn.ensemble import RandomForestClassifier"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# n_estimators is the number of trees in the forest; random_state fixes the\n",
"# bootstrap sampling and feature selection so the fitted forest is repeatable\n",
"rfc = RandomForestClassifier(n_estimators=250, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# same workflow as for the tree: fit on the training split, then predict on the test split\n",
"rfc.fit(X_train, y_train)\n",
"prev2 = rfc.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.8458594293667363"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
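],
"source": [
"# the random forest reaches about 85% accuracy in the recorded run (0.846)\n",
"# NOTE(review): compare with the majority-class share, y_test.value_counts(normalize=True).max(),\n",
"# before reading this as a gain - accuracy alone can flatter a model when few loans default\n",
"accuracy_score(y_test, prev2)"
]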
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [conda env:k35]",
"language": "python",
"name": "conda-env-k35-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment