Skip to content

Instantly share code, notes, and snippets.

@KFoxder
Created May 5, 2016 20:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save KFoxder/fbb6e9b4a75b1068a29895870a3c8208 to your computer and use it in GitHub Desktop.
Save KFoxder/fbb6e9b4a75b1068a29895870a3c8208 to your computer and use it in GitHub Desktop.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## General Assembly Data Science Final Project\n",
"# Lending Club Loan Data Analysis\n",
"Samir Poonawala\n",
"\n",
"May 2016"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Potential Predictors:\n",
"- grade / sub_grade\n",
"- emp_length\n",
"- home_ownership\n",
"- annual_inc\n",
"- verification_status\n",
"- [average of] fico_range_low, fico_range_high\n",
"- purpose\n",
"- dti"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Samir\\Anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:2902: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" interactivity=interactivity, compiler=compiler, result=result)\n",
"C:\\Users\\Samir\\Anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:2902: DtypeWarning: Columns (0,19) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" interactivity=interactivity, compiler=compiler, result=result)\n"
]
}
],
"source": [
"# Read in loan data files\n",
"\n",
"# df1 contains data from 2007 to 2011\n",
"df1 = pd.read_csv('LoanStats3a_securev1.csv')\n",
"\n",
"# df2 contains data from 2012 to 2013\n",
"df2 = pd.read_csv('LoanStats3b_securev1.csv')\n",
"\n",
"# df3 contains data from 2014\n",
"df3 = pd.read_csv('LoanStats3c_securev1.csv')\n",
"\n",
"# df4 contains data from 2015\n",
"df4 = pd.read_csv('LoanStats3d_securev1.csv')\n",
"\n",
"# Create merged dataframe with all loan data\n",
"df_merged = pd.concat([df1, df2, df3, df4])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df = df_merged[['funded_amnt', 'emp_length', 'grade', 'sub_grade', 'home_ownership', \n",
" 'annual_inc', 'verification_status', 'fico_range_low', 'fico_range_high']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df_merged.loan_amnt.hist(figsize=(10,5))\n",
"plt.ylabel('Number of Loans')\n",
"plt.xlabel('Loan Amount')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cleaning Loan Status\n",
"\n",
"* Remove \"Current\" as our goal is to look at whether loans were successfully repaid or not\n",
"* Fully paid loans coded as '1' while everything else coded as '0'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df_merged.loan_status.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df_merged['loan_status_clean'] = df_merged['loan_status'].map({'Current': 2, \n",
" 'Fully Paid': 1, 'Does not meet the credit policy. Status:Fully Paid': 1,\n",
" 'Charged Off': 0, 'Does not meet the credit policy. Status:Charged Off': 0,\n",
" 'Late (31-120 days)': 0, 'In Grace Period': 0, 'Late (16-30 days)': 0, 'Default': 0})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cleaning Employment Length"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df_merged['emp_length_clean'] = df_merged.emp_length.str.replace('+','')\n",
"df_merged['emp_length_clean'] = df_merged.emp_length_clean.str.replace('<','')\n",
"df_merged['emp_length_clean'] = df_merged.emp_length_clean.str.replace('years','')\n",
"df_merged['emp_length_clean'] = df_merged.emp_length_clean.str.replace('year','')\n",
"df_merged['emp_length_clean'] = df_merged.emp_length_clean.str.replace('n/a','0')"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 10 years\n",
"0 2 years\n",
"0 8 years\n",
"0 9 years\n",
"Name: emp_length, dtype: object"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_merged.loc[0,'emp_length'].str.replace('+','')"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>member_id</th>\n",
" <th>loan_amnt</th>\n",
" <th>funded_amnt</th>\n",
" <th>funded_amnt_inv</th>\n",
" <th>term</th>\n",
" <th>int_rate</th>\n",
" <th>installment</th>\n",
" <th>grade</th>\n",
" <th>sub_grade</th>\n",
" <th>...</th>\n",
" <th>pct_tl_nvr_dlq</th>\n",
" <th>percent_bc_gt_75</th>\n",
" <th>pub_rec_bankruptcies</th>\n",
" <th>tax_liens</th>\n",
" <th>tot_hi_cred_lim</th>\n",
" <th>total_bal_ex_mort</th>\n",
" <th>total_bc_limit</th>\n",
" <th>total_il_high_credit_limit</th>\n",
" <th>loan_status_clean</th>\n",
" <th>emp_length_clean</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1077501</td>\n",
" <td>1296599.0</td>\n",
" <td>5000.0</td>\n",
" <td>5000.0</td>\n",
" <td>4975.0</td>\n",
" <td>36 months</td>\n",
" <td>10.65%</td>\n",
" <td>162.87</td>\n",
" <td>B</td>\n",
" <td>B2</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1077430</td>\n",
" <td>1314167.0</td>\n",
" <td>2500.0</td>\n",
" <td>2500.0</td>\n",
" <td>2500.0</td>\n",
" <td>60 months</td>\n",
" <td>15.27%</td>\n",
" <td>59.83</td>\n",
" <td>C</td>\n",
" <td>C4</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1077175</td>\n",
" <td>1313524.0</td>\n",
" <td>2400.0</td>\n",
" <td>2400.0</td>\n",
" <td>2400.0</td>\n",
" <td>36 months</td>\n",
" <td>15.96%</td>\n",
" <td>84.33</td>\n",
" <td>C</td>\n",
" <td>C5</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1076863</td>\n",
" <td>1277178.0</td>\n",
" <td>10000.0</td>\n",
" <td>10000.0</td>\n",
" <td>10000.0</td>\n",
" <td>36 months</td>\n",
" <td>13.49%</td>\n",
" <td>339.31</td>\n",
" <td>C</td>\n",
" <td>C1</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1075358</td>\n",
" <td>1311748.0</td>\n",
" <td>3000.0</td>\n",
" <td>3000.0</td>\n",
" <td>3000.0</td>\n",
" <td>60 months</td>\n",
" <td>12.69%</td>\n",
" <td>67.79</td>\n",
" <td>B</td>\n",
" <td>B5</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 117 columns</p>\n",
"</div>"
],
"text/plain": [
" id member_id loan_amnt funded_amnt funded_amnt_inv term \\\n",
"0 1077501 1296599.0 5000.0 5000.0 4975.0 36 months \n",
"1 1077430 1314167.0 2500.0 2500.0 2500.0 60 months \n",
"2 1077175 1313524.0 2400.0 2400.0 2400.0 36 months \n",
"3 1076863 1277178.0 10000.0 10000.0 10000.0 36 months \n",
"4 1075358 1311748.0 3000.0 3000.0 3000.0 60 months \n",
"\n",
" int_rate installment grade sub_grade ... pct_tl_nvr_dlq \\\n",
"0 10.65% 162.87 B B2 ... NaN \n",
"1 15.27% 59.83 C C4 ... NaN \n",
"2 15.96% 84.33 C C5 ... NaN \n",
"3 13.49% 339.31 C C1 ... NaN \n",
"4 12.69% 67.79 B B5 ... NaN \n",
"\n",
" percent_bc_gt_75 pub_rec_bankruptcies tax_liens tot_hi_cred_lim \\\n",
"0 NaN 0.0 0.0 NaN \n",
"1 NaN 0.0 0.0 NaN \n",
"2 NaN 0.0 0.0 NaN \n",
"3 NaN 0.0 0.0 NaN \n",
"4 NaN 0.0 0.0 NaN \n",
"\n",
" total_bal_ex_mort total_bc_limit total_il_high_credit_limit \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"3 NaN NaN NaN \n",
"4 NaN NaN NaN \n",
"\n",
" loan_status_clean emp_length_clean \n",
"0 1.0 10 \n",
"1 0.0 1 \n",
"2 1.0 10 \n",
"3 1.0 10 \n",
"4 2.0 1 \n",
"\n",
"[5 rows x 117 columns]"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_merged.head()\n",
"#df_merged['emp_length', 'emp_length_clean'].head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# sklearn - feature selection for classification"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.0"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment