KFoxder/.ipynb

## .ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## General Assembly Data Science Final Project\n",
    "# Lending Club Loan Data Analysis\n",
    "Samir Poonawala\n",
    "\n",
    "May 2016"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Potential Predictors:\n",
    "- grade / sub_grade\n",
    "- emp_length\n",
    "- home_ownership\n",
    "- annual_inc\n",
    "- verification_status\n",
    "- [average of] fico_range_low, fico_range_high\n",
    "- purpose\n",
    "- dti"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Samir\\Anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:2902: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  interactivity=interactivity, compiler=compiler, result=result)\n",
      "C:\\Users\\Samir\\Anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:2902: DtypeWarning: Columns (0,19) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  interactivity=interactivity, compiler=compiler, result=result)\n"
     ]
    }
   ],
   "source": [
    "# Read in loan data files\n",
    "\n",
    "# df1 contains data from 2007 to 2011\n",
    "df1 = pd.read_csv('LoanStats3a_securev1.csv')\n",
    "\n",
    "# df2 contains data from 2012 to 2013\n",
    "df2 = pd.read_csv('LoanStats3b_securev1.csv')\n",
    "\n",
    "# df3 contains data from 2014\n",
    "df3 = pd.read_csv('LoanStats3c_securev1.csv')\n",
    "\n",
    "# df4 contains data from 2015\n",
    "df4 = pd.read_csv('LoanStats3d_securev1.csv')\n",
    "\n",
    "# Create merged dataframe with all loan data\n",
    "df_merged = pd.concat([df1, df2, df3, df4])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df = df_merged[['funded_amnt', 'emp_length', 'grade', 'sub_grade', 'home_ownership', \n",
    "               'annual_inc', 'verification_status', 'fico_range_low', 'fico_range_high']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df_merged.loan_amnt.hist(figsize=(10,5))\n",
    "plt.ylabel('Number of Loans')\n",
    "plt.xlabel('Loan Amount')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cleaning Loan Status\n",
    "\n",
    "* Remove \"Current\" as our goal is to look at whether loans were successfully repaid or not\n",
    "* Fully paid loans coded as '1' while everything else coded as '0'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df_merged.loan_status.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df_merged['loan_status_clean'] = df_merged['loan_status'].map({'Current': 2, \n",
    "                                                   'Fully Paid': 1, 'Does not meet the credit policy. Status:Fully Paid': 1,\n",
    "                                                  'Charged Off': 0, 'Does not meet the credit policy. Status:Charged Off': 0,\n",
    "                                                  'Late (31-120 days)': 0, 'In Grace Period': 0, 'Late (16-30 days)': 0, 'Default': 0})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cleaning Employment Length"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df_merged['emp_length_clean'] = df_merged.emp_length.str.replace('+','')\n",
    "df_merged['emp_length_clean'] = df_merged.emp_length_clean.str.replace('<','')\n",
    "df_merged['emp_length_clean'] = df_merged.emp_length_clean.str.replace('years','')\n",
    "df_merged['emp_length_clean'] = df_merged.emp_length_clean.str.replace('year','')\n",
    "df_merged['emp_length_clean'] = df_merged.emp_length_clean.str.replace('n/a','0')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    10 years\n",
       "0     2 years\n",
       "0     8 years\n",
       "0     9 years\n",
       "Name: emp_length, dtype: object"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_merged.loc[0,'emp_length'].str.replace('+','')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>member_id</th>\n",
       "      <th>loan_amnt</th>\n",
       "      <th>funded_amnt</th>\n",
       "      <th>funded_amnt_inv</th>\n",
       "      <th>term</th>\n",
       "      <th>int_rate</th>\n",
       "      <th>installment</th>\n",
       "      <th>grade</th>\n",
       "      <th>sub_grade</th>\n",
       "      <th>...</th>\n",
       "      <th>pct_tl_nvr_dlq</th>\n",
       "      <th>percent_bc_gt_75</th>\n",
       "      <th>pub_rec_bankruptcies</th>\n",
       "      <th>tax_liens</th>\n",
       "      <th>tot_hi_cred_lim</th>\n",
       "      <th>total_bal_ex_mort</th>\n",
       "      <th>total_bc_limit</th>\n",
       "      <th>total_il_high_credit_limit</th>\n",
       "      <th>loan_status_clean</th>\n",
       "      <th>emp_length_clean</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1077501</td>\n",
       "      <td>1296599.0</td>\n",
       "      <td>5000.0</td>\n",
       "      <td>5000.0</td>\n",
       "      <td>4975.0</td>\n",
       "      <td>36 months</td>\n",
       "      <td>10.65%</td>\n",
       "      <td>162.87</td>\n",
       "      <td>B</td>\n",
       "      <td>B2</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1077430</td>\n",
       "      <td>1314167.0</td>\n",
       "      <td>2500.0</td>\n",
       "      <td>2500.0</td>\n",
       "      <td>2500.0</td>\n",
       "      <td>60 months</td>\n",
       "      <td>15.27%</td>\n",
       "      <td>59.83</td>\n",
       "      <td>C</td>\n",
       "      <td>C4</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1077175</td>\n",
       "      <td>1313524.0</td>\n",
       "      <td>2400.0</td>\n",
       "      <td>2400.0</td>\n",
       "      <td>2400.0</td>\n",
       "      <td>36 months</td>\n",
       "      <td>15.96%</td>\n",
       "      <td>84.33</td>\n",
       "      <td>C</td>\n",
       "      <td>C5</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1076863</td>\n",
       "      <td>1277178.0</td>\n",
       "      <td>10000.0</td>\n",
       "      <td>10000.0</td>\n",
       "      <td>10000.0</td>\n",
       "      <td>36 months</td>\n",
       "      <td>13.49%</td>\n",
       "      <td>339.31</td>\n",
       "      <td>C</td>\n",
       "      <td>C1</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1075358</td>\n",
       "      <td>1311748.0</td>\n",
       "      <td>3000.0</td>\n",
       "      <td>3000.0</td>\n",
       "      <td>3000.0</td>\n",
       "      <td>60 months</td>\n",
       "      <td>12.69%</td>\n",
       "      <td>67.79</td>\n",
       "      <td>B</td>\n",
       "      <td>B5</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows Ã— 117 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        id  member_id  loan_amnt  funded_amnt  funded_amnt_inv        term  \\\n",
       "0  1077501  1296599.0     5000.0       5000.0           4975.0   36 months   \n",
       "1  1077430  1314167.0     2500.0       2500.0           2500.0   60 months   \n",
       "2  1077175  1313524.0     2400.0       2400.0           2400.0   36 months   \n",
       "3  1076863  1277178.0    10000.0      10000.0          10000.0   36 months   \n",
       "4  1075358  1311748.0     3000.0       3000.0           3000.0   60 months   \n",
       "\n",
       "  int_rate  installment grade sub_grade       ...        pct_tl_nvr_dlq  \\\n",
       "0   10.65%       162.87     B        B2       ...                   NaN   \n",
       "1   15.27%        59.83     C        C4       ...                   NaN   \n",
       "2   15.96%        84.33     C        C5       ...                   NaN   \n",
       "3   13.49%       339.31     C        C1       ...                   NaN   \n",
       "4   12.69%        67.79     B        B5       ...                   NaN   \n",
       "\n",
       "  percent_bc_gt_75 pub_rec_bankruptcies  tax_liens tot_hi_cred_lim  \\\n",
       "0              NaN                  0.0        0.0             NaN   \n",
       "1              NaN                  0.0        0.0             NaN   \n",
       "2              NaN                  0.0        0.0             NaN   \n",
       "3              NaN                  0.0        0.0             NaN   \n",
       "4              NaN                  0.0        0.0             NaN   \n",
       "\n",
       "  total_bal_ex_mort total_bc_limit total_il_high_credit_limit  \\\n",
       "0               NaN            NaN                        NaN   \n",
       "1               NaN            NaN                        NaN   \n",
       "2               NaN            NaN                        NaN   \n",
       "3               NaN            NaN                        NaN   \n",
       "4               NaN            NaN                        NaN   \n",
       "\n",
       "  loan_status_clean emp_length_clean  \n",
       "0               1.0              10   \n",
       "1               0.0               1   \n",
       "2               1.0              10   \n",
       "3               1.0              10   \n",
       "4               2.0               1   \n",
       "\n",
       "[5 rows x 117 columns]"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_merged.head()\n",
    "#df_merged['emp_length', 'emp_length_clean'].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# sklearn - feature selection for classification"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## General Assembly Data Science Final Project\n",
	"# Lending Club Loan Data Analysis\n",
	"Samir Poonawala\n",
	"\n",
	"May 2016"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import pandas as pd\n",
	"import matplotlib.pyplot as plt\n",
	"%matplotlib inline"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Potential Predictors:\n",
	"- grade / sub_grade\n",
	"- emp_length\n",
	"- home_ownership\n",
	"- annual_inc\n",
	"- verification_status\n",
	"- [average of] fico_range_low, fico_range_high\n",
	"- purpose\n",
	"- dti"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"C:\\Users\\Samir\\Anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:2902: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.\n",
	" interactivity=interactivity, compiler=compiler, result=result)\n",
	"C:\\Users\\Samir\\Anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:2902: DtypeWarning: Columns (0,19) have mixed types. Specify dtype option on import or set low_memory=False.\n",
	" interactivity=interactivity, compiler=compiler, result=result)\n"
	]
	}
	],
	"source": [
	"# Read in loan data files\n",
	"\n",
	"# df1 contains data from 2007 to 2011\n",
	"df1 = pd.read_csv('LoanStats3a_securev1.csv')\n",
	"\n",
	"# df2 contains data from 2012 to 2013\n",
	"df2 = pd.read_csv('LoanStats3b_securev1.csv')\n",
	"\n",
	"# df3 contains data from 2014\n",
	"df3 = pd.read_csv('LoanStats3c_securev1.csv')\n",
	"\n",
	"# df4 contains data from 2015\n",
	"df4 = pd.read_csv('LoanStats3d_securev1.csv')\n",
	"\n",
	"# Create merged dataframe with all loan data\n",
	"df_merged = pd.concat([df1, df2, df3, df4])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"df = df_merged[['funded_amnt', 'emp_length', 'grade', 'sub_grade', 'home_ownership', \n",
	" 'annual_inc', 'verification_status', 'fico_range_low', 'fico_range_high']]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"df.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"df_merged.loan_amnt.hist(figsize=(10,5))\n",
	"plt.ylabel('Number of Loans')\n",
	"plt.xlabel('Loan Amount')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Cleaning Loan Status\n",
	"\n",
	"* Remove \"Current\" as our goal is to look at whether loans were successfully repaid or not\n",
	"* Fully paid loans coded as '1' while everything else coded as '0'"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"df_merged.loan_status.value_counts()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"df_merged['loan_status_clean'] = df_merged['loan_status'].map({'Current': 2, \n",
	" 'Fully Paid': 1, 'Does not meet the credit policy. Status:Fully Paid': 1,\n",
	" 'Charged Off': 0, 'Does not meet the credit policy. Status:Charged Off': 0,\n",
	" 'Late (31-120 days)': 0, 'In Grace Period': 0, 'Late (16-30 days)': 0, 'Default': 0})"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Cleaning Employment Length"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 51,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"df_merged['emp_length_clean'] = df_merged.emp_length.str.replace('+','')\n",
	"df_merged['emp_length_clean'] = df_merged.emp_length_clean.str.replace('<','')\n",
	"df_merged['emp_length_clean'] = df_merged.emp_length_clean.str.replace('years','')\n",
	"df_merged['emp_length_clean'] = df_merged.emp_length_clean.str.replace('year','')\n",
	"df_merged['emp_length_clean'] = df_merged.emp_length_clean.str.replace('n/a','0')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 50,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0 10 years\n",
	"0 2 years\n",
	"0 8 years\n",
	"0 9 years\n",
	"Name: emp_length, dtype: object"
	]
	},
	"execution_count": 50,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df_merged.loc[0,'emp_length'].str.replace('+','')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 52,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>id</th>\n",
	" <th>member_id</th>\n",
	" <th>loan_amnt</th>\n",
	" <th>funded_amnt</th>\n",
	" <th>funded_amnt_inv</th>\n",
	" <th>term</th>\n",
	" <th>int_rate</th>\n",
	" <th>installment</th>\n",
	" <th>grade</th>\n",
	" <th>sub_grade</th>\n",
	" <th>...</th>\n",
	" <th>pct_tl_nvr_dlq</th>\n",
	" <th>percent_bc_gt_75</th>\n",
	" <th>pub_rec_bankruptcies</th>\n",
	" <th>tax_liens</th>\n",
	" <th>tot_hi_cred_lim</th>\n",
	" <th>total_bal_ex_mort</th>\n",
	" <th>total_bc_limit</th>\n",
	" <th>total_il_high_credit_limit</th>\n",
	" <th>loan_status_clean</th>\n",
	" <th>emp_length_clean</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>1077501</td>\n",
	" <td>1296599.0</td>\n",
	" <td>5000.0</td>\n",
	" <td>5000.0</td>\n",
	" <td>4975.0</td>\n",
	" <td>36 months</td>\n",
	" <td>10.65%</td>\n",
	" <td>162.87</td>\n",
	" <td>B</td>\n",
	" <td>B2</td>\n",
	" <td>...</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>1.0</td>\n",
	" <td>10</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>1077430</td>\n",
	" <td>1314167.0</td>\n",
	" <td>2500.0</td>\n",
	" <td>2500.0</td>\n",
	" <td>2500.0</td>\n",
	" <td>60 months</td>\n",
	" <td>15.27%</td>\n",
	" <td>59.83</td>\n",
	" <td>C</td>\n",
	" <td>C4</td>\n",
	" <td>...</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>0.0</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>1077175</td>\n",
	" <td>1313524.0</td>\n",
	" <td>2400.0</td>\n",
	" <td>2400.0</td>\n",
	" <td>2400.0</td>\n",
	" <td>36 months</td>\n",
	" <td>15.96%</td>\n",
	" <td>84.33</td>\n",
	" <td>C</td>\n",
	" <td>C5</td>\n",
	" <td>...</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>1.0</td>\n",
	" <td>10</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>1076863</td>\n",
	" <td>1277178.0</td>\n",
	" <td>10000.0</td>\n",
	" <td>10000.0</td>\n",
	" <td>10000.0</td>\n",
	" <td>36 months</td>\n",
	" <td>13.49%</td>\n",
	" <td>339.31</td>\n",
	" <td>C</td>\n",
	" <td>C1</td>\n",
	" <td>...</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>1.0</td>\n",
	" <td>10</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>1075358</td>\n",
	" <td>1311748.0</td>\n",
	" <td>3000.0</td>\n",
	" <td>3000.0</td>\n",
	" <td>3000.0</td>\n",
	" <td>60 months</td>\n",
	" <td>12.69%</td>\n",
	" <td>67.79</td>\n",
	" <td>B</td>\n",
	" <td>B5</td>\n",
	" <td>...</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>2.0</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>5 rows Ã— 117 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" id member_id loan_amnt funded_amnt funded_amnt_inv term \\\n",
	"0 1077501 1296599.0 5000.0 5000.0 4975.0 36 months \n",
	"1 1077430 1314167.0 2500.0 2500.0 2500.0 60 months \n",
	"2 1077175 1313524.0 2400.0 2400.0 2400.0 36 months \n",
	"3 1076863 1277178.0 10000.0 10000.0 10000.0 36 months \n",
	"4 1075358 1311748.0 3000.0 3000.0 3000.0 60 months \n",
	"\n",
	" int_rate installment grade sub_grade ... pct_tl_nvr_dlq \\\n",
	"0 10.65% 162.87 B B2 ... NaN \n",
	"1 15.27% 59.83 C C4 ... NaN \n",
	"2 15.96% 84.33 C C5 ... NaN \n",
	"3 13.49% 339.31 C C1 ... NaN \n",
	"4 12.69% 67.79 B B5 ... NaN \n",
	"\n",
	" percent_bc_gt_75 pub_rec_bankruptcies tax_liens tot_hi_cred_lim \\\n",
	"0 NaN 0.0 0.0 NaN \n",
	"1 NaN 0.0 0.0 NaN \n",
	"2 NaN 0.0 0.0 NaN \n",
	"3 NaN 0.0 0.0 NaN \n",
	"4 NaN 0.0 0.0 NaN \n",
	"\n",
	" total_bal_ex_mort total_bc_limit total_il_high_credit_limit \\\n",
	"0 NaN NaN NaN \n",
	"1 NaN NaN NaN \n",
	"2 NaN NaN NaN \n",
	"3 NaN NaN NaN \n",
	"4 NaN NaN NaN \n",
	"\n",
	" loan_status_clean emp_length_clean \n",
	"0 1.0 10 \n",
	"1 0.0 1 \n",
	"2 1.0 10 \n",
	"3 1.0 10 \n",
	"4 2.0 1 \n",
	"\n",
	"[5 rows x 117 columns]"
	]
	},
	"execution_count": 52,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df_merged.head()\n",
	"#df_merged['emp_length', 'emp_length_clean'].head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# sklearn - feature selection for classification"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.0"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}