Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save mikewlange/e23abda09335bfd5982cdd9712ef229e to your computer and use it in GitHub Desktop.
Save mikewlange/e23abda09335bfd5982cdd9712ef229e to your computer and use it in GitHub Desktop.
e23abda09335bfd5982cdd9712ef229e
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"celltoolbar": "Initialization Cell",
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python",
"version": "3.6.9",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"toc": {
"nav_menu": {},
"number_sections": false,
"sideBar": false,
"skip_h1_title": false,
"base_numbering": 1,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": false,
"toc_window_display": false
},
"varInspector": {
"window_display": false,
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"library": "var_list.py",
"delete_cmd_prefix": "del ",
"delete_cmd_postfix": "",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"library": "var_list.r",
"delete_cmd_prefix": "rm(",
"delete_cmd_postfix": ") ",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
]
},
"gist": {
"id": "",
"data": {
"description": "/Pictures/MLTOOLING/ml-workspace/gpu-flavor/Demos/interpret/examples/python/notebooks/Fannie Mae Single-Family Historical Loan Performance 2016-2018.ipynb",
"public": true
},
"public": false,
"description": "/Pictures/MLTOOLING/ml-workspace/gpu-flavor/Demos/interpret/examples/python/notebooks/Fannie Mae Single-Family Historical Loan Performance 2016-2018.ipynb.ipynb",
"extension": ".ipynb"
},
"colab": {
"name": "e23abda09335bfd5982cdd9712ef229e",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/mikewlange/e23abda09335bfd5982cdd9712ef229e/notebook.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"trusted": true,
"id": "2lzCZiuwkRZ-",
"colab_type": "code",
"colab": {},
"outputId": "52972d9a-da5c-4e86-9661-ac539933f883"
},
"source": [
"!jupyter labextension install dask-labextension"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"Building jupyterlab assets (build:prod:minimize)\n",
"\\"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "iFO823_Dl_9G",
"colab_type": "code",
"colab": {}
},
"source": [
"import os\n",
"\n",
"os.environ[\"MODIN_ENGINE\"] = \"dask\" # Modin will use Ray\n",
"# os.environ[\"MODIN_ENGINE\"] = \"dask\" # Modin will use Dask\n",
"\n",
"import modin.pandas as pd\n",
"\n",
"# featuretools for automated feature engineering\n",
"import featuretools as ft\n",
"from beakerx import *"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"start_time": "2019-11-30T20:41:12.281327Z",
"end_time": "2019-11-30T20:41:12.286783Z"
},
"init_cell": true,
"trusted": true,
"id": "I4RQT50_kRaE",
"colab_type": "code",
"colab": {}
},
"source": [
"import numpy as np\n",
"import h5py\n",
"import torch\n",
"from torch.utils.data import Dataset\n",
"from torch.utils.data import DataLoader"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"start_time": "2019-11-30T20:41:12.295668Z",
"end_time": "2019-11-30T20:41:12.304245Z"
},
"init_cell": true,
"trusted": true,
"id": "tPT84WgSkRaG",
"colab_type": "code",
"colab": {}
},
"source": [
"DATA_DIR = \"Housing\"\n",
"OUT = \"./\"\n",
"TARGET = \"foreclosure_status\"\n",
"NON_PREDICTORS = [TARGET, \"id\"]\n",
"MINIMUM_TRACKING_QUARTERS = 7"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"start_time": "2019-11-30T20:41:12.308424Z",
"end_time": "2019-11-30T20:41:12.325377Z"
},
"init_cell": true,
"trusted": true,
"id": "XUbdBGnckRaJ",
"colab_type": "code",
"colab": {},
"outputId": "17f56869-5a42-4f1d-c203-fb438335eaca"
},
"source": [
"# IMPORTING LIBRARIES\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from PIL import Image\n",
"\n",
"%matplotlib inline\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import itertools\n",
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\")\n",
"from wordcloud import WordCloud, STOPWORDS\n",
"import io\n",
"import base64\n",
"from matplotlib import rc, animation\n",
"import folium\n",
"import folium.plugins\n",
"import plotly.offline as py\n",
"\n",
"py.init_notebook_mode(connected=True)\n",
"import plotly.graph_objs as go\n",
"import plotly.tools as tls\n",
"import os"
],
"execution_count": 0,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
" <script type=\"text/javascript\">\n",
" window.PlotlyConfig = {MathJaxConfig: 'local'};\n",
" if (window.MathJax) {MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}\n",
" if (typeof require !== 'undefined') {\n",
" require.undef(\"plotly\");\n",
" requirejs.config({\n",
" paths: {\n",
" 'plotly': ['https://cdn.plot.ly/plotly-latest.min']\n",
" }\n",
" });\n",
" require(['plotly'], function(Plotly) {\n",
" window._Plotly = Plotly;\n",
" });\n",
" }\n",
" </script>\n",
" "
]
},
"metadata": {
"tags": []
}
}
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"start_time": "2019-11-30T20:41:12.327100Z",
"end_time": "2019-11-30T20:41:12.343149Z"
},
"init_cell": true,
"trusted": true,
"id": "i-i0unA9kRaL",
"colab_type": "code",
"colab": {},
"outputId": "58ae0efa-6cba-443a-e8ef-422114afe934"
},
"source": [
""
],
"execution_count": 0,
"outputs": [
{
"output_type": "error",
"ename": "ModuleNotFoundError",
"evalue": "No module named 'modin'",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-60-075a2e977ad0>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m# os.environ[\"MODIN_ENGINE\"] = \"dask\" # Modin will use Dask\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mmodin\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;31m# featuretools for automated feature engineering\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'modin'"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-29T19:59:42.651784Z",
"start_time": "2019-11-29T19:59:42.642907Z"
},
"trusted": true,
"id": "37PPY-ZDkRaO",
"colab_type": "code",
"colab": {}
},
"source": [
"HEADERS = {\n",
" \"Acquisition\": [\n",
" \"id\",\n",
" \"channel\",\n",
" \"seller\",\n",
" \"interest_rate\",\n",
" \"balance\",\n",
" \"loan_term\",\n",
" \"origination_date\",\n",
" \"first_payment_date\",\n",
" \"ltv\",\n",
" \"cltv\",\n",
" \"borrower_count\",\n",
" \"dti\",\n",
" \"borrower_credit_score\",\n",
" \"first_time_homebuyer\",\n",
" \"loan_purpose\",\n",
" \"property_type\",\n",
" \"unit_count\",\n",
" \"occupancy_status\",\n",
" \"property_state\",\n",
" \"zip\",\n",
" \"insurance_percentage\",\n",
" \"product_type\",\n",
" \"co_borrower_credit_score\",\n",
" \"mortgage_insurance_type\",\n",
" \"relocation_mortgage_indicator\",\n",
" ],\n",
" \"Performance\": [\n",
" \"id\",\n",
" \"reporting_period\",\n",
" \"servicer_name\",\n",
" \"interest_rate\",\n",
" \"balance\",\n",
" \"loan_age\",\n",
" \"months_to_maturity\",\n",
" \"months_to_legal_maturity\",\n",
" \"maturity_date\",\n",
" \"msa\",\n",
" \"delinquency_status\",\n",
" \"modification_flag\",\n",
" \"zero_balance_code\",\n",
" \"zero_balance_date\",\n",
" \"last_paid_installment_date\",\n",
" \"foreclosure_date\",\n",
" \"disposition_date\",\n",
" \"foreclosure_costs\",\n",
" \"property_repair_costs\",\n",
" \"recovery_costs\",\n",
" \"misc_costs\",\n",
" \"tax_costs\",\n",
" \"sale_proceeds\",\n",
" \"credit_enhancement_proceeds\",\n",
" \"repurchase_proceeds\",\n",
" \"other_foreclosure_proceeds\",\n",
" \"non_interest_bearing_balance\",\n",
" \"principal_forgiveness_balance\",\n",
" \"repurchase_make_whole_proceeds_flag\",\n",
" \"foreclosure_principal_write_off_amount\",\n",
" \"servicing_activity_indicator\",\n",
" ],\n",
"}\n",
"\n",
"SELECT = {\n",
" \"Acquisition\": HEADERS[\"Acquisition\"],\n",
" \"Performance\": [\"id\", \"foreclosure_date\"],\n",
"}"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"start_time": "2019-11-26T16:49:38.530Z"
},
"trusted": true,
"id": "p94afJ6DkRaQ",
"colab_type": "code",
"colab": {}
},
"source": [
""
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-26T20:17:27.477872Z",
"start_time": "2019-11-26T20:17:04.056246Z"
},
"trusted": true,
"id": "N8aqdbxckRaS",
"colab_type": "code",
"colab": {}
},
"source": [
"ac_2016_q1 = pd.read_csv(\n",
" \"Housing/Acquisition_2016Q1.txt\", sep=\"|\", names=HEADERS[\"Acquisition\"]\n",
")\n",
"# ac_2016_q2 = pd.read_csv(\"Housing/Acquisition_2016Q2.txt\", sep=\"|\", names=HEADERS[\"Acquisition\"])\n",
"# ac_2016_q3 = pd.read_csv(\"Housing/Acquisition_2016Q3.txt\", sep=\"|\", names=HEADERS[\"Acquisition\"])\n",
"# ac_2016_q4 = pd.read_csv(\"Housing/Acquisition_2016Q4.txt\", sep=\"|\", names=HEADERS[\"Acquisition\"])\n",
"\n",
"per_2016_q1 = pd.read_csv(\n",
" \"Housing/Performance_2016Q1.txt\", sep=\"|\", names=HEADERS[\"Performance\"]\n",
")\n",
"# per_2016_q2 = pd.read_csv(\"Housing/Performance_2016Q2.txt\", sep=\"|\",names=HEADERS[\"Performance\"])\n",
"# per_2016_q3 = pd.read_csv(\"Housing/Performance_2016Q3.txt\", sep=\"|\",names=HEADERS[\"Performance\"])\n",
"# per_2016_q4 = pd.read_csv(\"Housing/Performance_2016Q4.txt\", sep=\"|\",names=HEADERS[\"Performance\"])"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-26T20:17:31.301089Z",
"start_time": "2019-11-26T20:17:31.288479Z"
},
"trusted": true,
"id": "-YSCigDZkRaV",
"colab_type": "code",
"colab": {},
"outputId": "cfef7099-4deb-496d-d5d6-aea143bc1b81"
},
"source": [
"print(\"Acquisition_2016Q1 :\", ac_2016_q1.shape)\n",
"print(\"Performance_2016Q1 :\", per_2016_q1.shape)\n",
"# print (\"Acquisition_2016Q2 :\",ac_2016_q2.shape)\n",
"# print (\"Performance_2016Q2 :\",per_2016_q2.shape)"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"Acquisition_2016Q1 : (404588, 25)\n",
"Performance_2016Q1 : (14263096, 31)\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"trusted": true,
"id": "oFKHTvMckRaX",
"colab_type": "code",
"colab": {}
},
"source": [
""
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-26T20:17:46.082597Z",
"start_time": "2019-11-26T20:17:46.031373Z"
},
"trusted": true,
"id": "UiGfnDixkRaZ",
"colab_type": "code",
"colab": {},
"outputId": "95df1c1f-9710-44b6-bca3-56346b0d4d74"
},
"source": [
"TableDisplay(\"Acquisition_2016Q1\")\n",
"TableDisplay(ac_2016_q1.head(3))"
],
"execution_count": 0,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "bc804385f4f549efbf4eb279607187ab",
"version_major": 2,
"version_minor": 0
}
},
"metadata": {
"tags": []
}
}
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-26T16:06:08.257531Z",
"start_time": "2019-11-26T16:06:08.212600Z"
},
"trusted": true,
"id": "lvqWFTF8kRac",
"colab_type": "code",
"colab": {},
"outputId": "017bd996-e28e-4331-8630-e3b9b303f556"
},
"source": [
"TableDisplay(\"Performance_2016Q1\")\n",
"per_2016_q1.head(3)"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>reporting_period</th>\n",
" <th>servicer_name</th>\n",
" <th>interest_rate</th>\n",
" <th>balance</th>\n",
" <th>loan_age</th>\n",
" <th>months_to_maturity</th>\n",
" <th>months_to_legal_maturity</th>\n",
" <th>maturity_date</th>\n",
" <th>msa</th>\n",
" <th>...</th>\n",
" <th>tax_costs</th>\n",
" <th>sale_proceeds</th>\n",
" <th>credit_enhancement_proceeds</th>\n",
" <th>repurchase_proceeds</th>\n",
" <th>other_foreclosure_proceeds</th>\n",
" <th>non_interest_bearing_balance</th>\n",
" <th>principal_forgiveness_balance</th>\n",
" <th>repurchase_make_whole_proceeds_flag</th>\n",
" <th>foreclosure_principal_write_off_amount</th>\n",
" <th>servicing_activity_indicator</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100000512540</td>\n",
" <td>02/01/2016</td>\n",
" <td>OTHER</td>\n",
" <td>3.75</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>359</td>\n",
" <td>359.0</td>\n",
" <td>01/2046</td>\n",
" <td>12260</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>N</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>100000512540</td>\n",
" <td>03/01/2016</td>\n",
" <td>NaN</td>\n",
" <td>3.75</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>358</td>\n",
" <td>357.0</td>\n",
" <td>01/2046</td>\n",
" <td>12260</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>N</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100000512540</td>\n",
" <td>04/01/2016</td>\n",
" <td>NaN</td>\n",
" <td>3.75</td>\n",
" <td>NaN</td>\n",
" <td>3</td>\n",
" <td>357</td>\n",
" <td>356.0</td>\n",
" <td>01/2046</td>\n",
" <td>12260</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>N</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3 rows × 31 columns</p>\n",
"</div>"
],
"text/plain": [
" id reporting_period servicer_name interest_rate balance \\\n",
"0 100000512540 02/01/2016 OTHER 3.75 NaN \n",
"1 100000512540 03/01/2016 NaN 3.75 NaN \n",
"2 100000512540 04/01/2016 NaN 3.75 NaN \n",
"\n",
" loan_age months_to_maturity months_to_legal_maturity maturity_date \\\n",
"0 1 359 359.0 01/2046 \n",
"1 2 358 357.0 01/2046 \n",
"2 3 357 356.0 01/2046 \n",
"\n",
" msa ... tax_costs sale_proceeds credit_enhancement_proceeds \\\n",
"0 12260 ... NaN NaN NaN \n",
"1 12260 ... NaN NaN NaN \n",
"2 12260 ... NaN NaN NaN \n",
"\n",
" repurchase_proceeds other_foreclosure_proceeds non_interest_bearing_balance \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"\n",
" principal_forgiveness_balance repurchase_make_whole_proceeds_flag \\\n",
"0 NaN NaN \n",
"1 NaN NaN \n",
"2 NaN NaN \n",
"\n",
" foreclosure_principal_write_off_amount servicing_activity_indicator \n",
"0 NaN N \n",
"1 NaN N \n",
"2 NaN N \n",
"\n",
"[3 rows x 31 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 25
}
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-26T16:16:00.720822Z",
"start_time": "2019-11-26T16:15:56.157384Z"
},
"trusted": true,
"id": "4_1HssKzkRae",
"colab_type": "code",
"colab": {},
"outputId": "8d002116-3bc6-4577-c061-3db1ae8c8169"
},
"source": [
"plt.figure(figsize=(15, 20))\n",
"\n",
"plt.subplot(231)\n",
"sns.heatmap(\n",
" pd.DataFrame(per_2016_q1.isnull().sum() / per_2016_q1.shape[0] * 100),\n",
" annot=True,\n",
" cmap=sns.color_palette(\"cool\"),\n",
" linewidth=1,\n",
" linecolor=\"white\",\n",
")\n",
"plt.title(\"per_2016_q1\")"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"Text(0.5, 1.0, 'per_2016_q1')"
]
},
"metadata": {
"tags": []
},
"execution_count": 23
},
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 1080x1440 with 2 Axes>"
]
},
"metadata": {
"tags": [],
"needs_background": "light"
}
}
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-26T16:16:55.892831Z",
"start_time": "2019-11-26T16:16:55.132284Z"
},
"trusted": true,
"id": "8XfJg4MQkRag",
"colab_type": "code",
"colab": {},
"outputId": "4e13302a-efe3-43c4-cd59-f24649fe7825"
},
"source": [
"plt.figure(figsize=(15, 20))\n",
"\n",
"plt.subplot(231)\n",
"sns.heatmap(\n",
" pd.DataFrame(ac_2016_q1.isnull().sum() / ac_2016_q1.shape[0] * 100),\n",
" annot=True,\n",
" cmap=sns.color_palette(\"cool\"),\n",
" linewidth=1,\n",
" linecolor=\"white\",\n",
")\n",
"plt.title(\"ac_2016_q1\")"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"Text(0.5, 1.0, 'ac_2016_q1')"
]
},
"metadata": {
"tags": []
},
"execution_count": 24
},
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 1080x1440 with 2 Axes>"
]
},
"metadata": {
"tags": [],
"needs_background": "light"
}
}
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T04:04:02.620956Z",
"start_time": "2019-11-27T04:04:02.616396Z"
},
"trusted": true,
"id": "HfG6qrHYkRal",
"colab_type": "code",
"colab": {}
},
"source": [
"def concatenate(prefix=\"Acquisition\"):\n",
" files = os.listdir(DATA_DIR)\n",
" full = []\n",
" for f in files:\n",
" if not f.startswith(prefix):\n",
" continue\n",
"\n",
" data = pd.read_csv(\n",
" os.path.join(DATA_DIR, f),\n",
" sep=\"|\",\n",
" header=None,\n",
" names=HEADERS[prefix],\n",
" index_col=False,\n",
" )\n",
"\n",
" data = data[SELECT[prefix]]\n",
" full.append(data)\n",
"\n",
" full = pd.concat(full, axis=0)\n",
"\n",
" full.to_csv(\n",
" os.path.join(OUT, \"{}.txt\".format(prefix)),\n",
" sep=\"|\",\n",
" header=SELECT[prefix],\n",
" index=False,\n",
" )"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T04:05:06.991975Z",
"start_time": "2019-11-27T04:04:05.204812Z"
},
"trusted": true,
"id": "9_sX6refkRan",
"colab_type": "code",
"colab": {}
},
"source": [
"concatenate(\"Acquisition\")"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T04:22:48.150841Z",
"start_time": "2019-11-27T04:05:17.975196Z"
},
"trusted": true,
"id": "Otg4k5KgkRap",
"colab_type": "code",
"colab": {}
},
"source": [
"concatenate(\"Performance\")"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T03:51:06.550933Z",
"start_time": "2019-11-27T03:51:05.714072Z"
},
"trusted": true,
"id": "S5gDHijzkRar",
"colab_type": "code",
"colab": {}
},
"source": [
"!export MODIN_MEMORY=200000000000"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T04:01:55.016030Z",
"start_time": "2019-11-27T04:01:46.968221Z"
},
"trusted": true,
"id": "qna3SdRxkRat",
"colab_type": "code",
"colab": {}
},
"source": [
"df = pd.read_csv(os.path.join(\"./\", \"Acquisition.txt\"), sep=\"|\", low_memory=False)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T04:02:03.968766Z",
"start_time": "2019-11-27T04:02:03.949952Z"
},
"trusted": true,
"id": "UfMQj1RmkRaw",
"colab_type": "code",
"colab": {},
"outputId": "66a165eb-2609-4480-fd4a-36116a8fab61"
},
"source": [
"df.columns"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"Index(['id', 'channel', 'seller', 'interest_rate', 'balance', 'loan_term',\n",
" 'origination_date', 'first_payment_date', 'ltv', 'cltv',\n",
" 'borrower_count', 'dti', 'borrower_credit_score',\n",
" 'first_time_homebuyer', 'loan_purpose', 'property_type', 'unit_count',\n",
" 'occupancy_status', 'property_state', 'zip', 'insurance_percentage',\n",
" 'product_type', 'co_borrower_credit_score', 'mortgage_insurance_type',\n",
" 'relocation_mortgage_indicator'],\n",
" dtype='object')"
]
},
"metadata": {
"tags": []
},
"execution_count": 191
}
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T03:54:47.905170Z",
"start_time": "2019-11-27T03:54:47.881227Z"
},
"trusted": true,
"id": "sHcI9cJNkRaz",
"colab_type": "code",
"colab": {}
},
"source": [
"import os\n",
"import pandas as pd\n",
"\n",
"\n",
"def read():\n",
" acquisition = pd.read_csv(os.path.join(\"./\", \"Acquisition.txt\"), sep=\"|\")\n",
" return acquisition\n",
"\n",
"\n",
"def count_performance_rows():\n",
" counts = {}\n",
" with open(os.path.join(\"./\", \"Performance.txt\"), \"r\") as f:\n",
" for i, line in enumerate(f):\n",
" if i == 0:\n",
" # Skip header row\n",
" continue\n",
" loan_id, date = line.split(\"|\")\n",
" loan_id = int(loan_id)\n",
" if loan_id not in counts:\n",
" counts[loan_id] = {\"foreclosure_status\": False, \"performance_count\": 0}\n",
" counts[loan_id][\"performance_count\"] += 1\n",
" if len(date.strip()) > 0:\n",
" counts[loan_id][\"foreclosure_status\"] = True\n",
" return counts\n",
"\n",
"\n",
"def get_performance_summary_value(loan_id, key, performance_summary):\n",
" value = performance_summary.get(\n",
" loan_id, {\"foreclosure_status\": False, \"performance_count\": 0}\n",
" )\n",
" return value[key]\n",
"\n",
"\n",
"def annotate(acquisition, performance_summary):\n",
" acquisition[\"foreclosure_status\"] = acquisition[\"id\"].apply(\n",
" lambda x: get_performance_summary_value(\n",
" x, \"foreclosure_status\", performance_summary\n",
" )\n",
" )\n",
" acquisition[\"performance_count\"] = acquisition[\"id\"].apply(\n",
" lambda x: get_performance_summary_value(\n",
" x, \"performance_count\", performance_summary\n",
" )\n",
" )\n",
" for column in [\n",
" \"channel\",\n",
" \"seller\",\n",
" \"first_time_homebuyer\",\n",
" \"loan_purpose\",\n",
" \"property_type\",\n",
" \"occupancy_status\",\n",
" \"property_state\",\n",
" \"product_type\",\n",
" ]:\n",
" acquisition[column] = acquisition[column].astype(\"category\")\n",
"\n",
" for start in [\"first_payment\", \"origination\"]:\n",
" column = \"{}_date\".format(start)\n",
" acquisition[\"{}_year\".format(start)] = pd.to_numeric(\n",
" acquisition[column].str.split(\"/\").str.get(1)\n",
" )\n",
" acquisition[\"{}_month\".format(start)] = pd.to_numeric(\n",
" acquisition[column].str.split(\"/\").str.get(0)\n",
" )\n",
" del acquisition[column]\n",
"\n",
" acquisition = acquisition[acquisition[\"performance_count\"] > 7]\n",
" return acquisition\n",
"\n",
"\n",
"def write(acquisition):\n",
" acquisition.to_csv(os.path.join(\"./\", \"train.csv\"), index=False)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T03:58:05.681984Z",
"start_time": "2019-11-27T03:54:49.208495Z"
},
"scrolled": true,
"trusted": true,
"id": "bYruJprJkRa2",
"colab_type": "code",
"colab": {},
"outputId": "d7db026d-a928-48d0-8565-bf275b2bcfb2"
},
"source": [
"acquisition = read()\n",
"performance_summary = count_performance_rows()\n",
"acquisition = annotate(acquisition, performance_summary)\n",
"write(acquisition)"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"distributed.comm.tcp - WARNING - Closing dangling stream in <TCP local=tcp://127.0.0.1:56626 remote=tcp://127.0.0.1:44207>\n",
"distributed.comm.tcp - WARNING - Closing dangling stream in <TCP local=tcp://127.0.0.1:56628 remote=tcp://127.0.0.1:44207>\n",
"distributed.comm.tcp - WARNING - Closing dangling stream in <TCP local=tcp://127.0.0.1:56704 remote=tcp://127.0.0.1:44207>\n",
"distributed.comm.tcp - WARNING - Closing dangling stream in <TCP local=tcp://127.0.0.1:56816 remote=tcp://127.0.0.1:44207>\n",
"distributed.comm.tcp - WARNING - Closing dangling stream in <TCP local=tcp://127.0.0.1:56868 remote=tcp://127.0.0.1:44207>\n",
"distributed.comm.tcp - WARNING - Closing dangling stream in <TCP local=tcp://127.0.0.1:56920 remote=tcp://127.0.0.1:44207>\n",
"distributed.comm.tcp - WARNING - Closing dangling stream in <TCP local=tcp://127.0.0.1:56964 remote=tcp://127.0.0.1:44207>\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T05:10:02.171631Z",
"start_time": "2019-11-27T05:09:54.951619Z"
},
"trusted": true,
"id": "ddqbrozSkRa4",
"colab_type": "code",
"colab": {}
},
"source": [
"pp = pd.read_csv(\"./train.csv\")"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T05:10:04.125342Z",
"start_time": "2019-11-27T05:10:03.991490Z"
},
"trusted": true,
"id": "Ux8BsJOekRa7",
"colab_type": "code",
"colab": {},
"outputId": "3a6879aa-790e-489a-9383-81abc6a2c798"
},
"source": [
"pp.query(\"foreclosure_status == True\")"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>channel</th>\n",
" <th>seller</th>\n",
" <th>interest_rate</th>\n",
" <th>balance</th>\n",
" <th>loan_term</th>\n",
" <th>ltv</th>\n",
" <th>cltv</th>\n",
" <th>borrower_count</th>\n",
" <th>dti</th>\n",
" <th>...</th>\n",
" <th>product_type</th>\n",
" <th>co_borrower_credit_score</th>\n",
" <th>mortgage_insurance_type</th>\n",
" <th>relocation_mortgage_indicator</th>\n",
" <th>foreclosure_status</th>\n",
" <th>performance_count</th>\n",
" <th>first_payment_year</th>\n",
" <th>first_payment_month</th>\n",
" <th>origination_year</th>\n",
" <th>origination_month</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>504</th>\n",
" <td>100919899030</td>\n",
" <td>R</td>\n",
" <td>OTHER</td>\n",
" <td>4.125</td>\n",
" <td>178000</td>\n",
" <td>360</td>\n",
" <td>97</td>\n",
" <td>97</td>\n",
" <td>1</td>\n",
" <td>27.0</td>\n",
" <td>...</td>\n",
" <td>FRM</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" <td>N</td>\n",
" <td>True</td>\n",
" <td>17</td>\n",
" <td>2016</td>\n",
" <td>6</td>\n",
" <td>2016</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1460</th>\n",
" <td>102526639505</td>\n",
" <td>R</td>\n",
" <td>OTHER</td>\n",
" <td>5.125</td>\n",
" <td>446000</td>\n",
" <td>360</td>\n",
" <td>90</td>\n",
" <td>90</td>\n",
" <td>1</td>\n",
" <td>45.0</td>\n",
" <td>...</td>\n",
" <td>FRM</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>N</td>\n",
" <td>True</td>\n",
" <td>23</td>\n",
" <td>2016</td>\n",
" <td>8</td>\n",
" <td>2016</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7979</th>\n",
" <td>114064133628</td>\n",
" <td>R</td>\n",
" <td>OTHER</td>\n",
" <td>4.125</td>\n",
" <td>71000</td>\n",
" <td>360</td>\n",
" <td>63</td>\n",
" <td>63</td>\n",
" <td>2</td>\n",
" <td>28.0</td>\n",
" <td>...</td>\n",
" <td>FRM</td>\n",
" <td>634.0</td>\n",
" <td>NaN</td>\n",
" <td>N</td>\n",
" <td>True</td>\n",
" <td>36</td>\n",
" <td>2016</td>\n",
" <td>5</td>\n",
" <td>2016</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9097</th>\n",
" <td>115995140179</td>\n",
" <td>R</td>\n",
" <td>LOANDEPOT.COM, LLC</td>\n",
" <td>4.375</td>\n",
" <td>198000</td>\n",
" <td>360</td>\n",
" <td>80</td>\n",
" <td>80</td>\n",
" <td>1</td>\n",
" <td>39.0</td>\n",
" <td>...</td>\n",
" <td>FRM</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>N</td>\n",
" <td>True</td>\n",
" <td>23</td>\n",
" <td>2016</td>\n",
" <td>6</td>\n",
" <td>2016</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12532</th>\n",
" <td>122159215261</td>\n",
" <td>R</td>\n",
" <td>WELLS FARGO BANK, N.A.</td>\n",
" <td>4.250</td>\n",
" <td>125000</td>\n",
" <td>360</td>\n",
" <td>71</td>\n",
" <td>71</td>\n",
" <td>1</td>\n",
" <td>30.0</td>\n",
" <td>...</td>\n",
" <td>FRM</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>N</td>\n",
" <td>True</td>\n",
" <td>29</td>\n",
" <td>2016</td>\n",
" <td>6</td>\n",
" <td>2016</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4086263</th>\n",
" <td>982336827813</td>\n",
" <td>R</td>\n",
" <td>FRANKLIN AMERICAN MORTGAGE COMPANY</td>\n",
" <td>4.250</td>\n",
" <td>70000</td>\n",
" <td>240</td>\n",
" <td>86</td>\n",
" <td>86</td>\n",
" <td>1</td>\n",
" <td>42.0</td>\n",
" <td>...</td>\n",
" <td>FRM</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>N</td>\n",
" <td>True</td>\n",
" <td>22</td>\n",
" <td>2017</td>\n",
" <td>5</td>\n",
" <td>2017</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4089682</th>\n",
" <td>989347753728</td>\n",
" <td>R</td>\n",
" <td>OTHER</td>\n",
" <td>4.625</td>\n",
" <td>54000</td>\n",
" <td>360</td>\n",
" <td>91</td>\n",
" <td>97</td>\n",
" <td>1</td>\n",
" <td>40.0</td>\n",
" <td>...</td>\n",
" <td>FRM</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>N</td>\n",
" <td>True</td>\n",
" <td>19</td>\n",
" <td>2017</td>\n",
" <td>5</td>\n",
" <td>2017</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4090723</th>\n",
" <td>991527843261</td>\n",
" <td>R</td>\n",
" <td>OTHER</td>\n",
" <td>4.625</td>\n",
" <td>360000</td>\n",
" <td>360</td>\n",
" <td>80</td>\n",
" <td>80</td>\n",
" <td>1</td>\n",
" <td>26.0</td>\n",
" <td>...</td>\n",
" <td>FRM</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>N</td>\n",
" <td>True</td>\n",
" <td>21</td>\n",
" <td>2017</td>\n",
" <td>6</td>\n",
" <td>2017</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4093356</th>\n",
" <td>997017995239</td>\n",
" <td>C</td>\n",
" <td>U.S. BANK N.A.</td>\n",
" <td>5.500</td>\n",
" <td>167000</td>\n",
" <td>360</td>\n",
" <td>95</td>\n",
" <td>95</td>\n",
" <td>1</td>\n",
" <td>32.0</td>\n",
" <td>...</td>\n",
" <td>FRM</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>N</td>\n",
" <td>True</td>\n",
" <td>12</td>\n",
" <td>2017</td>\n",
" <td>5</td>\n",
" <td>2017</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4094283</th>\n",
" <td>998998046089</td>\n",
" <td>R</td>\n",
" <td>OTHER</td>\n",
" <td>4.500</td>\n",
" <td>56000</td>\n",
" <td>240</td>\n",
" <td>68</td>\n",
" <td>68</td>\n",
" <td>1</td>\n",
" <td>34.0</td>\n",
" <td>...</td>\n",
" <td>FRM</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>N</td>\n",
" <td>True</td>\n",
" <td>16</td>\n",
" <td>2017</td>\n",
" <td>5</td>\n",
" <td>2017</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1429 rows × 29 columns</p>\n",
"</div>"
],
"text/plain": [
" id channel seller \\\n",
"504 100919899030 R OTHER \n",
"1460 102526639505 R OTHER \n",
"7979 114064133628 R OTHER \n",
"9097 115995140179 R LOANDEPOT.COM, LLC \n",
"12532 122159215261 R WELLS FARGO BANK, N.A. \n",
"... ... ... ... \n",
"4086263 982336827813 R FRANKLIN AMERICAN MORTGAGE COMPANY \n",
"4089682 989347753728 R OTHER \n",
"4090723 991527843261 R OTHER \n",
"4093356 997017995239 C U.S. BANK N.A. \n",
"4094283 998998046089 R OTHER \n",
"\n",
" interest_rate balance loan_term ltv cltv borrower_count dti \\\n",
"504 4.125 178000 360 97 97 1 27.0 \n",
"1460 5.125 446000 360 90 90 1 45.0 \n",
"7979 4.125 71000 360 63 63 2 28.0 \n",
"9097 4.375 198000 360 80 80 1 39.0 \n",
"12532 4.250 125000 360 71 71 1 30.0 \n",
"... ... ... ... ... ... ... ... \n",
"4086263 4.250 70000 240 86 86 1 42.0 \n",
"4089682 4.625 54000 360 91 97 1 40.0 \n",
"4090723 4.625 360000 360 80 80 1 26.0 \n",
"4093356 5.500 167000 360 95 95 1 32.0 \n",
"4094283 4.500 56000 240 68 68 1 34.0 \n",
"\n",
" ... product_type co_borrower_credit_score mortgage_insurance_type \\\n",
"504 ... FRM NaN 2.0 \n",
"1460 ... FRM NaN 1.0 \n",
"7979 ... FRM 634.0 NaN \n",
"9097 ... FRM NaN NaN \n",
"12532 ... FRM NaN NaN \n",
"... ... ... ... ... \n",
"4086263 ... FRM NaN 1.0 \n",
"4089682 ... FRM NaN 1.0 \n",
"4090723 ... FRM NaN NaN \n",
"4093356 ... FRM NaN 1.0 \n",
"4094283 ... FRM NaN NaN \n",
"\n",
" relocation_mortgage_indicator foreclosure_status performance_count \\\n",
"504 N True 17 \n",
"1460 N True 23 \n",
"7979 N True 36 \n",
"9097 N True 23 \n",
"12532 N True 29 \n",
"... ... ... ... \n",
"4086263 N True 22 \n",
"4089682 N True 19 \n",
"4090723 N True 21 \n",
"4093356 N True 12 \n",
"4094283 N True 16 \n",
"\n",
" first_payment_year first_payment_month origination_year \\\n",
"504 2016 6 2016 \n",
"1460 2016 8 2016 \n",
"7979 2016 5 2016 \n",
"9097 2016 6 2016 \n",
"12532 2016 6 2016 \n",
"... ... ... ... \n",
"4086263 2017 5 2017 \n",
"4089682 2017 5 2017 \n",
"4090723 2017 6 2017 \n",
"4093356 2017 5 2017 \n",
"4094283 2017 5 2017 \n",
"\n",
" origination_month \n",
"504 4 \n",
"1460 6 \n",
"7979 3 \n",
"9097 4 \n",
"12532 4 \n",
"... ... \n",
"4086263 3 \n",
"4089682 3 \n",
"4090723 4 \n",
"4093356 3 \n",
"4094283 3 \n",
"\n",
"[1429 rows x 29 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 202
}
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T05:23:10.898241Z",
"start_time": "2019-11-27T05:23:09.620262Z"
},
"trusted": true,
"id": "hIbfeQk7kRa9",
"colab_type": "code",
"colab": {},
"outputId": "b6e06175-acd8-4bec-b374-4be2935e8ab2"
},
"source": [
"plt.figure(figsize=(15, 20))\n",
"\n",
"# Sanity check: once NaNs are replaced by -1, every column should report 0% missing.\n",
"df = pd.read_csv(\"train.csv\", nrows=200000).fillna(-1)\n",
"\n",
"# First panel of a 2x3 grid.\n",
"plt.subplot(2, 3, 1)\n",
"\n",
"# Share of null cells per column (in percent), as a one-column frame for the heatmap.\n",
"sns.heatmap(\n",
"    df.isnull().sum().div(df.shape[0]).mul(100).to_frame(),\n",
"    linecolor=\"white\",\n",
"    linewidth=1,\n",
"    cmap=sns.color_palette(\"cool\"),\n",
"    annot=True,\n",
")\n",
"plt.title(\"ALL ZEROS\")"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"Text(0.5, 1.0, 'ALL ZEROS')"
]
},
"metadata": {
"tags": []
},
"execution_count": 204
},
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 1080x1440 with 2 Axes>"
]
},
"metadata": {
"tags": [],
"needs_background": "light"
}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "zAZ_ZDThkRa_",
"colab_type": "text"
},
"source": [
"Now that we have two years of data, let's do some good old-fashioned data munging.\n",
"\n",
"- First we have to balance out this dataset. There are 1429 rows with `foreclosure_status` set to true in the two years' worth of data, and I still have to check the dates of contract execution. I would love for there to be an equal number of true and false rows, but that's not how life works. So we're going to go wide. What I mean by that is: break out the feature engineering skills and turn those 29 columns into hundreds. But first, let's munge.\n",
"\n",
"1. What's the goal here? We obviously have to remove `performance_count`, as that will allow 100% accuracy, and we want to create a dataset that is similar, if not identical, to the data a bank or lender would have at the time of decision making."
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T02:30:06.030427Z",
"start_time": "2019-11-27T02:15:49.004Z"
},
"trusted": true,
"id": "_vLXxBVzkRbA",
"colab_type": "code",
"colab": {}
},
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Load the first million rows and zero-fill the missing values.\n",
"df = pd.read_csv(\"train.csv\", nrows=1000000).fillna(0)\n",
"\n",
"# performance_count would allow 100% accuracy, so it is removed.\n",
"df = df.drop(columns=\"performance_count\")\n",
"\n",
"# Candidate features: every remaining column that is not listed in NON_PREDICTORS.\n",
"predictors = [name for name in df.columns.tolist() if name not in NON_PREDICTORS]\n",
"\n",
"# Reorder the frame so that the class column comes last.\n",
"cols = df.columns.tolist()\n",
"cols.remove(\"foreclosure_status\")\n",
"df = df[cols + [\"foreclosure_status\"]]\n",
"\n",
"# All columns but the last one / the last column only.\n",
"train_cols = df.columns[:-1]\n",
"label = df.columns[-1:]\n",
"\n",
"# Feature matrix and target used by the modelling cells below.\n",
"X = df[predictors]\n",
"y = df[TARGET]"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T02:30:06.041180Z",
"start_time": "2019-11-27T02:15:49.545Z"
},
"trusted": true,
"id": "fAXQzMtKkRbC",
"colab_type": "code",
"colab": {}
},
"source": [
"seed = 1\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" X, y, test_size=0.20, random_state=seed\n",
")"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T02:30:06.049836Z",
"start_time": "2019-11-27T02:15:51.955Z"
},
"trusted": true,
"id": "GV79FepEkRbE",
"colab_type": "code",
"colab": {}
},
"source": [
"# Only the last expression of a cell is rendered automatically, so the table must be\n",
"# displayed explicitly; otherwise it is built and silently discarded.\n",
"# `display` is injected into the namespace by the IPython kernel.\n",
"display(TableDisplay(X_train.head(30)))\n",
"\n",
"# Column names of the training features (shown as the cell's result).\n",
"X_train.columns"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"trusted": true,
"id": "sf8XVsTUkRbG",
"colab_type": "code",
"colab": {}
},
"source": [
""
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T02:30:06.058633Z",
"start_time": "2019-11-27T02:15:53.148Z"
},
"trusted": true,
"id": "KTsWHSuikRbI",
"colab_type": "code",
"colab": {}
},
"source": [
"from interpret.provider import InlineProvider\n",
"from interpret import set_visualize_provider\n",
"\n",
"set_visualize_provider(InlineProvider())"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T02:30:06.066792Z",
"start_time": "2019-11-27T02:15:54.241Z"
},
"trusted": true,
"id": "hNAWnnkhkRbL",
"colab_type": "code",
"colab": {}
},
"source": [
"from interpret.glassbox import ExplainableBoostingClassifier\n",
"\n",
"ebm = ExplainableBoostingClassifier()\n",
"ebm.fit(X_train, y_train)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T02:30:06.079971Z",
"start_time": "2019-11-27T02:16:15.055Z"
},
"trusted": true,
"id": "nPORCIkikRbN",
"colab_type": "code",
"colab": {}
},
"source": [
"import pickle\n",
"\n",
"# Serialize the EBM model to disk.\n",
"pkl_filename = \"fannie_model.pkl\"\n",
"with open(pkl_filename, mode=\"wb\") as file:\n",
"    file.write(pickle.dumps(ebm))"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"trusted": true,
"id": "rb17_qp_kRbQ",
"colab_type": "code",
"colab": {}
},
"source": [
""
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T02:30:06.088354Z",
"start_time": "2019-11-27T02:16:15.704Z"
},
"trusted": true,
"id": "v7jhFT9ukRbT",
"colab_type": "code",
"colab": {}
},
"source": [
"from interpret import show\n",
"from interpret.data import ClassHistogram"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T02:30:06.106914Z",
"start_time": "2019-11-27T02:16:16.700Z"
},
"trusted": true,
"id": "fNc1cAWokRbU",
"colab_type": "code",
"colab": {}
},
"source": [
"ebm_global = ebm.explain_global(name=\"EBM\")\n",
"show(ebm_global)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T02:30:06.118000Z",
"start_time": "2019-11-27T02:16:17.698Z"
},
"trusted": true,
"id": "hNVitomlkRbW",
"colab_type": "code",
"colab": {}
},
"source": [
"from interpret import show\n",
"from interpret.data import ClassHistogram\n",
"\n",
"hist = ClassHistogram().explain_data(X_train, y_train, name=\"Train Data\")\n",
"show(hist)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T02:30:06.126851Z",
"start_time": "2019-11-27T02:16:18.789Z"
},
"trusted": true,
"id": "G0TsBbUdkRbY",
"colab_type": "code",
"colab": {}
},
"source": [
"# Explain the model's predictions for the first 1000 rows of the test split\n",
"# (positional slice, independent of the shuffled index labels).\n",
"ebm_local = ebm.explain_local(\n",
"    X_test.iloc[:1000],\n",
"    y_test.iloc[:1000],\n",
"    name=\"EBM\",\n",
")\n",
"show(ebm_local)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T02:30:06.134674Z",
"start_time": "2019-11-27T02:16:19.508Z"
},
"trusted": true,
"id": "eDbYUH_FkRbb",
"colab_type": "code",
"colab": {}
},
"source": [
"from interpret.perf import ROC\n",
"from treeinterpreter import treeinterpreter as ti\n",
"\n",
"ebm_perf = ROC(ebm.predict_proba).explain_perf(X_test, y_test, name=\"EBM\")\n",
"\n",
"show(ebm_perf)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T02:30:06.143234Z",
"start_time": "2019-11-27T02:16:20.972Z"
},
"trusted": true,
"id": "Z56rGrwlkRbd",
"colab_type": "code",
"colab": {}
},
"source": [
"from interpret.glassbox import LogisticRegression, ClassificationTree\n",
"\n",
"# Logistic regression and the decision tree need numeric inputs, so the categorical\n",
"# columns are one-hot encoded (dummy columns are named column.value).\n",
"X_enc = pd.get_dummies(X, prefix_sep=\".\")\n",
"feature_names = list(X_enc.columns)\n",
"\n",
"# Uses the same test_size and `seed` as the split of X, so the rows should land in the\n",
"# same partitions; y_train / y_test are re-assigned here as a side effect.\n",
"X_train_enc, X_test_enc, y_train, y_test = train_test_split(\n",
"    X_enc, y, test_size=0.20, random_state=seed\n",
")\n",
"\n",
"# The L1 penalty needs a solver that supports it. scikit-learn's default solver\n",
"# (lbfgs since 0.22) rejects penalty=\"l1\", so liblinear is requested explicitly.\n",
"lr = LogisticRegression(\n",
"    random_state=seed,\n",
"    feature_names=feature_names,\n",
"    penalty=\"l1\",\n",
"    solver=\"liblinear\",\n",
")\n",
"lr.fit(X_train_enc, y_train)\n",
"\n",
"tree = ClassificationTree()\n",
"tree.fit(X_train_enc, y_train)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T02:30:06.152741Z",
"start_time": "2019-11-27T02:16:21.837Z"
},
"trusted": true,
"id": "h1LJdIXPkRbg",
"colab_type": "code",
"colab": {}
},
"source": [
"# NOTE(review): the star import presumably supplies OutputContainer and\n",
"# TabbedOutputContainerLayoutManager; kept as-is because other cells may rely on it.\n",
"from beakerx import *\n",
"\n",
"# ROC performance explanations for the two models trained on the one-hot encoded data.\n",
"lr_perf = ROC(lr.predict_proba).explain_perf(\n",
"    X_test_enc, y_test, name=\"Logistic Regression\"\n",
")\n",
"tree_perf = ROC(tree.predict_proba).explain_perf(\n",
"    X_test_enc, y_test, name=\"Classification Tree\"\n",
")\n",
"\n",
"# Tabbed container with one tab per model.\n",
"# NOTE(review): confirm that show() returns a displayable object; if it returns None\n",
"# the tabs stay empty and the charts are rendered inline instead.\n",
"l = TabbedOutputContainerLayoutManager()\n",
"l.setBorderDisplayed(False)\n",
"o = OutputContainer()\n",
"o.setLayoutManager(l)\n",
"o.addItem(show(lr_perf), \"Logistic Regression\")  # label was truncated to \"Logistic Regressio\"\n",
"o.addItem(show(tree_perf), \"Classification Tree\")\n",
"o.addItem(show(ebm_perf), \"ebm\")\n",
"\n",
"o"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-27T02:30:06.161684Z",
"start_time": "2019-11-27T02:16:22.787Z"
},
"trusted": true,
"id": "blPJxnF-kRbj",
"colab_type": "code",
"colab": {}
},
"source": [
"# Global explanations for the logistic regression and the classification tree.\n",
"lr_global = lr.explain_global(name=\"LR\")\n",
"tree_global = tree.explain_global(name=\"Tree\")\n",
"\n",
"# Render the LR, tree and EBM explanations, in that order.\n",
"for explanation in (lr_global, tree_global, ebm_global):\n",
"    show(explanation)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"trusted": true,
"id": "Wh759-RHkRbl",
"colab_type": "code",
"colab": {}
},
"source": [
""
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"trusted": true,
"id": "6xsQCE8zkRb2",
"colab_type": "code",
"colab": {}
},
"source": [
""
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"trusted": true,
"id": "Ncs4dcg2kRb3",
"colab_type": "code",
"colab": {}
},
"source": [
"# Explainers:\n",
"# 1. SHAP Tabular Explainer\n",
"from interpret.ext.blackbox import TabularExplainer\n",
"\n",
"# OR\n",
"\n",
"# 2. Mimic Explainer\n",
"from interpret.ext.blackbox import MimicExplainer\n",
"\n",
"# You can use one of the following four interpretable models as a global surrogate to the black box model\n",
"from interpret.ext.glassbox import LGBMExplainableModel\n",
"from interpret.ext.glassbox import LinearExplainableModel\n",
"from interpret.ext.glassbox import SGDExplainableModel\n",
"from interpret.ext.glassbox import DecisionTreeExplainableModel\n",
"\n",
"# OR\n",
"\n",
"# 3. PFI Explainer\n",
"from interpret.ext.blackbox import PFIExplainer"
],
"execution_count": 0,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment