Created
January 18, 2019 02:38
-
-
Save jonathanedelman/ba61dcdac4a68eedf3cbe550438c6acc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from demyst.analytics import Analytics \n", | |
"import pandas as pd\n", | |
"\n", | |
"analytics = Analytics()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# read in the full file\n", | |
"original_sample = pd.read_csv(\"/Users/jonathanedelman/Downloads/FOIA - 7(a)(FY2010-Present).csv\", encoding=\"latin-1\", low_memory=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Filtering the original file down to an N of 5000. \n", | |
"# 1000 chargeoffs and 4000 paid in full \n", | |
"\n", | |
"# Change this number for a smaller sample\n", | |
"N = 5000\n", | |
"# Take the more recent loans\n", | |
"recent_sample = original_sample.loc[ 2016 <= original_sample[\"ApprovalFiscalYear\"]]\n", | |
"# Identify the charge offs and the paid in full\n", | |
"recent_charge_offs = recent_sample.loc[original_sample[\"LoanStatus\"] == \"CHGOFF\"]\n", | |
"recent_paid_in_fulls = recent_sample.loc[original_sample[\"LoanStatus\"] == \"PIF\"]\n", | |
"# Take 1/5 of the sample as charge offs, and 4/5 as paid in full\n", | |
"recent_charge_off_sample = recent_charge_offs.sample(int(N/5))\n", | |
"recent_paid_in_full_sample = recent_paid_in_fulls.sample(int(N*4/5))\n", | |
"sample = pd.concat([recent_charge_off_sample, recent_paid_in_full_sample])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Make binary variable binary numbers\n", | |
"sample.loc[sample['LoanStatus'] == \"CHGOFF\", 'LoanStatus'] = 1\n", | |
"sample.loc[sample['LoanStatus'] == \"PIF\", 'LoanStatus'] = 0" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Define Source List\n", | |
"source_list = ['attom_avm', 'attom_commercial_avm', 'attom_pre_foreclosure', 'attom_commercial_pre_foreclosure', \n", | |
" 'attom_tax_assessor', 'attom_commercial_tax_assessor', 'info_connect_company', 'google_latlon', \n", | |
" 'infutor_property_append', 'utilityscore_bill', 'utilityscore_savings', 'dnb_find_company', \n", | |
" 'acxiom_place', 'walkscore', 'housecanary_property_mortgage_lien', 'bing'\n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Doing a single lookup\n", | |
"test_attempt_results = analytics.enrich_and_download(source_list, sample[:1])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"sample.columns" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# renaming existing column names to demyst types where applicable\n", | |
"sample.rename(columns = {'BorrName':'business_name'}, inplace = True)\n", | |
"sample.rename(columns = {'BorrStreet':'street'}, inplace = True)\n", | |
"sample.rename(columns = {'BorrCity':'city'}, inplace = True)\n", | |
"sample.rename(columns = {'BorrState':'state'}, inplace = True)\n", | |
"sample.rename(columns = {'BorrZip':'post_code'}, inplace = True)\n", | |
"\n", | |
"# add country, since it is common and easy\n", | |
"sample['country'] = \"us\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"analytics.validate2(sample)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Follow advice of the validate2 output\n", | |
"sample['post_code'] = sample['post_code'].astype(str)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"analytics.validate(sample, providers=source_list)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# as to not waste money when doing screenshots\n", | |
"# sample = sample[1:5]\n", | |
"sample" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Bing requires a \"query\" column\n", | |
"sample['query'] = sample['business_name'] + \" \" + sample[\"city\"] + \", \" + sample[\"state\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Append Attom ID for attom products\n", | |
"\n", | |
"attom_results = analytics.enrich_and_download(['attom_expanded_profile_report'], sample)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"sample['attom_id'] = attom_results['attom_expanded_profile_report.attom_id']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Append latitude and longitude for Walkscore\n", | |
"\n", | |
"google_latlon_inputs = pd.DataFrame(columns=['street', 'country'])\n", | |
"\n", | |
"# For google_latlon, combine all address elements for the street column. For most Demyst providers, street is simply the street address.\n", | |
"google_latlon_inputs['street'] = sample[\"street\"] + \" \" + sample[\"city\"] + \", \" + sample[\"state\"] + \" \" + sample[\"post_code\"].map(str)\n", | |
"\n", | |
"# We are using US Businesses Only\n", | |
"google_latlon_inputs['country'] = 'us'\n", | |
"\n", | |
"google_latlon_results = analytics.enrich_and_download(['google_latlon' ], google_latlon_inputs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"sample['longitude'] = google_latlon_results['google_latlon.longitude']\n", | |
"sample['latitude'] = google_latlon_results['google_latlon.latitude']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Already appended google_latlon, so do not need to do it again.\n", | |
"updated_source_list = [\n", | |
" 'attom_avm', 'attom_commercial_avm', 'attom_pre_foreclosure', 'attom_commercial_pre_foreclosure', \n", | |
" 'attom_tax_assessor', 'attom_commercial_tax_assessor', 'info_connect_company', \n", | |
" 'infutor_property_append', 'utilityscore_bill', 'utilityscore_savings', 'dnb_find_company', \n", | |
" 'acxiom_place', 'walkscore', 'housecanary_property_mortgage_lien', 'bing'\n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"results = analytics.enrich_and_download(source_list, sample, validate=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Create 2 data frames: 1 with the residentia data, and one with the commercial data.\n", | |
"attom_avm_results = results.filter(regex='attom_avm', axis=1)\n", | |
"attom_pre_foreclosure_results = results.filter(regex='attom_pre_foreclosure', axis=1)\n", | |
"attom_tax_assessor_results = results.filter(regex='attom_tax_assessor', axis=1)\n", | |
"attom_residential = pd.concat([attom_avm_results, attom_pre_foreclosure_results, attom_tax_assessor_results], axis=1)\n", | |
"\n", | |
"attom_commercial_avm_results = results.filter(regex='attom_commercial_avm', axis=1)\n", | |
"attom_commercial_pre_foreclosure_results = results.filter(regex='attom_commercial_pre_foreclosure', axis=1)\n", | |
"attom_commercial_tax_assessor_results = results.filter(regex='attom_commercial_tax_assessor_results', axis=1)\n", | |
"attom_commercial = pd.concat([attom_commercial_avm_results, attom_commercial_pre_foreclosure_results, attom_commercial_tax_assessor_results], axis=1)\n", | |
"\n", | |
"# Rename the columns so we have identical columns from the two data frames\n", | |
"attom_residential.columns = attom_residential.columns.str.replace(\"attom_\", \"attom_combined_\")\n", | |
"attom_commercial.columns = attom_commercial.columns.str.replace(\"attom_commercial_\", \"attom_combined_\")\n", | |
"\n", | |
"# Do a combine_first \n", | |
"attom_combined = attom_commercial.combine_first(attom_residential)\n", | |
"\n", | |
"# Remove original attom columns from original results\n", | |
"attom_less_results = results[results.columns.drop(list(results.filter(regex='attom_')))]\n", | |
"\n", | |
"# Combine attom results with the others (and google latlon from earlier)\n", | |
"recombined_results = pd.concat([attom_less_results, attom_combined, google_latlon_results], axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"no_dups = recombined_results.loc[:, ~recombined_results.columns.duplicated()]\n", | |
"no_dups.to_csv(\"data_robot_input.csv\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment