Skip to content

Instantly share code, notes, and snippets.

@jonathanedelman
Created January 18, 2019 02:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jonathanedelman/ba61dcdac4a68eedf3cbe550438c6acc to your computer and use it in GitHub Desktop.
Save jonathanedelman/ba61dcdac4a68eedf3cbe550438c6acc to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from demyst.analytics import Analytics \n",
"import pandas as pd\n",
"\n",
"analytics = Analytics()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# read in the full file\n",
"original_sample = pd.read_csv(\"/Users/jonathanedelman/Downloads/FOIA - 7(a)(FY2010-Present).csv\", encoding=\"latin-1\", low_memory=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Filtering the original file down to an N of 5000. \n",
"# 1000 chargeoffs and 4000 paid in full \n",
"\n",
"# Change this number for a smaller sample\n",
"N = 5000\n",
"# Take the more recent loans\n",
"recent_sample = original_sample.loc[ 2016 <= original_sample[\"ApprovalFiscalYear\"]]\n",
"# Identify the charge offs and the paid in full\n",
"recent_charge_offs = recent_sample.loc[original_sample[\"LoanStatus\"] == \"CHGOFF\"]\n",
"recent_paid_in_fulls = recent_sample.loc[original_sample[\"LoanStatus\"] == \"PIF\"]\n",
"# Take 1/5 of the sample as charge offs, and 4/5 as paid in full\n",
"recent_charge_off_sample = recent_charge_offs.sample(int(N/5))\n",
"recent_paid_in_full_sample = recent_paid_in_fulls.sample(int(N*4/5))\n",
"sample = pd.concat([recent_charge_off_sample, recent_paid_in_full_sample])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Make binary variable binary numbers\n",
"sample.loc[sample['LoanStatus'] == \"CHGOFF\", 'LoanStatus'] = 1\n",
"sample.loc[sample['LoanStatus'] == \"PIF\", 'LoanStatus'] = 0"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Define Source List\n",
"source_list = ['attom_avm', 'attom_commercial_avm', 'attom_pre_foreclosure', 'attom_commercial_pre_foreclosure', \n",
" 'attom_tax_assessor', 'attom_commercial_tax_assessor', 'info_connect_company', 'google_latlon', \n",
" 'infutor_property_append', 'utilityscore_bill', 'utilityscore_savings', 'dnb_find_company', \n",
" 'acxiom_place', 'walkscore', 'housecanary_property_mortgage_lien', 'bing'\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Doing a single lookup\n",
"test_attempt_results = analytics.enrich_and_download(source_list, sample[:1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample.columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# renaming existing column names to demyst types where applicable\n",
"sample.rename(columns = {'BorrName':'business_name'}, inplace = True)\n",
"sample.rename(columns = {'BorrStreet':'street'}, inplace = True)\n",
"sample.rename(columns = {'BorrCity':'city'}, inplace = True)\n",
"sample.rename(columns = {'BorrState':'state'}, inplace = True)\n",
"sample.rename(columns = {'BorrZip':'post_code'}, inplace = True)\n",
"\n",
"# add country, since it is common and easy\n",
"sample['country'] = \"us\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"analytics.validate2(sample)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Follow advice of the validate2 output\n",
"sample['post_code'] = sample['post_code'].astype(str)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"analytics.validate(sample, providers=source_list)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# as to not waste money when doing screenshots\n",
"# sample = sample[1:5]\n",
"sample"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Bing requires a \"query\" column\n",
"sample['query'] = sample['business_name'] + \" \" + sample[\"city\"] + \", \" + sample[\"state\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Append Attom ID for attom products\n",
"\n",
"attom_results = analytics.enrich_and_download(['attom_expanded_profile_report'], sample)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample['attom_id'] = attom_results['attom_expanded_profile_report.attom_id']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Append latitude and longitude for Walkscore\n",
"\n",
"google_latlon_inputs = pd.DataFrame(columns=['street', 'country'])\n",
"\n",
"# For google_latlon, combine all address elements for the street column. For most Demyst providers, street is simply the street address.\n",
"google_latlon_inputs['street'] = sample[\"street\"] + \" \" + sample[\"city\"] + \", \" + sample[\"state\"] + \" \" + sample[\"post_code\"].map(str)\n",
"\n",
"# We are using US Businesses Only\n",
"google_latlon_inputs['country'] = 'us'\n",
"\n",
"google_latlon_results = analytics.enrich_and_download(['google_latlon' ], google_latlon_inputs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample['longitude'] = google_latlon_results['google_latlon.longitude']\n",
"sample['latitude'] = google_latlon_results['google_latlon.latitude']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Already appended google_latlon, so do not need to do it again.\n",
"updated_source_list = [\n",
" 'attom_avm', 'attom_commercial_avm', 'attom_pre_foreclosure', 'attom_commercial_pre_foreclosure', \n",
" 'attom_tax_assessor', 'attom_commercial_tax_assessor', 'info_connect_company', \n",
" 'infutor_property_append', 'utilityscore_bill', 'utilityscore_savings', 'dnb_find_company', \n",
" 'acxiom_place', 'walkscore', 'housecanary_property_mortgage_lien', 'bing'\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"results = analytics.enrich_and_download(source_list, sample, validate=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create 2 data frames: 1 with the residentia data, and one with the commercial data.\n",
"attom_avm_results = results.filter(regex='attom_avm', axis=1)\n",
"attom_pre_foreclosure_results = results.filter(regex='attom_pre_foreclosure', axis=1)\n",
"attom_tax_assessor_results = results.filter(regex='attom_tax_assessor', axis=1)\n",
"attom_residential = pd.concat([attom_avm_results, attom_pre_foreclosure_results, attom_tax_assessor_results], axis=1)\n",
"\n",
"attom_commercial_avm_results = results.filter(regex='attom_commercial_avm', axis=1)\n",
"attom_commercial_pre_foreclosure_results = results.filter(regex='attom_commercial_pre_foreclosure', axis=1)\n",
"attom_commercial_tax_assessor_results = results.filter(regex='attom_commercial_tax_assessor_results', axis=1)\n",
"attom_commercial = pd.concat([attom_commercial_avm_results, attom_commercial_pre_foreclosure_results, attom_commercial_tax_assessor_results], axis=1)\n",
"\n",
"# Rename the columns so we have identical columns from the two data frames\n",
"attom_residential.columns = attom_residential.columns.str.replace(\"attom_\", \"attom_combined_\")\n",
"attom_commercial.columns = attom_commercial.columns.str.replace(\"attom_commercial_\", \"attom_combined_\")\n",
"\n",
"# Do a combine_first \n",
"attom_combined = attom_commercial.combine_first(attom_residential)\n",
"\n",
"# Remove original attom columns from original results\n",
"attom_less_results = results[results.columns.drop(list(results.filter(regex='attom_')))]\n",
"\n",
"# Combine attom results with the others (and google latlon from earlier)\n",
"recombined_results = pd.concat([attom_less_results, attom_combined, google_latlon_results], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"no_dups = recombined_results.loc[:, ~recombined_results.columns.duplicated()]\n",
"no_dups.to_csv(\"data_robot_input.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment