Skip to content

Instantly share code, notes, and snippets.

@knu2xs
Created April 27, 2021 22:43
Show Gist options
  • Save knu2xs/a67d0ef66a008bdec6d5f067f05fceb7 to your computer and use it in GitHub Desktop.
Save knu2xs/a67d0ef66a008bdec6d5f067f05fceb7 to your computer and use it in GitHub Desktop.
Enrich block groups using all variables directly.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "2d188ac5-2f84-4006-b8c4-4865fab342b5",
"metadata": {
"slideshow": {
"slide_type": "skip"
},
"tags": []
},
"outputs": [],
"source": [
"from functools import reduce\n",
"from pathlib import Path\n",
"\n",
"from arcgis.features import GeoAccessor\n",
"import arcpy\n",
"from modeling import Country\n",
"from modeling._registry import get_ba_data_dir_path\n",
"import pandas as pd\n",
"import pyarrow as pa\n",
"import pyarrow.parquet as pq\n",
"\n",
"# the project root is taken to be the parent of the current working directory\n",
"dir_prj = Path.cwd().parent\n",
"dir_data = dir_prj / 'data'\n",
"dir_raw, dir_int = dir_data / 'raw', dir_data / 'interim'\n",
"\n",
"# geodatabase paths inside the raw and interim data directories\n",
"gdb_raw = dir_raw / 'raw.gdb'\n",
"gdb_int = dir_int / 'interim.gdb'"
]
},
{
"cell_type": "markdown",
"id": "20e23276-c99b-4096-8a7b-ee1620c2ee05",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"source": [
"### Get the Location Where Data Resides\n",
"\n",
"This uses a function I created that reads the registry to find where the Business Analyst data is installed. If you prefer, you can set this path manually if you know where the Business Analyst data (all 50+ GB of it) is installed on your local machine."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "45d8cae6-e3bf-4810-885e-3ae5310df1e2",
"metadata": {
"slideshow": {
"slide_type": "fragment"
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"WindowsPath('D:/arcgis/ba_data/us_2020')"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# look up the Business Analyst data directory, then display it as the cell output\n",
"ba_dir = get_ba_data_dir_path()\n",
"ba_dir"
]
},
{
"cell_type": "markdown",
"id": "5b5c8d6a-f21d-43f5-81cc-caea4467dd66",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"source": [
"### Get Enrichment Variables\n",
"\n",
"This uses a capability of Demographic Modeling to get a dataframe of all enrichment variables available."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "f614ee35-ca5d-4943-82c2-cff3ecf33140",
"metadata": {
"slideshow": {
"slide_type": "subslide"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 8276 entries, 0 to 8275\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 name 8276 non-null object\n",
" 1 alias 8276 non-null object\n",
" 2 data_collection 8276 non-null object\n",
" 3 enrich_name 8276 non-null object\n",
" 4 enrich_field_name 8276 non-null object\n",
"dtypes: object(5)\n",
"memory usage: 323.4+ KB\n",
"None\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>alias</th>\n",
" <th>data_collection</th>\n",
" <th>enrich_name</th>\n",
" <th>enrich_field_name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AGE0_CY</td>\n",
" <td>2020 Population Age &lt;1</td>\n",
" <td>1yearincrements</td>\n",
" <td>1yearincrements.AGE0_CY</td>\n",
" <td>F1yearincrements_AGE0_CY</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AGE1_CY</td>\n",
" <td>2020 Population Age 1</td>\n",
" <td>1yearincrements</td>\n",
" <td>1yearincrements.AGE1_CY</td>\n",
" <td>F1yearincrements_AGE1_CY</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AGE2_CY</td>\n",
" <td>2020 Population Age 2</td>\n",
" <td>1yearincrements</td>\n",
" <td>1yearincrements.AGE2_CY</td>\n",
" <td>F1yearincrements_AGE2_CY</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AGE3_CY</td>\n",
" <td>2020 Population Age 3</td>\n",
" <td>1yearincrements</td>\n",
" <td>1yearincrements.AGE3_CY</td>\n",
" <td>F1yearincrements_AGE3_CY</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AGE4_CY</td>\n",
" <td>2020 Population Age 4</td>\n",
" <td>1yearincrements</td>\n",
" <td>1yearincrements.AGE4_CY</td>\n",
" <td>F1yearincrements_AGE4_CY</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name alias data_collection enrich_name \\\n",
"0 AGE0_CY 2020 Population Age <1 1yearincrements 1yearincrements.AGE0_CY \n",
"1 AGE1_CY 2020 Population Age 1 1yearincrements 1yearincrements.AGE1_CY \n",
"2 AGE2_CY 2020 Population Age 2 1yearincrements 1yearincrements.AGE2_CY \n",
"3 AGE3_CY 2020 Population Age 3 1yearincrements 1yearincrements.AGE3_CY \n",
"4 AGE4_CY 2020 Population Age 4 1yearincrements 1yearincrements.AGE4_CY \n",
"\n",
" enrich_field_name \n",
"0 F1yearincrements_AGE0_CY \n",
"1 F1yearincrements_AGE1_CY \n",
"2 F1yearincrements_AGE2_CY \n",
"3 F1yearincrements_AGE3_CY \n",
"4 F1yearincrements_AGE4_CY "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# all local enrichment variables for the USA, keeping one row per variable name\n",
"evars = Country('USA', source='local').enrich_variables.drop_duplicates('name').reset_index(drop=True)\n",
"\n",
"# info() prints its own summary and returns None, so wrapping it in print() only adds a stray 'None'\n",
"evars.info()\n",
"evars.head()"
]
},
{
"cell_type": "markdown",
"id": "d3818b03-b3ec-4913-bdc7-005f186b340b",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"source": [
"### Get Training Data\n",
"\n",
"Although we are going to do a lot more with this later in the workflow in another notebook, right now we need the unique identifiers from this data to filter the block group demographic data to just the features we are interested in."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "a2b7d676-3251-4b4b-a345-79d671396a63",
"metadata": {
"slideshow": {
"slide_type": "subslide"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wall time: 170 ms\n"
]
}
],
"source": [
"%%time\n",
"# feature class holding the block groups, and the field carrying their identifiers\n",
"bg_pth = gdb_raw/'block_group_patterns'\n",
"id_col = 'fips'\n",
"\n",
"# load only the identifier column as a table - skipping the geometry is MUCH faster\n",
"bg_df = GeoAccessor.from_table(bg_pth, fields=[id_col])\n",
"\n",
"# wrap every identifier in single quotes so it reads as a string literal in the query\n",
"id_lst = [f\"'{v}'\" for v in bg_df[id_col]]\n",
"\n",
"# comma separate the quoted identifiers and drop them into an IN (...) where clause\n",
"id_str = ','.join(id_lst)\n",
"id_where_clause = f\"ID IN ({id_str})\""
]
},
{
"cell_type": "markdown",
"id": "af81dd54-f182-47f7-b42b-dbf2bcbfdc8e",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"source": [
"### Get Demographics\n",
"\n",
"Because the Enrich tool runs fairly slowly, we retrieve the data by walking down the directory tree and reading it directly from the feature classes. We do not need any geometry, so we leave it off, which also simplifies saving the data as a Parquet file."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "90ba08a5-fa89-41f8-9ef6-3f7cdcfa2798",
"metadata": {
"slideshow": {
"slide_type": "subslide"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 1148 entries, 0 to 1147\n",
"Columns: 8251 entries, ID_x to RETBUS7225\n",
"dtypes: float64(1971), int64(6276), object(4)\n",
"memory usage: 72.3+ MB\n",
"Wall time: 2min 2s\n"
]
}
],
"source": [
"%%time\n",
"enrich_fields = list(evars.name)\n",
"enrich_field_set = set(enrich_fields)  # constant-time membership tests for the variable names\n",
"existing_fields = set()  # every field retrieved so far, across ALL feature classes\n",
"enrich_df_lst = []\n",
"uninteresting_fld_lst = ['ObjectID', 'Shape_Length', 'Shape_Area', 'OBJECTID', 'Shape', 'RG_ABBREV', 'RG_NAME']\n",
"\n",
"# crawl down into the BA data directory and visit every polygon feature class\n",
"for dir_top, _, obj_lst in arcpy.da.Walk(str(ba_dir), datatype='FeatureClass', type='Polygon'):\n",
"\n",
"    # for every feature class in this parent directory\n",
"    for obj in obj_lst:\n",
"\n",
"        # skip anything not following the naming convention for a block group feature class\n",
"        if 'BlockGroups_bg' not in obj:\n",
"            continue\n",
"\n",
"        # create a full path to the dataset and list its fields once\n",
"        fc_pth = Path(dir_top)/obj\n",
"        fc_flds = [f.name for f in arcpy.ListFields(str(fc_pth))]\n",
"\n",
"        # keep the requested enrichment fields or, when none were requested, everything but housekeeping fields\n",
"        if len(enrich_fields) > 0:\n",
"            tmp_flds = [c for c in fc_flds if c in enrich_field_set]\n",
"        else:\n",
"            tmp_flds = [c for c in fc_flds if c not in uninteresting_fld_lst]\n",
"\n",
"        # drop the join key (added back below) and any field an earlier feature class already supplied\n",
"        tmp_flds = [c for c in tmp_flds if c != 'ID' and c not in existing_fields]\n",
"\n",
"        # a lot of the feature classes do not yield any data we are interested in\n",
"        if len(tmp_flds) == 0:\n",
"            continue\n",
"\n",
"        # accumulate (not overwrite) the retrieved fields so no later feature class reads them again\n",
"        existing_fields.update(tmp_flds)\n",
"\n",
"        # ensure the id field is included for joining\n",
"        tmp_flds = ['ID'] + tmp_flds\n",
"\n",
"        # use a search cursor to get the data; the with block releases the cursor when done\n",
"        with arcpy.da.SearchCursor(str(fc_pth), field_names=tmp_flds, where_clause=id_where_clause) as cursor:\n",
"            tmp_df = pd.DataFrame([r for r in cursor], columns=tmp_flds)\n",
"\n",
"        # tack on the retrieved data\n",
"        enrich_df_lst.append(tmp_df)\n",
"\n",
"# fail with a clear message instead of the opaque TypeError reduce() raises on an empty list\n",
"if len(enrich_df_lst) == 0:\n",
"    raise ValueError(f'No block group enrichment fields were found under {ba_dir}')\n",
"\n",
"# join on the block group ID, not row position, so differing cursor row order cannot misalign the data;\n",
"# outer keeps every block group, and validate raises if an ID is repeated within a feature class\n",
"enrich_df = reduce(lambda df1, df2: pd.merge(df1, df2, on='ID', how='outer', validate='one_to_one'), enrich_df_lst)\n",
"\n",
"# defensive: ensure no duplicated columns\n",
"enrich_df = enrich_df.loc[:,~enrich_df.columns.duplicated()]\n",
"\n",
"enrich_df.info()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "b9cda6b1-8109-40a1-b773-e10c672b1b50",
"metadata": {
"slideshow": {
"slide_type": "subslide"
},
"tags": []
},
"outputs": [],
"source": [
"# convert the combined demographics dataframe into an Arrow table ready for writing\n",
"enrich_pa = pa.Table.from_pandas(enrich_df)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "0e79f92a-0b8d-4228-8031-774b545b698d",
"metadata": {
"slideshow": {
"slide_type": "fragment"
},
"tags": []
},
"outputs": [],
"source": [
"# write the table uncompressed; 'NONE' is the spelling pyarrow documents for no compression\n",
"pq.write_table(enrich_pa, dir_raw/'block_group_demographics.parquet', compression='NONE')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment