Created
August 12, 2019 22:07
-
-
Save kuanb/c37d748df6988810b3977e530eb1fc5c to your computer and use it in GitHub Desktop.
Example of how to clean out null, empty, and invalid geoms with just shapely in a notebook
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import geopandas as gpd\n", | |
"\n", | |
"# Note: I went to https://www.fws.gov/wetlands/Data/State-Downloads.html\n", | |
"# and downloaded data for all of California as a GeoDataBase\n", | |
"\n", | |
"# This data now sits locally at the following path:\n", | |
"path = 'hunter_help/wetlands.gdb'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['California',\n", | |
" 'CA_Riparian',\n", | |
" 'CA_Riparian_Project_Metadata',\n", | |
" 'CA_Wetlands_Project_Metadata',\n", | |
" 'CA_Wetlands_Historic_Map_Info',\n", | |
" 'CA_Wetlands']" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import fiona\n", | |
"layer_list = fiona.listlayers(path)\n", | |
"layer_list" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'CA_Wetlands'" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"wetlands_layer = layer_list[-1]\n", | |
"wetlands_layer" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>ATTRIBUTE</th>\n", | |
" <th>WETLAND_TYPE</th>\n", | |
" <th>ACRES</th>\n", | |
" <th>GLOBALID</th>\n", | |
" <th>Shape_Length</th>\n", | |
" <th>Shape_Area</th>\n", | |
" <th>geometry</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>L1ABHh</td>\n", | |
" <td>Lake</td>\n", | |
" <td>6.733421</td>\n", | |
" <td>{74D040F8-F10D-49BE-8EAA-32A8B4C124BD}</td>\n", | |
" <td>798.569800</td>\n", | |
" <td>27249.187261</td>\n", | |
" <td>(POLYGON ((-2041287.73 2382092.48, -2041139.59...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>L1ABHh</td>\n", | |
" <td>Lake</td>\n", | |
" <td>20.045324</td>\n", | |
" <td>{378DDF89-18E6-4701-AFCA-1F4F3582E328}</td>\n", | |
" <td>1553.398708</td>\n", | |
" <td>81120.546350</td>\n", | |
" <td>(POLYGON ((-2099088.98 2404971.49, -2099122.74...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>L1UBH</td>\n", | |
" <td>Lake</td>\n", | |
" <td>37.110679</td>\n", | |
" <td>{2A846F3F-7274-4F6D-8ECC-3BA31053CA1D}</td>\n", | |
" <td>2071.552911</td>\n", | |
" <td>150181.590852</td>\n", | |
" <td>(POLYGON ((-2090727.470000001 2409430.18, -209...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>L1UBH</td>\n", | |
" <td>Lake</td>\n", | |
" <td>5.525569</td>\n", | |
" <td>{35F06B1A-6C5F-4653-B84A-92434F80C797}</td>\n", | |
" <td>546.365061</td>\n", | |
" <td>22361.183363</td>\n", | |
" <td>(POLYGON ((-2213253.2116 2438348.738600001, -2...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>L1UBHh</td>\n", | |
" <td>Lake</td>\n", | |
" <td>8.785541</td>\n", | |
" <td>{1109680A-C0B9-4D10-AA5A-BBCBD2A78348}</td>\n", | |
" <td>2840.213796</td>\n", | |
" <td>35553.824662</td>\n", | |
" <td>(POLYGON ((-2098700.77 2407659.109999999, -209...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" ATTRIBUTE WETLAND_TYPE ACRES GLOBALID \\\n", | |
"0 L1ABHh Lake 6.733421 {74D040F8-F10D-49BE-8EAA-32A8B4C124BD} \n", | |
"1 L1ABHh Lake 20.045324 {378DDF89-18E6-4701-AFCA-1F4F3582E328} \n", | |
"2 L1UBH Lake 37.110679 {2A846F3F-7274-4F6D-8ECC-3BA31053CA1D} \n", | |
"3 L1UBH Lake 5.525569 {35F06B1A-6C5F-4653-B84A-92434F80C797} \n", | |
"4 L1UBHh Lake 8.785541 {1109680A-C0B9-4D10-AA5A-BBCBD2A78348} \n", | |
"\n", | |
" Shape_Length Shape_Area \\\n", | |
"0 798.569800 27249.187261 \n", | |
"1 1553.398708 81120.546350 \n", | |
"2 2071.552911 150181.590852 \n", | |
"3 546.365061 22361.183363 \n", | |
"4 2840.213796 35553.824662 \n", | |
"\n", | |
" geometry \n", | |
"0 (POLYGON ((-2041287.73 2382092.48, -2041139.59... \n", | |
"1 (POLYGON ((-2099088.98 2404971.49, -2099122.74... \n", | |
"2 (POLYGON ((-2090727.470000001 2409430.18, -209... \n", | |
"3 (POLYGON ((-2213253.2116 2438348.738600001, -2... \n", | |
"4 (POLYGON ((-2098700.77 2407659.109999999, -209... " | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"wetlands_gdf = gpd.read_file(path, layer=wetlands_layer)\n", | |
"\n", | |
"wetlands_gdf.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"this many rows: 896544\n" | |
] | |
} | |
], | |
"source": [ | |
"# Note this is a pretty big dataset, about 811 MB zipped on disk, so large memory overhead\n", | |
"print('this many rows:', len(wetlands_gdf))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"number of coords in this dataset: 209523595\n", | |
"CPU times: user 47.1 s, sys: 30 ms, total: 47.1 s\n", | |
"Wall time: 47.2 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"\n", | |
"total_coord_ct = 0\n", | |
"\n", | |
"# Note: Looks like every layer is held as a multipolygon\n", | |
"for geom in wetlands_gdf.geometry:\n", | |
" for g in geom:\n", | |
" pt_list = [p for p in g.exterior.coords]\n", | |
" coord_ct = len(pt_list)\n", | |
" total_coord_ct += coord_ct\n", | |
" \n", | |
"print('number of coords in this dataset:', total_coord_ct)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"number of pure polygons in this dataset: 897100\n" | |
] | |
} | |
], | |
"source": [ | |
"# Performing a unary union is flakey in geopandas, this\n", | |
"# is an overkill solution that is slow but more stable\n", | |
"all_geoms = []\n", | |
"for geom in wetlands_gdf.geometry:\n", | |
" for g in geom:\n", | |
" all_geoms.append(g)\n", | |
" \n", | |
"print('number of pure polygons in this dataset:', len(all_geoms))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 2min 49s, sys: 90 ms, total: 2min 49s\n", | |
"Wall time: 2min 49s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"\n", | |
"# Clean out all geometries that are either invalid or empty\n", | |
"cleaned_gs = []\n", | |
"for g in all_geoms:\n", | |
" v = g.is_valid\n", | |
" e = g.is_empty\n", | |
" \n", | |
" # Drop the \n", | |
" if v and not e:\n", | |
" cleaned_gs.append(g)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"number of cleaned geoms: 896807\n", | |
"number of geoms that were pruned: 293\n" | |
] | |
} | |
], | |
"source": [ | |
"print('number of cleaned geoms:', len(cleaned_gs))\n", | |
"print('number of geoms that were pruned:', len(all_geoms) - len(cleaned_gs))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from shapely.geometry import MultiPolygon\n", | |
"\n", | |
"# No need to perform unary union since this data is not overlapping\n", | |
"mp = MultiPolygon(cleaned_gs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 22 s, sys: 4.18 s, total: 26.2 s\n", | |
"Wall time: 26.2 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"\n", | |
"# But we could if we wanted to...\n", | |
"mp_uu = MultiPolygon(cleaned_gs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment