Skip to content

Instantly share code, notes, and snippets.

@kuanb
Created August 12, 2019 22:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kuanb/c37d748df6988810b3977e530eb1fc5c to your computer and use it in GitHub Desktop.
Save kuanb/c37d748df6988810b3977e530eb1fc5c to your computer and use it in GitHub Desktop.
Example of how to clean out null, empty, and invalid geoms with just shapely in a notebook
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import geopandas as gpd\n",
"\n",
"# Note: I went to https://www.fws.gov/wetlands/Data/State-Downloads.html\n",
"# and downloaded data for all of California as a GeoDataBase\n",
"\n",
"# This data now sits locally at the following path:\n",
"path = 'hunter_help/wetlands.gdb'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['California',\n",
" 'CA_Riparian',\n",
" 'CA_Riparian_Project_Metadata',\n",
" 'CA_Wetlands_Project_Metadata',\n",
" 'CA_Wetlands_Historic_Map_Info',\n",
" 'CA_Wetlands']"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import fiona\n",
"layer_list = fiona.listlayers(path)\n",
"layer_list"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'CA_Wetlands'"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wetlands_layer = layer_list[-1]\n",
"wetlands_layer"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ATTRIBUTE</th>\n",
" <th>WETLAND_TYPE</th>\n",
" <th>ACRES</th>\n",
" <th>GLOBALID</th>\n",
" <th>Shape_Length</th>\n",
" <th>Shape_Area</th>\n",
" <th>geometry</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>L1ABHh</td>\n",
" <td>Lake</td>\n",
" <td>6.733421</td>\n",
" <td>{74D040F8-F10D-49BE-8EAA-32A8B4C124BD}</td>\n",
" <td>798.569800</td>\n",
" <td>27249.187261</td>\n",
" <td>(POLYGON ((-2041287.73 2382092.48, -2041139.59...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>L1ABHh</td>\n",
" <td>Lake</td>\n",
" <td>20.045324</td>\n",
" <td>{378DDF89-18E6-4701-AFCA-1F4F3582E328}</td>\n",
" <td>1553.398708</td>\n",
" <td>81120.546350</td>\n",
" <td>(POLYGON ((-2099088.98 2404971.49, -2099122.74...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>L1UBH</td>\n",
" <td>Lake</td>\n",
" <td>37.110679</td>\n",
" <td>{2A846F3F-7274-4F6D-8ECC-3BA31053CA1D}</td>\n",
" <td>2071.552911</td>\n",
" <td>150181.590852</td>\n",
" <td>(POLYGON ((-2090727.470000001 2409430.18, -209...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>L1UBH</td>\n",
" <td>Lake</td>\n",
" <td>5.525569</td>\n",
" <td>{35F06B1A-6C5F-4653-B84A-92434F80C797}</td>\n",
" <td>546.365061</td>\n",
" <td>22361.183363</td>\n",
" <td>(POLYGON ((-2213253.2116 2438348.738600001, -2...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>L1UBHh</td>\n",
" <td>Lake</td>\n",
" <td>8.785541</td>\n",
" <td>{1109680A-C0B9-4D10-AA5A-BBCBD2A78348}</td>\n",
" <td>2840.213796</td>\n",
" <td>35553.824662</td>\n",
" <td>(POLYGON ((-2098700.77 2407659.109999999, -209...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ATTRIBUTE WETLAND_TYPE ACRES GLOBALID \\\n",
"0 L1ABHh Lake 6.733421 {74D040F8-F10D-49BE-8EAA-32A8B4C124BD} \n",
"1 L1ABHh Lake 20.045324 {378DDF89-18E6-4701-AFCA-1F4F3582E328} \n",
"2 L1UBH Lake 37.110679 {2A846F3F-7274-4F6D-8ECC-3BA31053CA1D} \n",
"3 L1UBH Lake 5.525569 {35F06B1A-6C5F-4653-B84A-92434F80C797} \n",
"4 L1UBHh Lake 8.785541 {1109680A-C0B9-4D10-AA5A-BBCBD2A78348} \n",
"\n",
" Shape_Length Shape_Area \\\n",
"0 798.569800 27249.187261 \n",
"1 1553.398708 81120.546350 \n",
"2 2071.552911 150181.590852 \n",
"3 546.365061 22361.183363 \n",
"4 2840.213796 35553.824662 \n",
"\n",
" geometry \n",
"0 (POLYGON ((-2041287.73 2382092.48, -2041139.59... \n",
"1 (POLYGON ((-2099088.98 2404971.49, -2099122.74... \n",
"2 (POLYGON ((-2090727.470000001 2409430.18, -209... \n",
"3 (POLYGON ((-2213253.2116 2438348.738600001, -2... \n",
"4 (POLYGON ((-2098700.77 2407659.109999999, -209... "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wetlands_gdf = gpd.read_file(path, layer=wetlands_layer)\n",
"\n",
"wetlands_gdf.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"this many rows: 896544\n"
]
}
],
"source": [
"# Note this is a pretty big dataset, about 811 MB zipped on disk, so large memory overhead\n",
"print('this many rows:', len(wetlands_gdf))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"number of coords in this dataset: 209523595\n",
"CPU times: user 47.1 s, sys: 30 ms, total: 47.1 s\n",
"Wall time: 47.2 s\n"
]
}
],
"source": [
"%%time\n",
"\n",
"total_coord_ct = 0\n",
"\n",
"# Note: Looks like every layer is held as a multipolygon\n",
"for geom in wetlands_gdf.geometry:\n",
" for g in geom:\n",
" pt_list = [p for p in g.exterior.coords]\n",
" coord_ct = len(pt_list)\n",
" total_coord_ct += coord_ct\n",
" \n",
"print('number of coords in this dataset:', total_coord_ct)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"number of pure polygons in this dataset: 897100\n"
]
}
],
"source": [
"# Performing a unary union is flakey in geopandas, this\n",
"# is an overkill solution that is slow but more stable\n",
"all_geoms = []\n",
"for geom in wetlands_gdf.geometry:\n",
" for g in geom:\n",
" all_geoms.append(g)\n",
" \n",
"print('number of pure polygons in this dataset:', len(all_geoms))"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2min 49s, sys: 90 ms, total: 2min 49s\n",
"Wall time: 2min 49s\n"
]
}
],
"source": [
"%%time\n",
"\n",
"# Clean out all geometries that are either invalid or empty\n",
"cleaned_gs = []\n",
"for g in all_geoms:\n",
" v = g.is_valid\n",
" e = g.is_empty\n",
" \n",
" # Drop the \n",
" if v and not e:\n",
" cleaned_gs.append(g)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"number of cleaned geoms: 896807\n",
"number of geoms that were pruned: 293\n"
]
}
],
"source": [
"print('number of cleaned geoms:', len(cleaned_gs))\n",
"print('number of geoms that were pruned:', len(all_geoms) - len(cleaned_gs))"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"from shapely.geometry import MultiPolygon\n",
"\n",
"# No need to perform unary union since this data is not overlapping\n",
"mp = MultiPolygon(cleaned_gs)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 22 s, sys: 4.18 s, total: 26.2 s\n",
"Wall time: 26.2 s\n"
]
}
],
"source": [
"%%time\n",
"\n",
"# But we could if we wanted to...\n",
"mp_uu = MultiPolygon(cleaned_gs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment