Skip to content

Instantly share code, notes, and snippets.

@lnorton
Last active February 17, 2022 00:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lnorton/4715b624f20544d04b225012b28ab1bd to your computer and use it in GitHub Desktop.
Save lnorton/4715b624f20544d04b225012b28ab1bd to your computer and use it in GitHub Desktop.
Remove polygons by area using GeoPandas
# MIT License
#
# Copyright (c) 2022 Len Norton
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import geopandas
from shapely.geometry import Polygon
def cursed_remove_small_regions(in_vector_filename, out_vector_filename, minimum_area):
# Do not use. This function is an abomination. It takes an hour to process a file that QGIS can do in seconds.
# Need to use an equal area projection for area calculations.
# North America Albers Equal Area Conic (ESRI 102008) fits the bill for my data.
regions = geopandas.read_file(in_vector_filename).to_crs('ESRI:102008')
# Must explode the multipolygons to get individual area values.
regions_exploded = regions.explode()
# Lambert Cylindrical Equal Area (EPSG 9834) was generating infinities near the pole.
expect_bad_data = False
if expect_bad_data and not regions_exploded.is_valid.all():
regions_exploded = regions_exploded.loc[regions_exploded['geometry'].is_valid, :]
# All polygons start as their own aggregator, which is necessary for them to be retained in the dissolve. The
# aggregator-aggregated relationship is such that all polygons to be removed have an adjacent aggregator polygon
# that will consume them, and that polygon is denoted by the 'gator_index'. But since that polygon may be also
# be consumed by another polygon, the 'gator_index' is updated to be the polygon that ultimately consumes that
# entire chain of adjacent polygons small enough to be removed.
regions_exploded.loc[:, 'gator_index'] = regions_exploded.index
# Need to calculate the area on the exterior, because interior polygons don't overlap, they take up area.
regions_exploded.loc[:, 'aggregate_area'] = regions_exploded.exterior.map(lambda p: Polygon(p).area)
# Select all the polygons that are too small.
selected_regions = regions_exploded.loc[regions_exploded['aggregate_area'] < minimum_area]
# Inner join all polygons with those selected for removal. It will create a row for every 'touches' relationship
# between a polygon and a polygon slated for removal, so many polygons will occur multiple times in the result,
# but only one of those each touches will consume them.
selected_regions_joined = geopandas.sjoin(regions_exploded, selected_regions, op='touches')
# Sort by the area of the right side, as this represents the polygons to be removed. Start processing by the
# largest first, as that guarantees that each polygon selected for removal will have its 'gator_index' updated to
# the final value before any of its prey are processed, since there are no larger polygons in the removal list
# that might consume it. The polygons it consumes will receive that 'gator_index' rather than its own index, as
# will the polygons they consume, and in this way the 'gator_index' is passed down the consumption chain.
selected_regions_joined.sort_values(by='aggregate_area_right', ascending=False, inplace=True)
for left_index, row in selected_regions_joined.iterrows():
right_index = (row['index_right0'], row['index_right1'])
right_aggregate_area = regions_exploded.at[right_index, 'aggregate_area']
left_aggregate_area = regions_exploded.at[left_index, 'aggregate_area']
# The relationships expressed by the join go both ways, so only update the 'gator_index' when the
# right side is paired with a larger left side polygon, that will be one that may consume it.
if right_aggregate_area < left_aggregate_area:
only_single_parents = True
if only_single_parents:
regions_exploded.at[right_index, 'gator_index'] = regions_exploded.at[left_index, 'gator_index']
else:
# My dataset is contour rings, but this function may work in the general case.
# Follow the rule of always merging with the largest touching polygon.
right_gator_index = row['gator_index_right']
gator_aggregate_area = regions_exploded.at[right_gator_index, 'aggregate_area']
if gator_aggregate_area < left_aggregate_area:
regions_exploded.at[right_index, 'gator_index'] = regions_exploded.at[left_index, 'gator_index']
# The dissolve 'aggfunc' callback provides frustratingly little context. This is what I came up with to always
# select the value from the column array that corresponds to the row that has the same index as 'gator_index'.
kludge = lambda a: next(v for i, v in a.items() if regions_exploded.at[i, 'gator_index'] == i)
# The dissolve function groups all the rows by the shared values in the 'gator_index' column, merging their
# geometry in a single row, and the 'aggfunc' callback selects the values for non-geometry columns.
regions_exploded_filtered = regions_exploded.dissolve(by='gator_index', as_index=False, aggfunc=kludge)
# Get rid of the temporary columns. The save routine chokes on raw tuples as field values anyway.
regions_exploded_filtered.drop('gator_index', inplace=True, axis=1)
regions_exploded_filtered.drop('aggregate_area', inplace=True, axis=1)
regions_exploded_filtered.to_crs(epsg=4326).to_file(out_vector_filename, driver='GPKG')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment