Last active
February 17, 2022 00:04
-
-
Save lnorton/4715b624f20544d04b225012b28ab1bd to your computer and use it in GitHub Desktop.
Remove polygons by area using GeoPandas
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# MIT License | |
# | |
# Copyright (c) 2022 Len Norton | |
# | |
# Permission is hereby granted, free of charge, to any person obtaining a copy | |
# of this software and associated documentation files (the "Software"), to deal | |
# in the Software without restriction, including without limitation the rights | |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
# copies of the Software, and to permit persons to whom the Software is | |
# furnished to do so, subject to the following conditions: | |
# | |
# The above copyright notice and this permission notice shall be included in all | |
# copies or substantial portions of the Software. | |
# | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
# SOFTWARE. | |
import geopandas | |
from shapely.geometry import Polygon | |
def cursed_remove_small_regions(in_vector_filename, out_vector_filename, minimum_area): | |
# Do not use. This function is an abomination. It takes an hour to process a file that QGIS can do in seconds. | |
# Need to use an equal area projection for area calculations. | |
# North America Albers Equal Area Conic (ESRI 102008) fits the bill for my data. | |
regions = geopandas.read_file(in_vector_filename).to_crs('ESRI:102008') | |
# Must explode the multipolygons to get individual area values. | |
regions_exploded = regions.explode() | |
# Lambert Cylindrical Equal Area (EPSG 9834) was generating infinities near the pole. | |
expect_bad_data = False | |
if expect_bad_data and not regions_exploded.is_valid.all(): | |
regions_exploded = regions_exploded.loc[regions_exploded['geometry'].is_valid, :] | |
# All polygons start as their own aggregator, which is necessary for them to be retained in the dissolve. The | |
# aggregator-aggregated relationship is such that all polygons to be removed have an adjacent aggregator polygon | |
# that will consume them, and that polygon is denoted by the 'gator_index'. But since that polygon may be also | |
# be consumed by another polygon, the 'gator_index' is updated to be the polygon that ultimately consumes that | |
# entire chain of adjacent polygons small enough to be removed. | |
regions_exploded.loc[:, 'gator_index'] = regions_exploded.index | |
# Need to calculate the area on the exterior, because interior polygons don't overlap, they take up area. | |
regions_exploded.loc[:, 'aggregate_area'] = regions_exploded.exterior.map(lambda p: Polygon(p).area) | |
# Select all the polygons that are too small. | |
selected_regions = regions_exploded.loc[regions_exploded['aggregate_area'] < minimum_area] | |
# Inner join all polygons with those selected for removal. It will create a row for every 'touches' relationship | |
# between a polygon and a polygon slated for removal, so many polygons will occur multiple times in the result, | |
# but only one of those each touches will consume them. | |
selected_regions_joined = geopandas.sjoin(regions_exploded, selected_regions, op='touches') | |
# Sort by the area of the right side, as this represents the polygons to be removed. Start processing by the | |
# largest first, as that guarantees that each polygon selected for removal will have its 'gator_index' updated to | |
# the final value before any of its prey are processed, since there are no larger polygons in the removal list | |
# that might consume it. The polygons it consumes will receive that 'gator_index' rather than its own index, as | |
# will the polygons they consume, and in this way the 'gator_index' is passed down the consumption chain. | |
selected_regions_joined.sort_values(by='aggregate_area_right', ascending=False, inplace=True) | |
for left_index, row in selected_regions_joined.iterrows(): | |
right_index = (row['index_right0'], row['index_right1']) | |
right_aggregate_area = regions_exploded.at[right_index, 'aggregate_area'] | |
left_aggregate_area = regions_exploded.at[left_index, 'aggregate_area'] | |
# The relationships expressed by the join go both ways, so only update the 'gator_index' when the | |
# right side is paired with a larger left side polygon, that will be one that may consume it. | |
if right_aggregate_area < left_aggregate_area: | |
only_single_parents = True | |
if only_single_parents: | |
regions_exploded.at[right_index, 'gator_index'] = regions_exploded.at[left_index, 'gator_index'] | |
else: | |
# My dataset is contour rings, but this function may work in the general case. | |
# Follow the rule of always merging with the largest touching polygon. | |
right_gator_index = row['gator_index_right'] | |
gator_aggregate_area = regions_exploded.at[right_gator_index, 'aggregate_area'] | |
if gator_aggregate_area < left_aggregate_area: | |
regions_exploded.at[right_index, 'gator_index'] = regions_exploded.at[left_index, 'gator_index'] | |
# The dissolve 'aggfunc' callback provides frustratingly little context. This is what I came up with to always | |
# select the value from the column array that corresponds to the row that has the same index as 'gator_index'. | |
kludge = lambda a: next(v for i, v in a.items() if regions_exploded.at[i, 'gator_index'] == i) | |
# The dissolve function groups all the rows by the shared values in the 'gator_index' column, merging their | |
# geometry in a single row, and the 'aggfunc' callback selects the values for non-geometry columns. | |
regions_exploded_filtered = regions_exploded.dissolve(by='gator_index', as_index=False, aggfunc=kludge) | |
# Get rid of the temporary columns. The save routine chokes on raw tuples as field values anyway. | |
regions_exploded_filtered.drop('gator_index', inplace=True, axis=1) | |
regions_exploded_filtered.drop('aggregate_area', inplace=True, axis=1) | |
regions_exploded_filtered.to_crs(epsg=4326).to_file(out_vector_filename, driver='GPKG') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment