lnorton/cursed_remove_small_regions.py

## cursed_remove_small_regions.py
# MIT License
#
# Copyright (c) 2022 Len Norton
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


import geopandas
from shapely.geometry import Polygon


def cursed_remove_small_regions(in_vector_filename, out_vector_filename, minimum_area):
    # Do not use. This function is an abomination. It takes an hour to process a file that QGIS can do in seconds.

    # Need to use an equal area projection for area calculations.
    # North America Albers Equal Area Conic (ESRI 102008) fits the bill for my data.
    regions = geopandas.read_file(in_vector_filename).to_crs('ESRI:102008')

    # Must explode the multipolygons to get individual area values.
    regions_exploded = regions.explode()

    # Lambert Cylindrical Equal Area (EPSG 9834) was generating infinities near the pole.
    expect_bad_data = False
    if expect_bad_data and not regions_exploded.is_valid.all():
        regions_exploded = regions_exploded.loc[regions_exploded['geometry'].is_valid, :]

    # All polygons start as their own aggregator, which is necessary for them to be retained in the dissolve. The
    # aggregator-aggregated relationship is such that all polygons to be removed have an adjacent aggregator polygon
    # that will consume them, and that polygon is denoted by the 'gator_index'. But since that polygon may be also
    # be consumed by another polygon, the 'gator_index' is updated to be the polygon that ultimately consumes that
    # entire chain of adjacent polygons small enough to be removed.
    regions_exploded.loc[:, 'gator_index'] = regions_exploded.index

    # Need to calculate the area on the exterior, because interior polygons don't overlap, they take up area.
    regions_exploded.loc[:, 'aggregate_area'] = regions_exploded.exterior.map(lambda p: Polygon(p).area)

    # Select all the polygons that are too small.
    selected_regions = regions_exploded.loc[regions_exploded['aggregate_area'] < minimum_area]

    # Inner join all polygons with those selected for removal. It will create a row for every 'touches' relationship
    # between a polygon and a polygon slated for removal, so many polygons will occur multiple times in the result,
    # but only one of those each touches will consume them.
    selected_regions_joined = geopandas.sjoin(regions_exploded, selected_regions, op='touches')

    # Sort by the area of the right side, as this represents the polygons to be removed. Start processing by the
    # largest first, as that guarantees that each polygon selected for removal will have its 'gator_index' updated to
    # the final value before any of its prey are processed, since there are no larger polygons in the removal list
    # that might consume it. The polygons it consumes will receive that 'gator_index' rather than its own index, as
    # will the polygons they consume, and in this way the 'gator_index' is passed down the consumption chain.
    selected_regions_joined.sort_values(by='aggregate_area_right', ascending=False, inplace=True)

    for left_index, row in selected_regions_joined.iterrows():
        right_index = (row['index_right0'], row['index_right1'])
        right_aggregate_area = regions_exploded.at[right_index, 'aggregate_area']
        left_aggregate_area = regions_exploded.at[left_index, 'aggregate_area']

        # The relationships expressed by the join go both ways, so only update the 'gator_index' when the
        # right side is paired with a larger left side polygon, that will be one that may consume it.
        if right_aggregate_area < left_aggregate_area:
            only_single_parents = True
            if only_single_parents:
                regions_exploded.at[right_index, 'gator_index'] = regions_exploded.at[left_index, 'gator_index']
            else:
                # My dataset is contour rings, but this function may work in the general case.
                # Follow the rule of always merging with the largest touching polygon.
                right_gator_index = row['gator_index_right']
                gator_aggregate_area = regions_exploded.at[right_gator_index, 'aggregate_area']
                if gator_aggregate_area < left_aggregate_area:
                    regions_exploded.at[right_index, 'gator_index'] = regions_exploded.at[left_index, 'gator_index']

    # The dissolve 'aggfunc' callback provides frustratingly little context. This is what I came up with to always
    # select the value from the column array that corresponds to the row that has the same index as 'gator_index'.
    kludge = lambda a: next(v for i, v in a.items() if regions_exploded.at[i, 'gator_index'] == i)

    # The dissolve function groups all the rows by the shared values in the 'gator_index' column, merging their
    # geometry in a single row, and the 'aggfunc' callback selects the values for non-geometry columns.
    regions_exploded_filtered = regions_exploded.dissolve(by='gator_index', as_index=False, aggfunc=kludge)

    # Get rid of the temporary columns. The save routine chokes on raw tuples as field values anyway.
    regions_exploded_filtered.drop('gator_index', inplace=True, axis=1)
    regions_exploded_filtered.drop('aggregate_area', inplace=True, axis=1)
    regions_exploded_filtered.to_crs(epsg=4326).to_file(out_vector_filename, driver='GPKG')
	# MIT License
	#
	# Copyright (c) 2022 Len Norton
	#
	# Permission is hereby granted, free of charge, to any person obtaining a copy
	# of this software and associated documentation files (the "Software"), to deal
	# in the Software without restriction, including without limitation the rights
	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	# copies of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:
	#
	# The above copyright notice and this permission notice shall be included in all
	# copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	# SOFTWARE.


	import geopandas
	from shapely.geometry import Polygon


	def cursed_remove_small_regions(in_vector_filename, out_vector_filename, minimum_area):
	# Do not use. This function is an abomination. It takes an hour to process a file that QGIS can do in seconds.

	# Need to use an equal area projection for area calculations.
	# North America Albers Equal Area Conic (ESRI 102008) fits the bill for my data.
	regions = geopandas.read_file(in_vector_filename).to_crs('ESRI:102008')

	# Must explode the multipolygons to get individual area values.
	regions_exploded = regions.explode()

	# Lambert Cylindrical Equal Area (EPSG 9834) was generating infinities near the pole.
	expect_bad_data = False
	if expect_bad_data and not regions_exploded.is_valid.all():
	regions_exploded = regions_exploded.loc[regions_exploded['geometry'].is_valid, :]

	# All polygons start as their own aggregator, which is necessary for them to be retained in the dissolve. The
	# aggregator-aggregated relationship is such that all polygons to be removed have an adjacent aggregator polygon
	# that will consume them, and that polygon is denoted by the 'gator_index'. But since that polygon may be also
	# be consumed by another polygon, the 'gator_index' is updated to be the polygon that ultimately consumes that
	# entire chain of adjacent polygons small enough to be removed.
	regions_exploded.loc[:, 'gator_index'] = regions_exploded.index

	# Need to calculate the area on the exterior, because interior polygons don't overlap, they take up area.
	regions_exploded.loc[:, 'aggregate_area'] = regions_exploded.exterior.map(lambda p: Polygon(p).area)

	# Select all the polygons that are too small.
	selected_regions = regions_exploded.loc[regions_exploded['aggregate_area'] < minimum_area]

	# Inner join all polygons with those selected for removal. It will create a row for every 'touches' relationship
	# between a polygon and a polygon slated for removal, so many polygons will occur multiple times in the result,
	# but only one of those each touches will consume them.
	selected_regions_joined = geopandas.sjoin(regions_exploded, selected_regions, op='touches')

	# Sort by the area of the right side, as this represents the polygons to be removed. Start processing by the
	# largest first, as that guarantees that each polygon selected for removal will have its 'gator_index' updated to
	# the final value before any of its prey are processed, since there are no larger polygons in the removal list
	# that might consume it. The polygons it consumes will receive that 'gator_index' rather than its own index, as
	# will the polygons they consume, and in this way the 'gator_index' is passed down the consumption chain.
	selected_regions_joined.sort_values(by='aggregate_area_right', ascending=False, inplace=True)

	for left_index, row in selected_regions_joined.iterrows():
	right_index = (row['index_right0'], row['index_right1'])
	right_aggregate_area = regions_exploded.at[right_index, 'aggregate_area']
	left_aggregate_area = regions_exploded.at[left_index, 'aggregate_area']

	# The relationships expressed by the join go both ways, so only update the 'gator_index' when the
	# right side is paired with a larger left side polygon, that will be one that may consume it.
	if right_aggregate_area < left_aggregate_area:
	only_single_parents = True
	if only_single_parents:
	regions_exploded.at[right_index, 'gator_index'] = regions_exploded.at[left_index, 'gator_index']
	else:
	# My dataset is contour rings, but this function may work in the general case.
	# Follow the rule of always merging with the largest touching polygon.
	right_gator_index = row['gator_index_right']
	gator_aggregate_area = regions_exploded.at[right_gator_index, 'aggregate_area']
	if gator_aggregate_area < left_aggregate_area:
	regions_exploded.at[right_index, 'gator_index'] = regions_exploded.at[left_index, 'gator_index']

	# The dissolve 'aggfunc' callback provides frustratingly little context. This is what I came up with to always
	# select the value from the column array that corresponds to the row that has the same index as 'gator_index'.
	kludge = lambda a: next(v for i, v in a.items() if regions_exploded.at[i, 'gator_index'] == i)

	# The dissolve function groups all the rows by the shared values in the 'gator_index' column, merging their
	# geometry in a single row, and the 'aggfunc' callback selects the values for non-geometry columns.
	regions_exploded_filtered = regions_exploded.dissolve(by='gator_index', as_index=False, aggfunc=kludge)

	# Get rid of the temporary columns. The save routine chokes on raw tuples as field values anyway.
	regions_exploded_filtered.drop('gator_index', inplace=True, axis=1)
	regions_exploded_filtered.drop('aggregate_area', inplace=True, axis=1)
	regions_exploded_filtered.to_crs(epsg=4326).to_file(out_vector_filename, driver='GPKG')