Created
December 12, 2017 19:56
-
-
Save jisantuc/69fffdfe3284dceab3941f7a70f0c28c to your computer and use it in GitHub Desktop.
Simplification script for Census block shapefile from VA
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from concurrent.futures import ThreadPoolExecutor, as_completed | |
import pickle | |
import time | |
from uuid import uuid4 | |
import fiona | |
from rtree import index | |
from shapely import geometry | |
from shapely.geometry import shape | |
from shapely.ops import cascaded_union | |
def async_runner(big_shp, filter_fut):
    """Simplify the records produced by filter_fut (a future) into larger shapes.

    Waits for the filter step to finish, builds the adjacency mapping, and
    runs four rounds of neighbor-merging before reporting elapsed time.
    """
    started_at = time.time()
    merged = records_to_adjacency(filter_fut.result())
    # Four rounds of merging empirically gives a good simplification /shrug
    for _ in range(4):
        merged = simplify_adjacency_dict(merged)
    print('Time to complete {}: {}'.format(big_shp['id'], time.time() - started_at))
    return merged
def filter_blocks_to_enclosure(enclosure, index):
    """Return the block records that lie entirely within enclosure's geometry.

    enclosure -- a record dict whose 'geometry' is a shapely geometry
    index -- an rtree spatial index whose entries carry record dicts as objects
             (NOTE: the name shadows the module-level `rtree.index` import;
             kept as-is so positional/keyword callers are unaffected)
    """
    # Cheap pass: bounding-box hits from the spatial index.
    candidates = [x.object for x in index.intersection(enclosure['geometry'].bounds, objects=True)]
    # Exact pass: keep blocks fully covered by the enclosure. Using covers()
    # instead of `intersection(...) == geometry` avoids building a throwaway
    # intersection geometry and the fragile exact-coordinate equality check
    # (intersection can reorder/renumber coordinates of an identical shape).
    return [
        x for x in candidates if enclosure['geometry'].covers(x['geometry'])
    ]
def get_block_records(index):
    """Read the blocks shapefile, inserting each record into index."""
    # Swap in the commented path when testing against the smaller file:
    # return read_records('merged_shrunk.shp', index)
    blocks_path = 'tl_2010_51_block10_3785_join.shp'
    return read_records(blocks_path, index)
def get_vtd_records():
    """Read the shapefile for the geolevel above blocks (VTDs)."""
    vtd_path = 'tl_2010_51_vtd10_3785.shp'
    return read_records(vtd_path)
def read_records(path, index=None):
    """Read a shapefile, optionally writing each shape within it to a spatial index.

    Each record gets its GeoJSON-style geometry replaced with a shapely shape
    and a fresh uuid4 string under 'id'.

    Writing to an index considerably slows down the initial read, but makes
    filtering later _way_ faster.
    """
    with fiona.open(path) as reader:
        records = list(reader)
    n_records = len(records)
    for i, record in enumerate(records):
        if i % 1000 == 0:
            print('{} / {}'.format(i, n_records))
        record['geometry'] = shape(record['geometry'])
        record['id'] = str(uuid4())
        # BUG FIX: must compare against None, not truth-test. An rtree Index
        # supports len(), so a freshly created (empty) index is falsy and
        # `if index:` would silently skip every insert.
        if index is not None:
            index.insert(i, record['geometry'].bounds, obj=record)
    return records
def records_to_adjacency(records):
    """Transform a list of records into a mapping of record IDs to neighbors.

    Each value is {'record': <the record>, 'neighbors': <records whose
    geometries touch it>}. Note this is an O(n^2) pairwise comparison.
    """
    adjacency = {}
    for rec in records:
        neighbors = [
            other for other in records
            if other['geometry'].touches(rec['geometry'])
        ]
        adjacency[rec['id']] = {'record': rec, 'neighbors': neighbors}
    return adjacency
def remove_extra_fields(record):
    """Return a dictionary without anything that doesn't belong in a geojson."""
    geojson_keys = ('geometry', 'properties', 'type')
    return {key: record[key] for key in record if key in geojson_keys}
def simplify_adjacency_dict(adjacency_dict, threshold=10):
    """Merge records with their neighbors.

    The form of the adjacency dict is assumed to be the same as that returned
    by records_to_adjacency. Returns the input unchanged once it has at most
    `threshold` entries; otherwise unions each unvisited record with its
    neighbors and rebuilds the adjacency mapping from the merged records.
    """
    if len(adjacency_dict) <= threshold:
        return adjacency_dict
    # PERF FIX: a set gives O(1) membership tests; the original list made the
    # `shp_id in visited` check below O(n) per iteration (O(n^2) overall).
    visited = set()
    new_records = []
    for shp_id in adjacency_dict:
        if shp_id in visited:
            continue
        node = adjacency_dict[shp_id]
        # NOTE(review): cascaded_union is deprecated in newer shapely in favor
        # of unary_union; kept to match this file's import.
        node['record']['geometry'] = cascaded_union(
            [node['record']['geometry']] + [x['geometry'] for x in node['neighbors']]
        )
        visited.update(x['id'] for x in node['neighbors'])
        new_records.append(node['record'])
    return records_to_adjacency(new_records)
def write_merged_records(in_path, out_path, records):
    """Write a collection of records to out_path with in_path's schema.

    NOTE: mutates each record in place, converting its shapely geometry back
    to a GeoJSON-style mapping before writing.
    """
    with fiona.open(in_path) as src:
        new_schema = src.schema.copy()
        out_kwargs = {'crs': src.crs, 'driver': src.driver, 'schema': new_schema}
        with fiona.open(out_path, 'w', **out_kwargs) as dst:
            for record in records:
                # fiona writes GeoJSON-style dicts, not shapely geometries
                record['geometry'] = geometry.mapping(record['geometry'])
                dst.write(record)
def main():
    """Driver: read VTDs and blocks, merge blocks within each VTD, write result."""
    idx = index.Index()
    print('Getting mid-sized records...')
    upper = get_vtd_records()
    # dump the larger geometries, in case you observe that one of them took a really
    # long time and you want to make sure you can come up with an explanation. For
    # example, if it's a long weird skinny shape, it might have a pretty large
    # bounding box, which would make the filter take longer.
    # Some of them can take a while -- in testing on 20 random shapes, I observed
    # run times ranging from 0.99 seconds to over 5 minutes.
    with open('bigGeoms.pkl', 'wb') as outf:
        pickle.dump(upper, outf)
    print('Getting Census blocks (note: slow, don\'t be alarmed)...')
    # Called for its side effect: populating idx. The returned record list is
    # not needed here.
    get_block_records(idx)
    print('Merging Census blocks within larger units...')
    futures = []
    with ThreadPoolExecutor(max_workers=6) as pool:
        # run small shapes first because they fly by
        for big_shp in sorted(upper, key=lambda x: x['geometry'].area):
            # NOTE(review): async_runner blocks on filter_fut inside the SAME
            # pool. FIFO dequeueing means each filter task is always picked up
            # before its runner, so this appears deadlock-free -- but confirm
            # that invariant before changing the submission order or pool size.
            filter_fut = pool.submit(filter_blocks_to_enclosure, big_shp, idx)
            futures.append(pool.submit(async_runner, big_shp, filter_fut))
    # Exiting the `with` has already joined the pool; result() just unwraps.
    res = [f.result() for f in futures]
    # Flatten each simplified adjacency dict into plain geojson-ready records.
    merged_records = [remove_extra_fields(y['record'])
                      for x in res for y in x.values()]
    out_path = 'merged_shrunk.shp'
    write_merged_records('tl_2010_51_block10_3785_join.shp', out_path, merged_records)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment