@jisantuc
Created December 12, 2017 19:56
Simplification script for the 2010 Census block shapefile from VA: filters blocks into their enclosing VTDs and repeatedly merges adjacent blocks into larger shapes before writing a new shapefile
from concurrent.futures import ThreadPoolExecutor, as_completed
import pickle
import time
from uuid import uuid4
import fiona
from rtree import index
from shapely import geometry
from shapely.geometry import shape
from shapely.ops import cascaded_union


def async_runner(big_shp, filter_fut):
    """Simplify records from filter_fut (a future) into larger shapes"""
    start = time.time()
    filtered = filter_fut.result()
    adj = records_to_adjacency(filtered)
    # This seems like a good number of simplifications /shrug
    simplified_4x = simplify_adjacency_dict(
        simplify_adjacency_dict(
            simplify_adjacency_dict(
                simplify_adjacency_dict(adj)
            )
        )
    )
    print('Time to complete {}: {}'.format(big_shp['id'], time.time() - start))
    return simplified_4x


def filter_blocks_to_enclosure(enclosure, index):
    """Return blocks that fall entirely within an enclosing shape"""
    # Coarse pass: the rtree index returns everything whose bounding box
    # intersects the enclosure's bounding box
    candidates = [
        x.object for x in index.intersection(enclosure['geometry'].bounds, objects=True)
    ]
    # Fine pass: keep only blocks fully contained in the enclosure. within() is
    # more robust than comparing the intersection result for geometric equality.
    return [
        x for x in candidates if x['geometry'].within(enclosure['geometry'])
    ]


def get_block_records(index):
    """Read the blocks shapefile"""
    # for testing on a smaller file
    # return read_records('merged_shrunk.shp', index)
    return read_records('tl_2010_51_block10_3785_join.shp', index)


def get_vtd_records():
    """Read the shapefile for the geolevel above blocks"""
    return read_records('tl_2010_51_vtd10_3785.shp')


def read_records(path, index=None):
    """Read a shapefile, optionally writing each shape within it to a spatial index

    Writing to an index considerably slows down the initial read, but makes
    filtering later _way_ faster
    """
    with fiona.open(path) as reader:
        records = [rec for rec in reader]
    n_records = len(records)
    for i, record in enumerate(records):
        if i % 1000 == 0:
            print('{} / {}'.format(i, n_records))
        record['geometry'] = shape(record['geometry'])
        record['id'] = str(uuid4())
        if index:
            index.insert(i, record['geometry'].bounds, obj=record)
    return records
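
# After read_records, each record is still a fiona-style dict, but with its
# geometry promoted to a shapely shape and a synthetic uuid id added. Roughly
# (the property fields depend on the shapefile):
#     {'type': 'Feature', 'id': '<uuid4>', 'properties': {...},
#      'geometry': <shapely Polygon/MultiPolygon>}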


def records_to_adjacency(records):
    """Transform a list of records into a mapping of record IDs to neighbors"""
    return {
        shp['id']: {
            'record': shp,
            'neighbors': [x for x in records if x['geometry'].touches(shp['geometry'])]
        } for shp in records
    }
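
# The adjacency dict looks roughly like this (illustrative IDs only), where two
# records are neighbors when their geometries touch:
#     {
#         '<uuid-a>': {'record': <record a>, 'neighbors': [<record b>, ...]},
#         '<uuid-b>': {'record': <record b>, 'neighbors': [<record a>, ...]},
#         ...
#     }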


def remove_extra_fields(record):
    """Return a dictionary without anything that doesn't belong in a geojson"""
    return {
        k: v for k, v in record.items() if k in ['geometry', 'properties', 'type']
    }


def simplify_adjacency_dict(adjacency_dict, threshold=10):
    """Merge records with their neighbors

    The form of the adjacency dict is assumed to be the same as that returned by
    records_to_adjacency
    """
    if len(adjacency_dict) <= threshold:
        return adjacency_dict
    visited = []
    new_records = []
    for shp_id in adjacency_dict:
        if shp_id in visited:
            continue
        node = adjacency_dict[shp_id]
        node['record']['geometry'] = cascaded_union(
            [node['record']['geometry']] + [x['geometry'] for x in node['neighbors']]
        )
        visited.extend([x['id'] for x in node['neighbors']])
        new_records.append(node['record'])
    return records_to_adjacency(new_records)
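
# A single pass walks the dict in order: the first unvisited record absorbs all
# of its neighbors (their ids go into `visited`, so they are skipped as keys),
# and the merged records are re-run through records_to_adjacency. async_runner
# applies this four times, and each call short-circuits once the dict is at or
# under `threshold` records.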


def write_merged_records(in_path, out_path, records):
    """Write a collection of records to out_path with in_path's schema"""
    with fiona.open(in_path) as src:
        new_schema = src.schema.copy()
        with fiona.open(
            out_path, 'w', crs=src.crs, driver=src.driver, schema=new_schema
        ) as dst:
            for rec in records:
                rec['geometry'] = geometry.mapping(rec['geometry'])
                dst.write(rec)


def main():
    idx = index.Index()
    print('Getting mid-sized records...')
    upper = get_vtd_records()
    # dump the larger geometries, in case you observe that one of them took a really
    # long time and you want to make sure you can come up with an explanation. For
    # example, if it's a long weird skinny shape, it might have a pretty large
    # bounding box, which would make the filter take longer.
    # Some of them can take a while -- in testing on 20 random shapes, I observed
    # run times ranging from 0.99 seconds to over 5 minutes.
    with open('bigGeoms.pkl', 'wb') as outf:
        pickle.dump(upper, outf)
    print('Getting Census blocks (note: slow, don\'t be alarmed)...')
    get_block_records(idx)
    print('Merging Census blocks within larger units...')
    futures = []
    with ThreadPoolExecutor(max_workers=6) as pool:
        # run small shapes first because they fly by
        for big_shp in sorted(upper, key=lambda x: x['geometry'].area):
            filter_fut = pool.submit(filter_blocks_to_enclosure, big_shp, idx)
            futures.append(pool.submit(async_runner, big_shp, filter_fut))
    res = [f.result() for f in futures]
    merged_records = [remove_extra_fields(y['record'])
                      for x in res for y in x.values()]
    out_path = 'merged_shrunk.shp'
    write_merged_records('tl_2010_51_block10_3785_join.shp', out_path, merged_records)


if __name__ == '__main__':
    main()
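
# Rough usage sketch: assumes the two TIGER/Line-derived shapefiles named above
# sit in the working directory (the _3785 suffixes suggest they have been
# reprojected to EPSG:3785, but that's an assumption about the inputs):
#
#     python simplify_blocks.py   # or whatever name this gist is saved under
#
# The run pickles the VTD records to bigGeoms.pkl for later inspection and
# writes the merged blocks to merged_shrunk.shp.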