@jisantuc
Created December 12, 2017 19:56
Simplification script for the 2010 Census block shapefile from VA: filters blocks into their enclosing VTDs and repeatedly merges adjacent blocks into larger shapes before writing a new shapefile
from concurrent.futures import ThreadPoolExecutor, as_completed
import pickle
import time
from uuid import uuid4
import fiona
from rtree import index
from shapely import geometry
from shapely.geometry import shape
from shapely.ops import cascaded_union


def async_runner(big_shp, filter_fut):
    """Simplify records from filter_fut (a future) into larger shapes"""
    start = time.time()
    filtered = filter_fut.result()
    adj = records_to_adjacency(filtered)
    # This seems like a good number of simplifications /shrug
    simplified_4x = simplify_adjacency_dict(
        simplify_adjacency_dict(
            simplify_adjacency_dict(
                simplify_adjacency_dict(adj)
            )
        )
    )
    print('Time to complete {}: {}'.format(big_shp['id'], time.time() - start))
    return simplified_4x


def filter_blocks_to_enclosure(enclosure, index):
    """Return blocks that fall entirely within an enclosing shape"""
    # Coarse pass: the rtree index returns everything whose bounding box
    # intersects the enclosure's bounding box
    candidates = [
        x.object for x in index.intersection(enclosure['geometry'].bounds, objects=True)
    ]
    # Fine pass: keep only blocks fully contained in the enclosure. within() is
    # more robust than comparing the intersection result for geometric equality.
    return [
        x for x in candidates if x['geometry'].within(enclosure['geometry'])
    ]


def get_block_records(index):
    """Read the blocks shapefile"""
    # for testing on a smaller file
    # return read_records('merged_shrunk.shp', index)
    return read_records('tl_2010_51_block10_3785_join.shp', index)


def get_vtd_records():
    """Read the shapefile for the geolevel above blocks"""
    return read_records('tl_2010_51_vtd10_3785.shp')


def read_records(path, index=None):
    """Read a shapefile, optionally writing each shape within it to a spatial index

    Writing to an index considerably slows down the initial read, but makes
    filtering later _way_ faster
    """
    with fiona.open(path) as reader:
        records = [rec for rec in reader]
    n_records = len(records)
    for i, record in enumerate(records):
        if i % 1000 == 0:
            print('{} / {}'.format(i, n_records))
        record['geometry'] = shape(record['geometry'])
        record['id'] = str(uuid4())
        if index:
            index.insert(i, record['geometry'].bounds, obj=record)
    return records
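
# After read_records, each record is still a fiona-style dict, but with its
# geometry promoted to a shapely shape and a synthetic uuid id added. Roughly
# (the property fields depend on the shapefile):
#     {'type': 'Feature', 'id': '<uuid4>', 'properties': {...},
#      'geometry': <shapely Polygon/MultiPolygon>}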


def records_to_adjacency(records):
    """Transform a list of records into a mapping of record IDs to neighbors"""
    return {
        shp['id']: {
            'record': shp,
            'neighbors': [x for x in records if x['geometry'].touches(shp['geometry'])]
        } for shp in records
    }
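
# The adjacency dict looks roughly like this (illustrative IDs only), where two
# records are neighbors when their geometries touch:
#     {
#         '<uuid-a>': {'record': <record a>, 'neighbors': [<record b>, ...]},
#         '<uuid-b>': {'record': <record b>, 'neighbors': [<record a>, ...]},
#         ...
#     }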


def remove_extra_fields(record):
    """Return a dictionary without anything that doesn't belong in a geojson"""
    return {
        k: v for k, v in record.items() if k in ['geometry', 'properties', 'type']
    }


def simplify_adjacency_dict(adjacency_dict, threshold=10):
    """Merge records with their neighbors

    The form of the adjacency dict is assumed to be the same as that returned by
    records_to_adjacency
    """
    if len(adjacency_dict) <= threshold:
        return adjacency_dict
    visited = []
    new_records = []
    for shp_id in adjacency_dict:
        if shp_id in visited:
            continue
        node = adjacency_dict[shp_id]
        node['record']['geometry'] = cascaded_union(
            [node['record']['geometry']] + [x['geometry'] for x in node['neighbors']]
        )
        visited.extend([x['id'] for x in node['neighbors']])
        new_records.append(node['record'])
    return records_to_adjacency(new_records)
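
# A single pass walks the dict in order: the first unvisited record absorbs all
# of its neighbors (their ids go into `visited`, so they are skipped as keys),
# and the merged records are re-run through records_to_adjacency. async_runner
# applies this four times, and each call short-circuits once the dict is at or
# under `threshold` records.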


def write_merged_records(in_path, out_path, records):
    """Write a collection of records to out_path with in_path's schema"""
    with fiona.open(in_path) as src:
        new_schema = src.schema.copy()
        with fiona.open(
            out_path, 'w', crs=src.crs, driver=src.driver, schema=new_schema
        ) as dst:
            for rec in records:
                rec['geometry'] = geometry.mapping(rec['geometry'])
                dst.write(rec)


def main():
    idx = index.Index()
    print('Getting mid-sized records...')
    upper = get_vtd_records()
    # dump the larger geometries, in case you observe that one of them took a really
    # long time and you want to make sure you can come up with an explanation. For
    # example, if it's a long weird skinny shape, it might have a pretty large
    # bounding box, which would make the filter take longer.
    # Some of them can take a while -- in testing on 20 random shapes, I observed
    # run times ranging from 0.99 seconds to over 5 minutes.
    with open('bigGeoms.pkl', 'wb') as outf:
        pickle.dump(upper, outf)
    print('Getting Census blocks (note: slow, don\'t be alarmed)...')
    get_block_records(idx)
    print('Merging Census blocks within larger units...')
    futures = []
    with ThreadPoolExecutor(max_workers=6) as pool:
        # run small shapes first because they fly by
        for big_shp in sorted(upper, key=lambda x: x['geometry'].area):
            filter_fut = pool.submit(filter_blocks_to_enclosure, big_shp, idx)
            futures.append(pool.submit(async_runner, big_shp, filter_fut))
    res = [f.result() for f in futures]
    merged_records = [remove_extra_fields(y['record'])
                      for x in res for y in x.values()]
    out_path = 'merged_shrunk.shp'
    write_merged_records('tl_2010_51_block10_3785_join.shp', out_path, merged_records)


if __name__ == '__main__':
    main()
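
# Rough usage sketch: assumes the two TIGER/Line-derived shapefiles named above
# sit in the working directory (the _3785 suffixes suggest they have been
# reprojected to EPSG:3785, but that's an assumption about the inputs):
#
#     python simplify_blocks.py   # or whatever name this gist is saved under
#
# The run pickles the VTD records to bigGeoms.pkl for later inspection and
# writes the merged blocks to merged_shrunk.shp.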