Created
January 4, 2023 12:21
-
-
Save TomAugspurger/395e5512afad72cf3bb9e10919a50e35 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "ce999f0d-0ed0-4413-98bd-bde7078721ee", | |
"metadata": {}, | |
"source": [ | |
"# Add GeoParquet metadata\n", | |
"\n", | |
"This notebook manually adds [GeoParquet](https://github.com/opengeospatial/geoparquet) metadata to the Parquet files of the Microsoft Buildings dataset, rewriting each file in place." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"id": "c6fc46ed-419f-41d4-8c16-de6f6240a8b8", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import dask.dataframe as dd\n", | |
"import adlfs\n", | |
"import geopandas\n", | |
"import dask_geopandas\n", | |
"import pyarrow.fs\n", | |
"import pyproj\n", | |
"import json\n", | |
"import dask.distributed\n", | |
"\n", | |
"storage_options=dict(account_name=\"bingmlbuildings\", credential=None)\n", | |
"\n", | |
"def add_geo_metadata(prefix, geometry_name, storage_options):\n", | |
" fs = adlfs.AzureBlobFileSystem(**storage_options)\n", | |
" arrow_fs = pyarrow.fs.PyFileSystem(pyarrow.fs.FSSpecHandler(fs))\n", | |
" \n", | |
" for path in fs.find(prefix):\n", | |
" table = pyarrow.parquet.read_table(path, filesystem=fs)\n", | |
"\n", | |
" if b\"geo\" in table.schema.metadata:\n", | |
" # Already present\n", | |
" continue\n", | |
"\n", | |
" bbox = list(geopandas.array.from_wkb(table[geometry_name].to_numpy()).total_bounds)\n", | |
"\n", | |
" geo_metadata = {\n", | |
" \"version\": \"0.4.0\",\n", | |
" \"primary_column\": geometry_name,\n", | |
" \"columns\": {\n", | |
" geometry_name: {\n", | |
" \"encoding\": \"WKB\",\n", | |
" \"crs\": pyproj.CRS(\"WGS 84\").to_json(),\n", | |
" \"geometry_type\": \"POLYGON\",\n", | |
" \"bbox\": bbox,\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" metadata = {\n", | |
" **table.schema.metadata,\n", | |
" b\"geo\": json.dumps(geo_metadata).encode(),\n", | |
" }\n", | |
" new_table = table.replace_schema_metadata(metadata)\n", | |
" # TODO: robust error handling here\n", | |
" pyarrow.parquet.write_table(new_table, path, filesystem=arrow_fs)\n", | |
" return prefix" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"id": "684bf72f-acd0-466b-a96e-f15d69aa337b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 1.69 ms, sys: 93 µs, total: 1.78 ms\n", | |
"Wall time: 1.55 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"\n", | |
"fs = adlfs.AzureBlobFileSystem(**storage_options)\n", | |
"PREFIX = \"footprints/global/2022-07-06/ml-buildings.parquet/\"\n", | |
"regions = [x for x in fs.ls(PREFIX) if not x.endswith(\"_SUCCESS\")]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"id": "73c600f5-62f2-481b-b076-a383f9c0ec64", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"214" | |
] | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(regions)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"id": "4e9dc928-23ce-43da-93a2-df9a1db47950", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "b32ebf184e3149679efe45f2a101e604", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"VBox(children=(HTML(value='<h2>GatewayCluster</h2>'), HBox(children=(HTML(value='\\n<div>\\n<style scoped>\\n …" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"import dask_gateway\n", | |
"\n", | |
"cluster = dask_gateway.GatewayCluster()\n", | |
"client = cluster.get_client()\n", | |
"cluster.scale(64)\n", | |
"cluster" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"id": "cee8b19b-a0b8-4440-a837-8d2a9cd99408", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 2.92 s, sys: 258 ms, total: 3.17 s\n", | |
"Wall time: 30min 4s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"\n", | |
"futures = client.map(add_geo_metadata, regions, geometry_name=\"geometry\", storage_options=storage_options)\n", | |
"\n", | |
"_ = dask.distributed.wait(futures)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"id": "a5626fd8-013a-43c0-ba63-3b9fdef27b41", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"cluster.close()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.8" | |
}, | |
"widgets": { | |
"application/vnd.jupyter.widget-state+json": { | |
"state": {}, | |
"version_major": 2, | |
"version_minor": 0 | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment