Skip to content

Instantly share code, notes, and snippets.

@TomAugspurger
Created May 1, 2023 14:33
Show Gist options
  • Save TomAugspurger/4197f2849ee39b50aef61b4ee8575e3f to your computer and use it in GitHub Desktop.
Save TomAugspurger/4197f2849ee39b50aef61b4ee8575e3f to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "a5951d7e-39c0-4b93-86af-e0e7e6c09d19",
"metadata": {},
"source": [
"# Delta Table format for MS Buildings\n",
"\n",
"The MS buildings dataset contains building footprint polygons that are partitioned by Country and quadykey."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b05831c5-1d63-4822-ae51-b1275d94870f",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"!pip install -U deltalake pyquadkey2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8bee92e5-b95d-44bd-9c7d-fd2ab41a7fb7",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import planetary_computer\n",
"import geopandas\n",
"import pandas as pd\n",
"import fiona\n",
"import deltalake"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "759cb87a-031a-4fdf-9115-fa6ee662a587",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 93.3 ms, sys: 74.6 ms, total: 168 ms\n",
"Wall time: 455 ms\n"
]
}
],
"source": [
"sas_token = planetary_computer.sas.get_token(\"bingmlbuildings\", \"footprints\").token\n",
"storage_options = {\"account_name\": \"bingmlbuildings\", \"sas_token\": sas_token}\n",
"%time dt = deltalake.DeltaTable(\"abfs://footprints/delta/2023-04-25/ml-buildings.parquet/\", storage_options=storage_options)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "061efb7c-62b3-4fb6-99b2-d03d9ea619b8",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 5.17 ms, sys: 531 µs, total: 5.7 ms\n",
"Wall time: 5.65 ms\n"
]
}
],
"source": [
"quadkeys = [122110222, 120333320]\n",
"\n",
"%time uris = dt.file_uris(partition_filters=[(\"RegionName\", \"=\", \"Turkey\"), (\"quadkey\", \"in\", quadkeys)])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "8c4d820f-9a35-47ab-ab00-d884925ac4a3",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 322 ms, sys: 66.8 ms, total: 389 ms\n",
"Wall time: 754 ms\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>geometry</th>\n",
" <th>meanHeight</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>POLYGON ((34.34346 36.66417, 34.34359 36.66420...</td>\n",
" <td>-1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>POLYGON ((34.39339 37.06736, 34.39354 37.06730...</td>\n",
" <td>-1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>POLYGON ((34.11536 36.88733, 34.11535 36.88741...</td>\n",
" <td>-1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>POLYGON ((34.37202 36.64425, 34.37212 36.64428...</td>\n",
" <td>-1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>POLYGON ((34.41499 36.83042, 34.41494 36.83053...</td>\n",
" <td>-1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" geometry meanHeight\n",
"0 POLYGON ((34.34346 36.66417, 34.34359 36.66420... -1.0\n",
"1 POLYGON ((34.39339 37.06736, 34.39354 37.06730... -1.0\n",
"2 POLYGON ((34.11536 36.88733, 34.11535 36.88741... -1.0\n",
"3 POLYGON ((34.37202 36.64425, 34.37212 36.64428... -1.0\n",
"4 POLYGON ((34.41499 36.83042, 34.41494 36.83053... -1.0"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"import pandas as pd\n",
"import geopandas\n",
"\n",
"dfs = [pd.read_parquet(uri, storage_options=storage_options) for uri in uris]\n",
"df = pd.concat(dfs)\n",
"gdf = geopandas.GeoDataFrame(df, geometry=geopandas.GeoSeries.from_wkb(df.geometry), crs=\"WGS84\")\n",
"gdf.head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"state": {},
"version_major": 2,
"version_minor": 0
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment