-
-
Save paleolimbot/9fcdb8e8fee301db6c68d2e32efd54c6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Set up the environment:\n", | |
"\n", | |
"```bash\n", | |
"pip install geoarrow-pyarrow pyogrio\n", | |
"```\n", | |
"\n", | |
"Download the files:\n", | |
"\n", | |
"```bash\n", | |
"curl -L https://github.com/geoarrow/geoarrow-data/releases/download/v0.1.0/microsoft-buildings-point.fgb.zip -o microsoft-buildings-point.fgb.zip\n", | |
"curl -L https://github.com/geoarrow/geoarrow-data/releases/download/v0.1.0/microsoft-buildings-point.arrow -o microsoft-buildings-point.arrow\n", | |
"unzip microsoft-buildings-point.fgb.zip\n", | |
"```\n", | |
"\n", | |
"Create the Parquet version we use in the post:\n", | |
"\n", | |
"```python\n", | |
"# Create the parquet version. Using uncompressed because the .fgb is also uncompressed.\n", | |
"import geoarrow.pyarrow as ga\n", | |
"import pyarrow.feather\n", | |
"import pyarrow.parquet\n", | |
"tab = pyarrow.feather.read_table(\"microsoft-buildings-point.arrow\")\n", | |
"pyarrow.parquet.write_table(tab, \"microsoft-buildings-point.parquet\", compression=\"none\")\n", | |
"```" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"-rw-r--r-- 1 dewey staff 9.7G Sep 28 16:21 microsoft-buildings-point.fgb\n", | |
"-rw-r--r-- 1 dewey staff 2.0G Oct 5 17:07 microsoft-buildings-point.parquet\n" | |
] | |
} | |
], | |
"source": [ | |
"! ls -lh microsoft-buildings-point.fgb microsoft-buildings-point.parquet" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"17.5 s ± 112 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", | |
"1.87 s ± 185 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"import pyogrio\n", | |
"%timeit pyogrio.raw.read_arrow(\"microsoft-buildings-point.fgb\")\n", | |
"#> 17.7 s ± 133 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", | |
"\n", | |
"import geoarrow.pyarrow as ga\n", | |
"import pyarrow.parquet as pq\n", | |
"%timeit pq.read_table(\"microsoft-buildings-point.parquet\")\n", | |
"#> 1.14 s ± 36.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"13.3 ms ± 62.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"table_geoarrow = pq.read_table(\"microsoft-buildings-point.parquet\")\n", | |
"%timeit lon, lat = ga.point_coords(table_geoarrow[\"geometry\"])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1.98 s ± 31 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"# pyogrio returns geometry serialized to WKB\n", | |
"info, table_fgb = pyogrio.raw.read_arrow(\"microsoft-buildings-point.fgb\")\n", | |
"fgb_geometry = ga.wkb().wrap_array(table_fgb[\"wkb_geometry\"])\n", | |
"%timeit lon, lat = ga.point_coords(fgb_geometry)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from pyarrow import dataset\n", | |
"\n", | |
"dataset.write_dataset(\n", | |
" table_geoarrow, \n", | |
" \"buildings_dataset\",\n", | |
" format=\"parquet\",\n", | |
" partitioning=[\"src_file\"]\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"pyarrow.Table\n", | |
"geometry: extension<geoarrow.point<PointType>>\n", | |
"----\n", | |
"geometry: [ -- is_valid: all not null\n", | |
" -- child 0 type: double\n", | |
"[-86.77559019090732,-86.77567460569023,-86.77582324514928,-86.77575519077948,-86.77562696146198]\n", | |
" -- child 1 type: double\n", | |
"[33.401445966705616,34.76565992742417,33.57713996258858,33.75261157849485,33.351345320566175]]" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"dataset.dataset(\"buildings_dataset\").head(5)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"993 ms ± 79.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"# From terminal:\n", | |
"# python flight_example_server.py\n", | |
"import pyarrow.flight as flight\n", | |
"client = flight.connect(\"grpc://0.0.0.0:8815\")\n", | |
"\n", | |
"def put_flight(tab):\n", | |
" upload_descriptor = flight.FlightDescriptor.for_path(\"uploaded\")\n", | |
" writer, _ = client.do_put(upload_descriptor, tab.schema)\n", | |
" writer.write_table(tab)\n", | |
" writer.close()\n", | |
"\n", | |
"%timeit put_flight(table_geoarrow)\n", | |
"#> 700 ms ± 118 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"2.85 s ± 99.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"from pyarrow import feather\n", | |
"def put_file(tab):\n", | |
" feather.write_feather(tab, \"temp.arrow\", compression=\"uncompressed\")\n", | |
" feather.read_table(\"temp.arrow\")\n", | |
"\n", | |
"%timeit put_file(table_geoarrow)\n", | |
"#> 2.03 s ± 234 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": ".venv", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.2" | |
}, | |
"orig_nbformat": 4 | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import flight | |
from pyarrow import flight | |
# Register GeoArrow extension types to ensure types are propagated | |
import geoarrow.pyarrow as _ | |
class FlightServer(flight.FlightServerBase):
    """Minimal Flight server that accepts uploads and throws the data away.

    It exists only to demonstrate how fast a complete table can be streamed
    between a client and a server; nothing that is uploaded is stored.
    """

    def do_put(self, context, descriptor, reader, writer):
        """Handle a client upload by draining the incoming stream.

        Args:
            context: Call context supplied by the Flight framework (unused).
            descriptor: Descriptor the client attached to the upload (unused).
            reader: Incoming stream; read to completion with ``read_all()``.
            writer: Writer back to the client (unused).
        """
        # Consume the entire input (to demonstrate how fast the entire
        # input can be sent from client-to-server or vice versa). The
        # table returned by read_all() is intentionally discarded.
        reader.read_all()
if __name__ == "__main__":
    # Start the server only when this file is run as a script. The address
    # uses port 8815, the same port the benchmark notebook's client
    # connects to.
    server = FlightServer(location="grpc://0.0.0.0:8815")
    server.serve()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment