Skip to content

Instantly share code, notes, and snippets.

@paleolimbot
Last active October 5, 2023 20:19
Show Gist options
  • Save paleolimbot/9fcdb8e8fee301db6c68d2e32efd54c6 to your computer and use it in GitHub Desktop.
Save paleolimbot/9fcdb8e8fee301db6c68d2e32efd54c6 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Setup the environment:\n",
"\n",
"```bash\n",
"pip install geoarrow-pyarrow pyogrio\n",
"```\n",
"\n",
"Download the files:\n",
"\n",
"```bash\n",
"curl -L https://github.com/geoarrow/geoarrow-data/releases/download/v0.1.0/microsoft-buildings-point.fgb.zip -o microsoft-buildings-point.fgb.zip\n",
"curl -L https://github.com/geoarrow/geoarrow-data/releases/download/v0.1.0/microsoft-buildings-point.arrow -o microsoft-buildings-point.arrow\n",
"unzip microsoft-buildings-point.fgb.zip\n",
"```\n",
"\n",
"Create the Parquet version we use in the post:\n",
"\n",
"```python\n",
"# Create the parquet version. Using uncompressed because the .fgb is also uncompressed.\n",
"import geoarrow.pyarrow as ga\n",
"import pyarrow.feather\n",
"import pyarrow.parquet\n",
"tab = pyarrow.feather.read_table(\"microsoft-buildings-point.arrow\")\n",
"pyarrow.parquet.write_table(tab, \"microsoft-buildings-point.parquet\", compression=\"none\")\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-rw-r--r-- 1 dewey staff 9.7G Sep 28 16:21 microsoft-buildings-point.fgb\n",
"-rw-r--r-- 1 dewey staff 2.0G Oct 5 17:07 microsoft-buildings-point.parquet\n"
]
}
],
"source": [
"! ls -lh microsoft-buildings-point.fgb microsoft-buildings-point.parquet"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"17.5 s ± 112 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
"1.87 s ± 185 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"import pyogrio\n",
"%timeit pyogrio.raw.read_arrow(\"microsoft-buildings-point.fgb\")\n",
"#> 17.7 s ± 133 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
"\n",
"import geoarrow.pyarrow as ga\n",
"import pyarrow.parquet as pq\n",
"%timeit pq.read_table(\"microsoft-buildings-point.parquet\")\n",
"#> 1.14 s ± 36.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"13.3 ms ± 62.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"table_geoarrow = pq.read_table(\"microsoft-buildings-point.parquet\")\n",
"%timeit lon, lat = ga.point_coords(table_geoarrow[\"geometry\"])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.98 s ± 31 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"# pyogrio returns geometry serialized to WKB\n",
"info, table_fgb = pyogrio.raw.read_arrow(\"microsoft-buildings-point.fgb\")\n",
"fgb_geometry = ga.wkb().wrap_array(table_fgb[\"wkb_geometry\"])\n",
"%timeit lon, lat = ga.point_coords(fgb_geometry)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from pyarrow import dataset\n",
"\n",
"dataset.write_dataset(\n",
" table_geoarrow, \n",
" \"buildings_dataset\",\n",
" format=\"parquet\",\n",
" partitioning=[\"src_file\"]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"pyarrow.Table\n",
"geometry: extension<geoarrow.point<PointType>>\n",
"----\n",
"geometry: [ -- is_valid: all not null\n",
" -- child 0 type: double\n",
"[-86.77559019090732,-86.77567460569023,-86.77582324514928,-86.77575519077948,-86.77562696146198]\n",
" -- child 1 type: double\n",
"[33.401445966705616,34.76565992742417,33.57713996258858,33.75261157849485,33.351345320566175]]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.dataset(\"buildings_dataset\").head(5)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"993 ms ± 79.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"# From terminal:\n",
"# python flight_example_server.py\n",
"import pyarrow.flight as flight\n",
"client = flight.connect(\"grpc://0.0.0.0:8815\")\n",
"\n",
"def put_flight(tab):\n",
" upload_descriptor = flight.FlightDescriptor.for_path(\"uploaded\")\n",
" writer, _ = client.do_put(upload_descriptor, tab.schema)\n",
" writer.write_table(tab)\n",
" writer.close()\n",
"\n",
"%timeit put_flight(table_geoarrow)\n",
"#> 700 ms ± 118 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2.85 s ± 99.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"from pyarrow import feather\n",
"def put_file(tab):\n",
" feather.write_feather(tab, \"temp.arrow\", compression=\"uncompressed\")\n",
" feather.read_table(\"temp.arrow\")\n",
"\n",
"%timeit put_file(table_geoarrow)\n",
"#> 2.03 s ± 234 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
# Import flight
from pyarrow import flight
# Register GeoArrow extension types to ensure types are propagated
import geoarrow.pyarrow as _
class FlightServer(flight.FlightServerBase):
    """Flight server used to time client-to-server uploads.

    Only ``do_put`` is overridden; every other Flight endpoint keeps
    whatever behaviour ``flight.FlightServerBase`` provides.
    """

    def do_put(self, context, descriptor, reader, writer):
        """Handle an upload by reading the whole stream and discarding it.

        ``context``, ``descriptor`` and ``writer`` are accepted to satisfy the
        handler signature but are not used here.
        """
        # Drain everything the client sends so that the timing on the client
        # side reflects transferring the complete input; the data itself is
        # intentionally not kept.
        reader.read_all()
if __name__ == "__main__":
    # Same address and port that the notebook's client passes to
    # flight.connect().
    location = "grpc://0.0.0.0:8815"
    server = FlightServer(location=location)
    server.serve()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment