Skip to content

Instantly share code, notes, and snippets.

@kylebarron
Last active April 24, 2024 06:38
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save kylebarron/428c533528646344181d6fec9828a558 to your computer and use it in GitHub Desktop.
Save kylebarron/428c533528646344181d6fec9828a558 to your computer and use it in GitHub Desktop.
Preprocessing script for the "GeoParquet on the web" demo (https://observablehq.com/@kylebarron/geoparquet-on-the-web)
import geopandas as gpd
import pandas as pd
import pyarrow as pa
import pyarrow.feather as feather
import pyarrow.parquet as pq
import pygeos
import pyogrio
# Input file (Ookla open data: type=mobile, year=2019, quarter=1):
# https://ookla-open-data.s3.us-west-2.amazonaws.com/parquet/performance/type=mobile/year=2019/quarter=1/2019-01-01_performance_mobile_tiles.parquet
df = pd.read_parquet("2019-01-01_performance_mobile_tiles.parquet")

# Parse the WKT strings in the "tile" column and keep only each geometry's
# centroid.
centroids = pygeos.centroid(pygeos.from_wkt(df["tile"]))

# Save space by using a smaller data type: downcast="unsigned" asks pandas
# for the smallest unsigned integer dtype that can hold each column's values.
df_cols = ["avg_d_kbps", "avg_u_kbps", "avg_lat_ms"]
for col in df_cols:
    df[col] = pd.to_numeric(df[col], downcast="unsigned")

# From here on only the downcast attribute columns are kept.
df = df[df_cols]

# NOTE(review): the extra df[df_cols] selection looks redundant, but it hands
# GeoDataFrame its own frame instead of df itself, and df is reused later by
# construct_arrow_table. Keep it unless dropping it is confirmed to be safe.
gdf = gpd.GeoDataFrame(df[df_cols], geometry=centroids)
def construct_arrow_table(df: pd.DataFrame, geometry) -> pa.Table:
    """Build an Arrow table from ``df`` with an added "geometry" column.

    The coordinates of ``geometry`` are flattened and regrouped into
    fixed-size lists of length 2, which are appended to the table converted
    from ``df``.

    Note: in this quick example we omit metadata on the table header.

    Args:
        df: Attribute columns; converted with ``pa.Table.from_pandas``.
        geometry: Geometries accepted by ``pygeos.get_coordinates``.
            Presumably one point per row of ``df`` -- TODO confirm.

    Returns:
        The converted table with the extra "geometry" column appended.
    """
    attributes = pa.Table.from_pandas(df)
    xy = pygeos.get_coordinates(geometry)
    # Two consecutive flattened values form one fixed-size list entry.
    point_column = pa.FixedSizeListArray.from_arrays(xy.flat, 2)
    return attributes.append_column("geometry", point_column)
# GeoParquet
%%time
geoarrow_table = construct_arrow_table(df, centroids)
pq.write_table(
geoarrow_table,
"2019-01-01_performance_mobile_tiles_centroids_brotli.parquet",
row_group_size=len(geoarrow_table),
compression="brotli",
)
# CPU times: user 3.34 s, sys: 210 ms, total: 3.55 s
# Wall time: 3.61 s
# GeoPackage
%%time
pyogrio.write_dataframe(
gdf,
"2019-01-01_performance_mobile_tiles_centroids_vectorized.gpkg",
index=False,
driver="GPKG",
layer="layer1",
)
# CPU times: user 1min 16s, sys: 41.6 s, total: 1min 57s
# Wall time: 2min
%%time
gdf.to_file(
"2019-01-01_performance_mobile_tiles_centroids.gpkg",
index=False,
driver="GPKG",
layer="layer1",
)
# CPU times: user 5min 13s, sys: 48.3 s, total: 6min 1s
# Wall time: 6min 10s
# FlatGeobuf
%%time
pyogrio.write_dataframe(
gdf,
"2019-01-01_performance_mobile_tiles_centroids_vectorized.fgb",
index=False,
driver="FlatGeobuf",
spatial_index="NO",
)
# CPU times: user 12.2 s, sys: 730 ms, total: 13 s
# Wall time: 13.1 s
%%time
gdf.to_file(
"2019-01-01_performance_mobile_tiles_centroids.fgb",
index=False,
driver="FlatGeobuf",
spatial_index="NO",
)
# CPU times: user 3min 56s, sys: 2.99 s, total: 3min 59s
# Wall time: 4min 2s
# GeoJSON
%%time
with open("2019-01-01_performance_mobile_tiles_centroids.json", "w") as f:
f.write(gdf.to_json())
# Shapefile
%%time
pyogrio.write_dataframe(
gdf, "2019-01-01_performance_mobile_tiles_centroids_vectorized.shp"
)
# CPU times: user 19.7 s, sys: 12.5 s, total: 32.1 s
# Wall time: 32.7 s
%%time
gdf.to_file(
"2019-01-01_performance_mobile_tiles_centroids.shp",
index=False,
driver="ESRI Shapefile",
)
# CPU times: user 4min 44s, sys: 27.9 s, total: 5min 12s
# Wall time: 5min 33s
# Additional time to zip on the command line
# time zip out.zip 2019-01-01_performance_mobile_tiles_centroids.shp 2019-01-01_performance_mobile_tiles_centroids.shx 2019-01-01_performance_mobile_tiles_centroids.cpg 2019-01-01_performance_mobile_tiles_centroids.dbf
# 12 seconds
# Total time: 5min 45s
# Feather/Arrow IPC File
# Force uncompressed because Arrow JS doesn't yet support compressions
%%time
geoarrow_table = construct_arrow_table(df, centroids)
feather.write_feather(
geoarrow_table,
"2019-01-01_performance_mobile_tiles_centroids.feather",
compression="uncompressed",
)
# CPU times: user 578 ms, sys: 142 ms, total: 720 ms
# Wall time: 752 ms
# Additional time with gzip -c: 4.6s
# Total time: 5.3s
@theroggy
Copy link

It would probably make more sense to pass spatial_index="NO" for all formats, so that the comparisons are more meaningful...

E.g. for GeoPackage this makes a huge difference...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment