Last active
August 31, 2024 04:55
-
-
Save kylebarron/428c533528646344181d6fec9828a558 to your computer and use it in GitHub Desktop.
preprocessing script for geoparquet on the web demo (https://observablehq.com/@kylebarron/geoparquet-on-the-web)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import geopandas as gpd | |
import pandas as pd | |
import pyarrow as pa | |
import pyarrow.feather as feather | |
import pyarrow.parquet as pq | |
import pygeos | |
import pyogrio | |
# Input file, downloaded from:
# https://ookla-open-data.s3.us-west-2.amazonaws.com/parquet/performance/type=mobile/year=2019/quarter=1/2019-01-01_performance_mobile_tiles.parquet
df = pd.read_parquet("2019-01-01_performance_mobile_tiles.parquet")

# The "tile" column is parsed as WKT; only each tile's centroid is kept as
# the geometry used by every writer below.
centroids = pygeos.centroid(pygeos.from_wkt(df["tile"]))

# Save space by using a smaller data type
df_cols = ["avg_d_kbps", "avg_u_kbps", "avg_lat_ms"]
for column in df_cols:
    df[column] = pd.to_numeric(df[column], downcast="unsigned")

# Drop everything except the three metric columns.
df = df[df_cols]

# The GeoDataFrame is built from a fresh selection of `df`, so `df` itself
# stays a plain attribute table for construct_arrow_table().
gdf = gpd.GeoDataFrame(df[df_cols], geometry=centroids)
def construct_arrow_table(df: pd.DataFrame, geometry) -> pa.Table:
    """Build an Arrow table from ``df`` with an appended "geometry" column.

    Args:
        df: Attribute columns; converted with ``pa.Table.from_pandas``.
        geometry: Geometries passed to ``pygeos.get_coordinates``.
            NOTE(review): assumes exactly one 2D point per row of ``df`` --
            confirm for any caller that does not pass point centroids.

    Returns:
        The converted table with a "geometry" column appended, stored as a
        fixed-size list array with list size 2.
    """
    # Note in this quick example we omit metadata on the table header
    table = pa.Table.from_pandas(df)
    coords = pygeos.get_coordinates(geometry)
    # ravel() gives pyarrow a real 1-D ndarray that it can convert in bulk;
    # a `.flat` iterator would instead be consumed one Python object at a time.
    parr = pa.FixedSizeListArray.from_arrays(coords.ravel(), 2)
    return table.append_column("geometry", parr)
# GeoParquet | |
%%time | |
geoarrow_table = construct_arrow_table(df, centroids) | |
pq.write_table( | |
geoarrow_table, | |
"2019-01-01_performance_mobile_tiles_centroids_brotli.parquet", | |
row_group_size=len(geoarrow_table), | |
compression="brotli", | |
) | |
# CPU times: user 3.34 s, sys: 210 ms, total: 3.55 s | |
# Wall time: 3.61 s | |
# GeoPackage | |
%%time | |
pyogrio.write_dataframe( | |
gdf, | |
"2019-01-01_performance_mobile_tiles_centroids_vectorized.gpkg", | |
index=False, | |
driver="GPKG", | |
layer="layer1", | |
) | |
# CPU times: user 1min 16s, sys: 41.6 s, total: 1min 57s | |
# Wall time: 2min | |
%%time | |
gdf.to_file( | |
"2019-01-01_performance_mobile_tiles_centroids.gpkg", | |
index=False, | |
driver="GPKG", | |
layer="layer1", | |
) | |
# CPU times: user 5min 13s, sys: 48.3 s, total: 6min 1s | |
# Wall time: 6min 10s | |
# FlatGeobuf | |
%%time | |
pyogrio.write_dataframe( | |
gdf, | |
"2019-01-01_performance_mobile_tiles_centroids_vectorized.fgb", | |
index=False, | |
driver="FlatGeobuf", | |
spatial_index="NO", | |
) | |
# CPU times: user 12.2 s, sys: 730 ms, total: 13 s | |
# Wall time: 13.1 s | |
%%time | |
gdf.to_file( | |
"2019-01-01_performance_mobile_tiles_centroids.fgb", | |
index=False, | |
driver="FlatGeobuf", | |
spatial_index="NO", | |
) | |
# CPU times: user 3min 56s, sys: 2.99 s, total: 3min 59s | |
# Wall time: 4min 2s | |
# GeoJSON | |
%%time | |
with open("2019-01-01_performance_mobile_tiles_centroids.json", "w") as f: | |
f.write(gdf.to_json()) | |
# Shapefile | |
%%time | |
pyogrio.write_dataframe( | |
gdf, "2019-01-01_performance_mobile_tiles_centroids_vectorized.shp" | |
) | |
# CPU times: user 19.7 s, sys: 12.5 s, total: 32.1 s | |
# Wall time: 32.7 s | |
%%time | |
gdf.to_file( | |
"2019-01-01_performance_mobile_tiles_centroids.shp", | |
index=False, | |
driver="ESRI Shapefile", | |
) | |
# CPU times: user 4min 44s, sys: 27.9 s, total: 5min 12s | |
# Wall time: 5min 33s | |
# Additional time to zip on the command line | |
# time zip out.zip 2019-01-01_performance_mobile_tiles_centroids.shp 2019-01-01_performance_mobile_tiles_centroids.shx 2019-01-01_performance_mobile_tiles_centroids.cpg 2019-01-01_performance_mobile_tiles_centroids.dbf | |
# 12 seconds | |
# Total time: 5min 45s | |
# Feather/Arrow IPC File | |
# Force uncompressed because Arrow JS doesn't yet support compressions | |
%%time | |
geoarrow_table = construct_arrow_table(df, centroids) | |
feather.write_feather( | |
geoarrow_table, | |
"2019-01-01_performance_mobile_tiles_centroids.feather", | |
compression="uncompressed", | |
) | |
# CPU times: user 578 ms, sys: 142 ms, total: 720 ms | |
# Wall time: 752 ms | |
# Additional time with gzip -c: 4.6s | |
# Total time: 5.3s |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It would probably be more logical to pass spatial_index="NO" for all formats, to get more usable comparisons.
For GeoPackage, for example, this makes a huge difference.