Created
May 22, 2023 18:51
-
-
Save cboettig/eaea4b2d9fa5f0315272a1d3051d72a7 to your computer and use it in GitHub Desktop.
Benchmark speed of opening vector of URLs on S3 bucket in arrow and duckdb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Benchmark how long arrow takes to open a remote parquet dataset on an
# S3-compatible bucket: first by handing arrow the bucket itself, then by
# handing it an explicit vector of file URIs.
library(arrow)

# Anonymous handle on the `parquet/aquatics` prefix of the bucket, reached
# through an overridden endpoint rather than the default one.
s3 <- s3_bucket(
  "neon4cast-scores/parquet/aquatics",
  endpoint_override = "data.ecoforecast.org",
  anonymous = TRUE
)

bench::bench_time( # very slow
  ds <- open_dataset(s3)
)

bench::bench_time( # very slow, but available via S3 Inventory
  all_paths <- s3$ls(recursive = TRUE)
)

# Keep only parquet files. The pattern is anchored at the end of the path so
# entries that merely contain ".parquet" somewhere in their name are dropped
# instead of being passed on as data files.
all_paths <- all_paths[grepl("[.]parquet$", all_paths)]

# Full URIs, each carrying the endpoint override as a query parameter.
uris <- paste0(
  "s3://neon4cast-scores/parquet/aquatics/",
  all_paths,
  "?endpoint_override=data.ecoforecast.org"
)

bench::bench_time( # very slow & uses lots of RAM!
  open_dataset(uris)
)
# Benchmark the same task in duckdb: expose the parquet files listed in
# `all_paths` as a view, read over S3 through the httpfs extension.
library(duckdb)
library(glue)

bench::bench_time({
  conn <- DBI::dbConnect(duckdb(), ":memory:")
  DBI::dbExecute(conn, "INSTALL 'httpfs';")
  DBI::dbExecute(conn, "LOAD 'httpfs';")

  # Point duckdb's S3 client at the same endpoint and use path-style URLs.
  endpoint <- "data.ecoforecast.org"
  DBI::dbExecute(conn, glue("SET s3_endpoint='{endpoint}';"))
  DBI::dbExecute(conn, "SET s3_url_style='path';")

  # Render the file list as a SQL list literal: ['s3://...','s3://...']
  s3_urls <- paste0("s3://neon4cast-scores/parquet/aquatics/", all_paths)
  parquet <- paste0("[", paste0("'", s3_urls, "'", collapse = ","), "]")

  tblname <- "forecast_subset"
  view_query <- glue(
    "CREATE VIEW '{tblname}' ",
    "AS SELECT * FROM parquet_scan({parquet}, HIVE_PARTITIONING=true);"
  )

  # CREATE VIEW is a statement, not a query: dbExecute() runs it and frees the
  # result, whereas dbSendQuery() would leave a pending result on `conn` that
  # nothing ever clears.
  DBI::dbExecute(conn, view_query)

  # tbl() is a dplyr generic and dplyr is never attached in this script, so it
  # is called through its namespace.
  # NOTE(review): a DBI-backed tbl presumably also needs dbplyr installed.
  ds <- dplyr::tbl(conn, tblname)
})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment