Skip to content

Instantly share code, notes, and snippets.

@cboettig
Created May 22, 2023 18:51
Show Gist options
  • Save cboettig/eaea4b2d9fa5f0315272a1d3051d72a7 to your computer and use it in GitHub Desktop.
Save cboettig/eaea4b2d9fa5f0315272a1d3051d72a7 to your computer and use it in GitHub Desktop.
Benchmark the speed of opening a vector of URLs on an S3 bucket in arrow and duckdb
library(arrow)
# Benchmark: open the parquet files under an S3 prefix with arrow, first by
# letting arrow discover the files, then by passing explicit URIs.
s3 <- s3_bucket(
  "neon4cast-scores/parquet/aquatics",
  endpoint_override = "data.ecoforecast.org",
  anonymous = TRUE
)

# 1. Open the dataset straight from the bucket handle (very slow).
bench::bench_time(
  ds <- open_dataset(s3)
)

# 2. Recursive listing of the prefix (very slow, but the same information is
#    available via S3 Inventory).
bench::bench_time(
  all_paths <- s3$ls(recursive = TRUE)
)

# Keep only entries whose name *ends* in ".parquet"; the pattern is anchored so
# that entries merely containing ".parquet" (e.g. "x.parquet.tmp") are dropped.
all_paths <- all_paths[grepl("[.]parquet$", all_paths)]

# Turn each listed path into a full s3:// URI carrying the custom endpoint.
# NOTE(review): assumes `s3$ls()` returns paths relative to the prefix above.
uris <- paste0(
  "s3://neon4cast-scores/parquet/aquatics/", all_paths,
  "?endpoint_override=data.ecoforecast.org"
)

# 3. Open the dataset from the explicit URI vector (very slow & uses lots of
#    RAM!).
bench::bench_time(
  open_dataset(uris)
)
library(duckdb)
library(glue)
# Benchmark: expose the same parquet files as a lazy DuckDB view over S3.
# Relies on `all_paths` (the parquet paths listed earlier in this script).
bench::bench_time({
  conn <- DBI::dbConnect(duckdb(), ":memory:")

  # The httpfs extension is what lets DuckDB read s3:// URLs.
  DBI::dbExecute(conn, "INSTALL 'httpfs';")
  DBI::dbExecute(conn, "LOAD 'httpfs';")

  # Point DuckDB at the custom S3 endpoint, using path-style URLs.
  endpoint <- "data.ecoforecast.org"
  DBI::dbExecute(conn, glue::glue("SET s3_endpoint='{endpoint}';"))
  DBI::dbExecute(conn, "SET s3_url_style='path';")

  # Build a SQL list literal of quoted URLs: ['s3://...','s3://...',...]
  s3_paths <- paste0("s3://neon4cast-scores/parquet/aquatics/", all_paths)
  parquet <- paste0("[", paste0("'", s3_paths, "'", collapse = ","), "]")

  tblname <- "forecast_subset"
  view_query <- glue::glue(
    "CREATE VIEW '{tblname}' ",
    "AS SELECT * FROM parquet_scan({parquet}, HIVE_PARTITIONING=true);"
  )

  # dbExecute() rather than dbSendQuery(): the latter returns a result object
  # that must be released with dbClearResult(), which was never done here.
  DBI::dbExecute(conn, view_query)

  # tbl() comes from dplyr, which this script never attaches; qualify it so
  # the call works without a prior library(dplyr).
  ds <- dplyr::tbl(conn, tblname)
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment