Skip to content

Instantly share code, notes, and snippets.

@cboettig
Created September 2, 2023 03:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cboettig/a2a52d1cb4c744e397b8417438ae200b to your computer and use it in GitHub Desktop.
Save cboettig/a2a52d1cb4c744e397b8417438ae200b to your computer and use it in GitHub Desktop.
recursive parsing of data.source.coop prefix lists
# Note: this JSON-crawling approach compares unfavorably (it is much slower)
# to arrow's s3$ls(recursive = TRUE); see benchmarks() below.
#' Recursively list every path beneath a JSON prefix listing
#'
#' Reads the JSON listing at `directory_path` with `jsonlite::read_json()`,
#' collects the entries reported by `list_paths()`, and, when `recursive` is
#' `TRUE` and the listing is non-empty, repeats the process for each element
#' of the listing's `prefixes` field.
#'
#' @param directory_path Location of the JSON listing. Sub-listing locations
#'   are built with `paste0(directory_path, prefix)`, so this presumably has
#'   to end in "/" -- TODO confirm against the service.
#' @param recursive Descend into sub-prefixes? Defaults to `TRUE`.
#' @return An unnamed character vector: the entries of this listing followed
#'   by the entries of each sub-listing, in the order the prefixes appear.
list_directory_tree <- function(directory_path, recursive = TRUE) {
  meta <- jsonlite::read_json(directory_path)
  entries <- list_paths(meta, directory_path)

  # Collect each sub-listing once instead of growing a vector with c() on
  # every iteration, which re-copies everything accumulated so far.
  nested <- list()
  if (recursive && length(entries) > 0) {
    nested <- lapply(
      meta$prefixes,
      function(subdir) {
        list_directory_tree(paste0(directory_path, subdir), recursive)
      }
    )
  }

  # Flatten in one pass and force a character result, so the return type does
  # not depend on whether any intermediate piece happened to be a list.
  as.character(unlist(c(list(entries), nested), use.names = FALSE))
}
#' Extract the URL of every object record in a parsed listing
#'
#' @param meta A parsed listing (a list). Its `objects` element is expected
#'   to hold one record per object, each carrying a `url` field.
#' @return A character vector with one URL per record in `meta$objects`
#'   (zero-length when there are no records).
list_objects <- function(meta) {
  records <- meta$objects
  url_of <- function(record) record[["url"]]
  # vapply() enforces that every record yields exactly one string.
  vapply(records, url_of, character(1))
}
#' List the immediate entries (sub-prefixes and object URLs) of a listing
#'
#' @param meta A parsed listing (a list) with optional `prefixes` and
#'   `objects` elements. Each record in `objects` must carry a `url` field.
#' @param directory_path String prepended to every prefix to form its full
#'   path.
#' @return A character vector: `directory_path` pasted onto each prefix,
#'   followed by the `url` of each object. Always a character vector, even
#'   when `prefixes` is an empty list.
list_paths <- function(meta, directory_path) {
  # jsonlite::read_json() leaves JSON arrays as lists by default, so an empty
  # `prefixes` array arrives as list(). Combining that with c() would turn
  # the whole result into a list; coerce to character first.
  prefixes <- as.character(unlist(meta$prefixes, use.names = FALSE))
  if (length(prefixes) > 0) {
    prefixes <- paste0(directory_path, prefixes)
  }
  objects <- vapply(meta$objects, `[[`, character(1), "url")
  c(prefixes, objects)
}
## Benchmark: the recursive JSON listing is ridiculously slow next to arrow's s3$ls()
#' Time the recursive JSON listing against arrow's recursive S3 listing
#'
#' Runs `list_directory_tree()` on the aquatics parquet prefix of
#' eco4cast/neon4cast-scores, then lists the same prefix through
#' `arrow::s3_bucket()` and its `ls(recursive = TRUE)` method. Each run is
#' wrapped in `bench::bench_time()`. Both runs need network access.
#'
#' @return An unnamed list of length two: the `bench::bench_time()` result
#'   for the JSON listing, then the result for the arrow listing.
benchmarks <- function() {
  directory_path <-
    "https://data.source.coop/eco4cast/neon4cast-scores/parquet/aquatics/"
  json_time <- bench::bench_time({
    list_directory_tree(directory_path)
  })

  # All arrow calls are namespace-qualified, so the package is not attached
  # here; attaching it would change the caller's search path as a side effect.
  base <- "s3://anonymous@us-west-2.opendata.source.coop"
  repo <- "eco4cast/neon4cast-scores"
  theme <- "aquatics"
  uri <- glue::glue("{base}/{repo}/parquet/{theme}?region=us-west-2")
  arrow_time <- bench::bench_time({
    s3 <- arrow::s3_bucket(uri)
    s3$ls(recursive = TRUE)
  })

  list(json_time, arrow_time)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment