Skip to content

Instantly share code, notes, and snippets.

@cboettig
cboettig / arrow-duckdb-uri-benchmarks.R
Created May 22, 2023 18:51
Benchmark the speed of opening a vector of URLs on an S3 bucket in arrow and duckdb
library(arrow)
# Anonymous handle to the aquatics scores prefix, reached through the
# data.ecoforecast.org endpoint override.
s3 <- s3_bucket(
  "neon4cast-scores/parquet/aquatics",
  endpoint_override = "data.ecoforecast.org",
  anonymous = TRUE
)

# Time how long it takes arrow to open the dataset on the bucket (very slow).
bench::bench_time(
  ds <- open_dataset(s3)
)
bench::bench_time( # very slow, but available via S3 Inventory
library(rstac)
library(gdalcubes)
# Spatial extent for the search: the bounding box of the row of
# spData::world whose name_long is "India".
loc <- dplyr::filter(spData::world, name_long == "India")
box <- sf::st_bbox(loc)
matches <-
stac("https://planetarycomputer.microsoft.com/api/stac/v1") |>
stac_search(collections = "io-biodiversity",
library(tidyverse)
# Read the Living Planet Index table straight from the download URL.
# NOTE(review): the URL embeds a session id, so it may stop resolving — confirm.
direct <- read_csv(
  "https://www.livingplanetindex.org/session/2ccf2bec98c16e6ee8f9ed6bbd8a514d/download/downloadData?w="
)

# LPI_final against Year as a line, with a translucent blue band spanning
# CI_low to CI_high.
ggplot(direct, aes(Year, LPI_final)) +
  geom_line() +
  geom_ribbon(aes(ymin = CI_low, ymax = CI_high), fill = "blue", alpha = 0.1)
direct |> rename(year = Year) |> mutate(sample = LPI_final + ((CI_high - CI_low)/2) * (runif(n(), -10,10))/10 ) |>
ggplot(aes(x = year, y = 1, fill = sample)) +
geom_tile(show.legend = FALSE) +
#scale_fill_steps2(low = "#c7cca5", mid="#ddff03", high="#53de02", midpoint = median(lpi_df$LPI_final)) +
scale_fill_stepsn(colors = c("#c7cca5", "#ddff03", "#53de02"), values = rescale(c(min(lpi_df$sample), 0, max(lpi_df$sample))),
library(duckdb)
library(DBI)
library(glue)
library(dplyr)
# In-memory DuckDB connection configured with a 12GB memory_limit and
# temp_directory set to /tmp.
conn <- DBI::dbConnect(
  duckdb(),
  ":memory:",
  config = list(
    memory_limit = "12GB",
    temp_directory = "/tmp"
  )
)

# Install, then load, the httpfs extension on this connection.
DBI::dbExecute(conn, "INSTALL 'httpfs';")
DBI::dbExecute(conn, "LOAD 'httpfs';")
#url <- "https://dap.tern.org.au/thredds/ncss/ecosystem_process/ozflux/AdelaideRiver/2022_v2/L5/default/AdelaideRiver_L5_20071017_20090524.nc"
library(tidync)
library(lubridate)
library(dplyr)
# OPeNDAP (dodsC) address of the AdelaideRiver L5 NetCDF file.
url <- "https://dap.tern.org.au/thredds/dodsC/ecosystem_process/ozflux/AdelaideRiver/2022_v2/L5/default/AdelaideRiver_L5_20071017_20090524.nc"

# peek at metadata for time units: keep only the attributes of the "time"
# variable and flatten `value` (presumably a list column — confirm).
nc_atts(url) |>
  filter(variable == "time") |>
  mutate(value = unlist(value))
library(rstac)
library(gdalcubes)
library(stars)
library(tmap)
## STAC Search over 400 million assets.
# Search window: a named bounding box (xmin/ymin/xmax/ymax) plus the start
# and end dates as "YYYY-MM-DD" strings.
box <- c(
  xmin = -122.51006,
  ymin = 37.70801,
  xmax = -122.36268,
  ymax = 37.80668
)
start_date <- "2022-06-01"
end_date <- "2022-08-01"
# remotes::install_github("OldLipe/rstac@b-0.9.1")
# remotes::install_github("appelmar/gdalcubes_R")
# remotes::install_github("r-spatial/stars")
library(tidyverse)
library(rstac)
library(gdalcubes)
library(tmap)
library(stars)
# Load the `World` dataset into the workspace.
# NOTE(review): presumably the example dataset shipped with tmap — confirm.
data(World) #polygons
library(gdalcubes)
library(gefs4cast) # remotes::install_github("neon4cast/gefs4cast")
library(stringr)
library(lubridate)
## 3-hr period up to 10 days, (then every 6 hrs up to 35 day horizon)
# Time one gefs_cog() run for yesterday's date with ens_avg = TRUE.
# NOTE(review): max_horizon = 240 is presumably in hours (10 days) and
# "~/gefs_cog" presumably the output directory — confirm against gefs4cast.
system.time(
  gefs_cog(
    "~/gefs_cog",
    ens_avg = TRUE,
    max_horizon = 240,
    date = Sys.Date() - 1
  )
)
# 000 file has different bands and so cannot be stacked into the collection
@cboettig
cboettig / knitr_defaults.R
Last active November 30, 2022 09:16
My common knitr defaults
# My preferred defaults (may be changed in individual chunks)
# `knitr::` is spelled out so this works even when knitr is loaded but not
# attached with library(knitr).
knitr::opts_chunk$set(
  tidy = FALSE, warning = FALSE, message = FALSE, cache = TRUE,
  comment = NA, verbose = TRUE, fig.width = 6, fig.height = 4
)

# Name the cache path and fig.path based on filename...
# For an input "report.Rmd" this gives fig.path "figure/report-" and
# cache.path "report/".
# local() keeps the helper variables out of the user's workspace; invisible()
# keeps the snippet silent at top level.
invisible(local({
  # NOTE(review): knit_concord is a knitr internal; knitr::current_input() is
  # presumably the public equivalent — confirm before switching.
  infile <- knitr:::knit_concord$get("infile")

  # Only derive paths when an input file is registered; with no input file the
  # pasted cache.path would collapse to "/".
  if (!is.null(infile)) {
    # Strip only a trailing ".Rmd"/".rmd" extension. The dot is escaped and the
    # pattern anchored so names such as "myRmd-notes.Rmd" are not mangled.
    stem <- sub("\\.[Rr]md$", "", infile)
    knitr::opts_chunk$set(
      fig.path = paste0("figure/", stem, "-"),
      cache.path = paste0(stem, "/")
    )
  }
}))
library(rstac)
library(gdalcubes)
## STAC Search over 400 million assets.
# Named bounding box (xmin/ymin/xmax/ymax) used as the spatial filter.
box <- c(
  xmin = -122.51006,
  ymin = 37.70801,
  xmax = -122.36268,
  ymax = 37.80668
)
items =
stac("https://earth-search.aws.element84.com/v0/") |>
stac_search(collections = "sentinel-s2-l2a-cogs",
bbox = box,
datetime = "2020-06-01/2020-08-01",