
cboettig / duckdb-remotes.R
Created September 1, 2022 20:18
S3- and HTTP-based access with duckdb
library(duckdb)
library(DBI)
library(glue)
library(dplyr)
conn <- DBI::dbConnect(duckdb(), ":memory:",
                       config = list("memory_limit" = "12GB",
                                     "temp_directory" = "/tmp"))
DBI::dbExecute(conn, "INSTALL 'httpfs';")
DBI::dbExecute(conn, "LOAD 'httpfs';")
cboettig / rspm-focal.R
Created August 23, 2022 19:32
Ubuntu Focal RSPM config
options(HTTPUserAgent = "R/4.2.1 R (4.2.1 x86_64-pc-linux-gnu x86_64 linux-gnu)",
        repos = c(CRAN = "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"))
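With the user agent and repository set, install.packages() should resolve to prebuilt Focal binaries from RSPM rather than compiling from source, e.g.:
install.packages("duckdb")  # example package arbitrary; installs a Linux binary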
cboettig / better-scoring.R
Last active August 5, 2022 04:54
Scoring forecasts that use arbitrary probability distributions
generic_mean <- function(family, parameter, predicted) {
  names(predicted) <- parameter
  switch(unique(family),
         norm = predicted["mean"],
         sample = mean(predicted)
  )
}
generic_sd <- function(family, parameter, predicted) {
  ## assumed completion, mirroring generic_mean (the preview truncates here)
  names(predicted) <- parameter
  switch(unique(family),
         norm = predicted["sd"],
         sample = sd(predicted)
  )
}
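A usage sketch with made-up numbers (not part of the gist):
## parametric forecast: `predicted` holds the distribution parameters
generic_mean("norm", parameter = c("mean", "sd"), predicted = c(20.0, 2.5))  # 20.0
generic_sd("norm", parameter = c("mean", "sd"), predicted = c(20.0, 2.5))    # 2.5
## ensemble forecast: `predicted` holds the samples themselves
generic_mean("sample", parameter = NULL, predicted = rnorm(100, 20, 2.5))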
## using terra::animate(), but no idea how to save that to a gif?
library(terra)
library(viridisLite)
library(dplyr)
library(gbifdb)
db <- gbif_remote()
df <- db |>
  mutate(latitude = round(decimallatitude),
         longitude = round(decimallongitude)) |>
  count(longitude, latitude, year) |>
  collect()  ## assumed final step; the preview truncates on the dangling pipe
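One possible route from these counts to an animation, sketched under assumptions (the 1-degree template, rasterize() step, and animate() usage are not from the gist):
## build one layer per year on a global 1-degree template
template <- terra::rast()  # defaults to a global 1-degree grid
years <- sort(unique(df$year))
layers <- lapply(years, function(y) {
  pts <- terra::vect(df[df$year == y, ], geom = c("longitude", "latitude"))
  terra::rasterize(pts, template, field = "n")
})
r <- do.call(c, layers)
names(r) <- years
terra::animate(r, col = viridisLite::viridis(10))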
library(arrow)
library(dplyr)
library(terra)
path <- arrow::s3_bucket("ebird/observations", endpoint_override = "minio.carlboettiger.info")
obs <- arrow::open_dataset(path) |> to_duckdb()
df <- obs |>
  mutate(latitude = round(latitude, digits = 1),
         longitude = round(longitude, digits = 1)) |>
  count(longitude, latitude) |>
  collect() |>
  mutate(n = log(n))
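terra is loaded above but unused in the preview; a hypothetical next step rasterizes the 0.1-degree log-counts and plots them:
## counts rounded to 0.1 degree form a (sparse) regular grid, so xyz works
r <- terra::rast(df[, c("longitude", "latitude", "n")], type = "xyz")
terra::plot(r)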
library(tidyverse)
#download manually: "https://www.pnas.org/doi/suppl/10.1073/pnas.2113862119/suppl_file/pnas.2113862119.sd01.csv"
df <- read_csv("pnas.2113862119.sd01.csv", skip = 2, na = c("", "#N/A"),
               col_names = c("species", "birdlife_min", "birdlife_max", "matched",
                             "calaghan_min", "calaghan_mean", "calaghan_max"),
               col_types = "cddcddd")
outside <- df |> na.omit() |>
  ## assumed completion (preview truncated): flag species whose Callaghan
  ## mean estimate falls outside the BirdLife min-max range
  filter(calaghan_mean < birdlife_min | calaghan_mean > birdlife_max)
cboettig / weather.R
Created April 1, 2022 03:30
Weather forecasts with GDAL VSI
# Layers correspond to this table:
# https://www.nco.ncep.noaa.gov/pmb/products/gens/geavg.t00z.pgrb2a.0p50.f000.shtml
# Data naming conventions see: https://www.nco.ncep.noaa.gov/pmb/products/gens/
library(terra)
base <- "https://noaa-gefs-pds.s3.amazonaws.com/"
date <- "20220316"
cycle <- "00" # 00, 06, 12, 18 hr issued
horizon <- "000" # 000:384 hrs ahead
series <- "atmos"
library(rstac)
library(gdalcubes)
## STAC Search over 400 million assets.
box <- c(xmin=-122.51006, ymin=37.70801, xmax=-122.36268, ymax=37.80668)
items =
  stac("https://earth-search.aws.element84.com/v0/") |>
  stac_search(collections = "sentinel-s2-l2a-cogs",
              bbox = box,
              datetime = "2020-06-01/2020-08-01",
              limit = 100) |>  ## `limit` and the request call are assumed; preview truncated
  post_request()
cboettig / eml_methods.xml
Created March 1, 2022 18:43
Forecast standards
<?xml version="1.0"?>
<met:methods
xmlns:met="https://eml.ecoinformatics.org/methods-2.2.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="https://eml.ecoinformatics.org/methods-2.2.0 eml-methods.xsd">
<methodStep>
<description><para>
`temperature` is defined as the average surface water temperature measured in the top 1 m of the water column, using the 30-minute averages of `tsdWaterTempMean` from the `TSD_30_min` table of NEON product `DP1.20053.001`, which is measured by a platinum resistance thermometer. NEON data collection and instrument information is documented at https://data.neonscience.org/data-products/DP1.20053.001.
</para></description>
</methodStep>
</met:methods>

This benchmark considers the case of reading a few thousand csv files (all using a common schema) into a single table. The data is about 2.4 million rows: roughly 100 MB as parquet, or 400 MB as a single uncompressed csv.

We compare direct access over the POSIX filesystem to S3-based access over a local network (i.e. the same server hosts both the MinIO bucket and the RStudio instance on which the tests run).

We also compare these times against first serializing the collected csv files into a single file, as parquet, csv, or compressed .csv.gz, and reading that.

Note that the goal here is not a comparison of arrow vs readr, but a comparison of the costs of local network-based (S3) access against direct filesystem access.
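A minimal sketch of the two access paths being timed (the directory, bucket, and endpoint names are placeholders):
library(arrow)
library(dplyr)

## direct POSIX filesystem access
posix <- system.time({
  df <- open_dataset("data/csv-dir", format = "csv") |> collect()
})

## S3-based access to the same files via a local MinIO endpoint
bucket <- s3_bucket("benchmark/csv-dir", endpoint_override = "minio.example.com")
s3 <- system.time({
  df <- open_dataset(bucket, format = "csv") |> collect()
})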