jhpoelen/count_geese.R

## count_geese.R

# Registers the republished eBird 2018 archive with contentid and returns
# whatever contentid::register() returns for it.
#
# The archive used to live at
#   http://ebirddata.ornith.cornell.edu/downloads/gbiff/dwca-1.0.zip
# Unfortunately, that eBird URL no longer works, but, using a time machine,
# we went back in time and republished the data via Zenodo from 2018.
prepare_ebird_2018_id <- function() {
  # Only the Zenodo copy is registered; the defunct eBird URL is documented
  # above instead of being assigned and immediately overwritten.
  ebird_data_location <- "https://zenodo.org/record/3858251/files/dwca-1.0.zip"

  ebird_data_id <- contentid::register(ebird_data_location)
  ebird_data_id
}

# Registers the republished eBird 2019 archive with contentid and returns
# whatever contentid::register() returns for it.
#
# The archive used to live at
#   http://ebirddata.ornith.cornell.edu/downloads/gbiff/dwca-1.0.zip
# Unfortunately, that eBird URL no longer works, but, using a time machine,
# we went back in time and republished the data via Zenodo from 2019.
prepare_ebird_2019_id <- function() {
  # Only the Zenodo copy is registered; the defunct eBird URL is documented
  # above instead of being assigned and immediately overwritten.
  ebird_data_location <- "https://zenodo.org/record/3858443/files/dwca-1.0.zip"

  ebird_data_id <- contentid::register(ebird_data_location)
  ebird_data_id
}

# Counts the lines read from a connection and how many of them mention
# Canadian Geese ("Branta canadensis").
#
# The connection is read in batches so the whole stream never has to be held
# in memory. A "." is printed for every million lines read and "done" at the
# end.
#
# read_con: a connection that has not been opened yet (e.g. from unz() or
#           pipe()); it is opened in mode "rb" here and closed on exit.
# returns:  list(with_geese = <matching lines>, total = <lines read>)
count_geese <- function(read_con) {
  open(read_con, "rb")
  # Close the connection even when reading stops with an error.
  on.exit(close(read_con), add = TRUE)

  line_batch_size <- 1000
  progress_interval <- line_batch_size * 1000
  geese_count <- 0
  total_count <- 0
  repeat {
    lines_read <- readLines(con = read_con, n = line_batch_size, skipNul = TRUE)
    if (length(lines_read) == 0) {
      # End of stream. Leaving before the progress check avoids a spurious
      # extra "." for empty input or a total that is an exact multiple of
      # the progress interval.
      break
    }
    # The pattern is a plain literal, so match it as fixed bytes: no regex
    # engine is needed and lines with invalid multibyte sequences are still
    # searched, like `grep` does on the command line.
    geese_count <- geese_count +
      sum(grepl("Branta canadensis", lines_read, fixed = TRUE, useBytes = TRUE))
    total_count <- total_count + length(lines_read)
    if (total_count %% progress_interval == 0) {
      cat(".")
    }
  }
  cat("done\n")
  list(with_geese = geese_count, total = total_count)
}

# Counts lines with occurrences of Canadian Geese (Branta canadensis)
# by streaming "occurrence.txt" out of the zip archive with R's unz
# (see also https://stat.ethz.ch/R-manual/R-devel/library/base/html/connections.html).
#
# Note that this fails for eBird, because "unz" cuts off the stream
# after 4GB (https://stackoverflow.com/questions/42740206/r-possible-truncation-of-4gb-file).
#
# data_location: path of the zip archive.
# returns:       the result of count_geese() for the archive member.
count_geese_unz <- function(data_location) {
  occurrence_con <- unz(description = data_location, filename = "occurrence.txt")
  count_geese(read_con = occurrence_con)
}

# Counts lines with occurrences of Canadian Geese (Branta canadensis)
# by streaming "occurrence.txt" out of the zip archive with linux unzip
# (see also https://linux.die.net/man/1/unzip).
#
# data_location: path of the zip archive.
# returns:       the result of count_geese() for the archive member.
count_geese_linux_unzip <- function(data_location) {
  # The command line is interpreted by a shell, so the path is quoted:
  # without it a location containing spaces or shell metacharacters would be
  # split into several arguments (or executed) instead of reaching unzip intact.
  unzip_command <- paste("unzip -p", shQuote(data_location), "occurrence.txt")
  read_con <- pipe(unzip_command)
  count_geese(read_con)
}

# Runs an analysis on the content identified by data_id.
#
# data_id:   content identifier handed to contentid::resolve().
# algorithm: function that is called with the resolved data location.
# returns:   whatever algorithm returns.
do_research <- function(data_id, algorithm) {
  # Look up where the content behind data_id can currently be found ...
  resolved_location <- contentid::resolve(data_id)
  # ... and hand that location to the analysis.
  algorithm(resolved_location)
}

# Re-runs the goose count on the eBird 2018 and 2019 archives and compares the
# outcome with previously established numbers. A message() is emitted for
# every number that does not match.
#
# algorithm: function taking a data location and returning a list with the
#            elements with_geese and total (see count_geese).
# returns:   NULL, invisibly.
reproduce_results <- function(algorithm) {
  ebird_2018 <- do_research("hash://sha256/29d30b566f924355a383b13cd48c3aa239d42cba0a55f4ccfc2930289b88b43c", algorithm = algorithm)

  # assumption from:
  # unzip -p ec3ff57cb48d5c41b77b5d1075738b40f598a900e8be56e7645e5a24013dffc4.dir/dwca-1.0.zip occurrence.txt | grep "Branta canadensis" | wc -l
  # NOTE(review): the directory name above carries the 2019 hash; presumably it
  # was copied from the 2019 check below -- confirm which archive produced the
  # 2018 numbers.
  # The result of the 2018 run is stored in ebird_2018 (the previous code read
  # an undefined variable here and stopped with "object not found").
  if (ebird_2018$with_geese != 4613847) {
    message("failed to reproduce geese count of eBird 2018")
  }
  if (ebird_2018$total != 361429889) {
    message("the number of lines in eBird 2018 do not add up")
  }

  ebird_2019 <- do_research("hash://sha256/ec3ff57cb48d5c41b77b5d1075738b40f598a900e8be56e7645e5a24013dffc4", algorithm = algorithm)
  # assumption from:
  # unzip -p ec3ff57cb48d5c41b77b5d1075738b40f598a900e8be56e7645e5a24013dffc4.dir/dwca-1.0.zip occurrence.txt | grep "Branta canadensis" | wc -l
  if (ebird_2019$with_geese != 7066919) {
    message("failed to reproduce geese count of eBird 2019")
  }

  if (ebird_2019$total != 561852543) {
    message("the number of lines in eBird 2019 do not add up")
  }

  invisible(NULL)
}

# Reproduces the eBird goose counts once per counting algorithm:
# first with the unz-based reader, then with the linux-unzip-based one.
reproduce_all <- function() {
  run_with <- function(algorithm) {
    reproduce_results(algorithm = algorithm)
  }
  run_with(count_geese_unz)
  run_with(count_geese_linux_unzip)
}

## outline.md

      
    Raw
  

              outline.md
            
          
    Intro:
I1. URLs specify a location, not content.
I2. You can't "download" a DOI
I3. content hashes are unique content identifiers for data
I4. idea: use content ids in analysis, register known locations of associated content in registries
Use case:
Reliably Counting Canadian Geese (the most prominent bird near Lake Merritt)
for (pseudo-)code see: https://gist.github.com/jhpoelen/19aba7c7c57d6da217ca644dc7634c02#file-count_geese-r
U1. discover eBird datasets (oops, they are all gone or behind some registration wall)
U2. use a time machine (preston) to recover, republish and register eBird 2018/2019 pubs (see versions http://doi.org/10.5281/zenodo.3858250 ) using contentid::register(...)
U3. develop your method (function count_geese)
U4. reproduce count using content_ids + contentid::resolve(...)
Conclusions:
C1. future-proof your scripts by using content-based identifiers in your scripts instead of URLs or local paths
C2. register locations of content ids anytime and anywhere (with or without time machine, suitable for embargoed/private data)
C3. use content ids!

	# Registers the republished eBird 2018 archive with contentid and returns
	# whatever contentid::register() returns for it.
	#
	# The archive used to live at
	#   http://ebirddata.ornith.cornell.edu/downloads/gbiff/dwca-1.0.zip
	# Unfortunately, that eBird URL no longer works, but, using a time machine,
	# we went back in time and republished the data via Zenodo from 2018.
	prepare_ebird_2018_id <- function() {
	  # Only the Zenodo copy is registered; the defunct eBird URL is documented
	  # above instead of being assigned and immediately overwritten.
	  ebird_data_location <- "https://zenodo.org/record/3858251/files/dwca-1.0.zip"

	  ebird_data_id <- contentid::register(ebird_data_location)
	  ebird_data_id
	}

	# Registers the republished eBird 2019 archive with contentid and returns
	# whatever contentid::register() returns for it.
	#
	# The archive used to live at
	#   http://ebirddata.ornith.cornell.edu/downloads/gbiff/dwca-1.0.zip
	# Unfortunately, that eBird URL no longer works, but, using a time machine,
	# we went back in time and republished the data via Zenodo from 2019.
	prepare_ebird_2019_id <- function() {
	  # Only the Zenodo copy is registered; the defunct eBird URL is documented
	  # above instead of being assigned and immediately overwritten.
	  ebird_data_location <- "https://zenodo.org/record/3858443/files/dwca-1.0.zip"

	  ebird_data_id <- contentid::register(ebird_data_location)
	  ebird_data_id
	}

	# Counts the lines read from a connection and how many of them mention
	# Canadian Geese ("Branta canadensis").
	#
	# The connection is read in batches so the whole stream never has to be held
	# in memory. A "." is printed for every million lines read and "done" at the
	# end.
	#
	# read_con: a connection that has not been opened yet (e.g. from unz() or
	#           pipe()); it is opened in mode "rb" here and closed on exit.
	# returns:  list(with_geese = <matching lines>, total = <lines read>)
	count_geese <- function(read_con) {
	  open(read_con, "rb")
	  # Close the connection even when reading stops with an error.
	  on.exit(close(read_con), add = TRUE)

	  line_batch_size <- 1000
	  progress_interval <- line_batch_size * 1000
	  geese_count <- 0
	  total_count <- 0
	  repeat {
	    lines_read <- readLines(con = read_con, n = line_batch_size, skipNul = TRUE)
	    if (length(lines_read) == 0) {
	      # End of stream. Leaving before the progress check avoids a spurious
	      # extra "." for empty input or a total that is an exact multiple of
	      # the progress interval.
	      break
	    }
	    # The pattern is a plain literal, so match it as fixed bytes: no regex
	    # engine is needed and lines with invalid multibyte sequences are still
	    # searched, like `grep` does on the command line.
	    geese_count <- geese_count +
	      sum(grepl("Branta canadensis", lines_read, fixed = TRUE, useBytes = TRUE))
	    total_count <- total_count + length(lines_read)
	    if (total_count %% progress_interval == 0) {
	      cat(".")
	    }
	  }
	  cat("done\n")
	  list(with_geese = geese_count, total = total_count)
	}

	# Counts lines with occurrences of Canadian Geese (Branta canadensis)
	# by streaming "occurrence.txt" out of the zip archive with R's unz
	# (see also https://stat.ethz.ch/R-manual/R-devel/library/base/html/connections.html).
	#
	# Note that this fails for eBird, because "unz" cuts off the stream
	# after 4GB (https://stackoverflow.com/questions/42740206/r-possible-truncation-of-4gb-file).
	#
	# data_location: path of the zip archive.
	# returns:       the result of count_geese() for the archive member.
	count_geese_unz <- function(data_location) {
	  occurrence_con <- unz(description = data_location, filename = "occurrence.txt")
	  count_geese(read_con = occurrence_con)
	}

	# Counts lines with occurrences of Canadian Geese (Branta canadensis)
	# by streaming "occurrence.txt" out of the zip archive with linux unzip
	# (see also https://linux.die.net/man/1/unzip).
	#
	# data_location: path of the zip archive.
	# returns:       the result of count_geese() for the archive member.
	count_geese_linux_unzip <- function(data_location) {
	  # The command line is interpreted by a shell, so the path is quoted:
	  # without it a location containing spaces or shell metacharacters would be
	  # split into several arguments (or executed) instead of reaching unzip intact.
	  unzip_command <- paste("unzip -p", shQuote(data_location), "occurrence.txt")
	  read_con <- pipe(unzip_command)
	  count_geese(read_con)
	}

	# Runs an analysis on the content identified by data_id.
	#
	# data_id:   content identifier handed to contentid::resolve().
	# algorithm: function that is called with the resolved data location.
	# returns:   whatever algorithm returns.
	do_research <- function(data_id, algorithm) {
	  # Look up where the content behind data_id can currently be found ...
	  resolved_location <- contentid::resolve(data_id)
	  # ... and hand that location to the analysis.
	  algorithm(resolved_location)
	}

	# Re-runs the goose count on the eBird 2018 and 2019 archives and compares the
	# outcome with previously established numbers. A message() is emitted for
	# every number that does not match.
	#
	# algorithm: function taking a data location and returning a list with the
	#            elements with_geese and total (see count_geese).
	# returns:   NULL, invisibly.
	reproduce_results <- function(algorithm) {
	  ebird_2018 <- do_research("hash://sha256/29d30b566f924355a383b13cd48c3aa239d42cba0a55f4ccfc2930289b88b43c", algorithm = algorithm)

	  # assumption from:
	  # unzip -p ec3ff57cb48d5c41b77b5d1075738b40f598a900e8be56e7645e5a24013dffc4.dir/dwca-1.0.zip occurrence.txt \| grep "Branta canadensis" \| wc -l
	  # NOTE(review): the directory name above carries the 2019 hash; presumably it
	  # was copied from the 2019 check below -- confirm which archive produced the
	  # 2018 numbers.
	  # The result of the 2018 run is stored in ebird_2018 (the previous code read
	  # an undefined variable here and stopped with "object not found").
	  if (ebird_2018$with_geese != 4613847) {
	    message("failed to reproduce geese count of eBird 2018")
	  }
	  if (ebird_2018$total != 361429889) {
	    message("the number of lines in eBird 2018 do not add up")
	  }

	  ebird_2019 <- do_research("hash://sha256/ec3ff57cb48d5c41b77b5d1075738b40f598a900e8be56e7645e5a24013dffc4", algorithm = algorithm)
	  # assumption from:
	  # unzip -p ec3ff57cb48d5c41b77b5d1075738b40f598a900e8be56e7645e5a24013dffc4.dir/dwca-1.0.zip occurrence.txt \| grep "Branta canadensis" \| wc -l
	  if (ebird_2019$with_geese != 7066919) {
	    message("failed to reproduce geese count of eBird 2019")
	  }

	  if (ebird_2019$total != 561852543) {
	    message("the number of lines in eBird 2019 do not add up")
	  }

	  invisible(NULL)
	}

	# Reproduces the eBird goose counts once per counting algorithm:
	# first with the unz-based reader, then with the linux-unzip-based one.
	reproduce_all <- function() {
	  run_with <- function(algorithm) {
	    reproduce_results(algorithm = algorithm)
	  }
	  run_with(count_geese_unz)
	  run_with(count_geese_linux_unzip)
	}