Get large amounts of GBIF.org occurrence data
#Download occurrence data from GBIF - for few to many spp.
#3/25/2019 KG Turner, with assistance from S. Chamberlain, rOpenSci.org
#Register at gbif.org. You will need that email address, username, and password in the following script.
#NB: DON'T COMMIT YOUR PASSWORDS.
# R version 3.5.3 (2019-03-11)
library(rgbif) #1.2.0
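####optional: keep credentials out of the script####
#A minimal sketch using base R only: store GBIF_USER, GBIF_PWD, and GBIF_EMAIL in
#your ~/.Renviron file and read them at runtime instead of typing passwords inline.
#(These variable names are this sketch's convention, not anything rgbif requires.)
gbif_user  <- Sys.getenv("GBIF_USER")
gbif_pwd   <- Sys.getenv("GBIF_PWD")
gbif_email <- Sys.getenv("GBIF_EMAIL")
#then pass user = gbif_user, pwd = gbif_pwd, email = gbif_email to occ_download()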
####make spp. list####
smallList <- c("Chorispora tenella", "Centaurea diffusa", "Lupinus texensis")
#resolve names (including synonyms) to GBIF taxon keys - takes the first suggested match
smallList_key <- sapply(smallList, function(x) name_suggest(x)$key[1], USE.NAMES = FALSE)
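#An alternative sketch: name_backbone() returns the single best GBIF backbone match,
#which can be safer than taking the first name_suggest() hit unchecked. (usageKey is
#the backbone match's key field; spot-check matches for critical species.)
smallList_key2 <- sapply(smallList, function(x) name_backbone(name = x)$usageKey,
                         USE.NAMES = FALSE)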
####GBIF request for a single list (at most 100 spp.)####
#you will need to register with GBIF to get a username etc.
occ_download(paste0("taxonKey = ", paste0(smallList_key, collapse = ",")),
             "basisOfRecord = PRESERVED_SPECIMEN,LITERATURE",
             "hasCoordinate = true",
             "hasGeospatialIssue = false",
             user = "***",
             pwd = "***",
             email = "***"
)
###for the 3-species test list:
# <<gbif download>>
# Username: ***
# E-mail: ***
# Download key: 0002607-190320150433242
#check status of request, using the download key returned above
occ_download_meta(key = "0002607-190320150433242")
# <<gbif download metadata>>
# Status: PREPARING
# Format: DWCA
# Download key: 0002607-190320150433242
# Created: 2019-03-25T20:15:22.624+0000
# Modified: 2019-03-25T20:15:43.495+0000
# Download link: http://api.gbif.org/v1/occurrence/download/request/0002607-190320150433242.zip
# Total records: 1706
# Request:
#   type: and
#   predicates:
#     > type: or
#       predicates:
#         - type: equals, key: TAXON_KEY, value: 3044349
#         - type: equals, key: TAXON_KEY, value: 3128962
#         - type: equals, key: TAXON_KEY, value: 2963880
#     > type: or
#       predicates:
#         - type: equals, key: BASIS_OF_RECORD, value: PRESERVED_SPECIMEN
#         - type: equals, key: BASIS_OF_RECORD, value: LITERATURE
#     > type: equals, key: HAS_COORDINATE, value: true
#     > type: equals, key: HAS_GEOSPATIAL_ISSUE, value: false
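#Once Status reaches SUCCEEDED, fetch and read the archive. A minimal sketch
#(the path is an assumption; any writable directory works):
dl <- occ_download_get("0002607-190320150433242", path = ".", overwrite = TRUE)
dat <- occ_download_import(dl)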
####For larger numbers of species...####
#break up long species key lists into ~100 spp. lists
#GBIF restrictions: URL call limit of 12K characters; limit of 3 requests at a time.
####large vector splitting function####
library(plyr) #1.8.4
plyrChunks <- function(d, n){
  #chunk start positions, every n elements
  is <- seq(from = 1, to = length(d), by = n)
  #make sure the last position is a break point
  if(tail(is, 1) != length(d)) {
    is <- c(is, length(d))
  }
  #slice d between consecutive break points
  chunks <- llply(head(seq_along(is), -1),
                  function(i){
                    start <- is[i]
                    end <- is[i+1] - 1
                    d[start:end]
                  })
  #the slices above stop one element short of the end, so append the final element
  lc <- length(chunks)
  td <- tail(d, 1)
  chunks[[lc]] <- c(chunks[[lc]], td)
  return(chunks)
}
#plyrChunks(d = vector, n = size_of_chunks)
d <- 1:2030
n <- 100
chunkList <- plyrChunks(d,n) #gives list of smaller vectors
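#A quick sanity check (a sketch using base R): the chunks should jointly cover d
length(chunkList)                    #number of chunks, ~21 for 2030 keys
sum(lengths(chunkList)) == length(d) #TRUE if nothing was dropped or duplicated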
#break a key vector of more than 300 spp. into a list of smaller vector chunks
#(bigList_key stands in for your own long vector of taxon keys)
chunkList <- plyrChunks(bigList_key, 100) #make ~100 spp. sub-lists for a really long species list
####GBIF request queueing function for >300 spp.####
#for fewer than 300 spp., it is faster to call them individually as above
#GBIF will only accept three requests from a single user at a time.
#This function queues your requests and submits request #4 when one of #1-3 finishes.
#For 400-500 spp., specify occ_download() calls within the occ_download_queue() call
output <- occ_download_queue(
  occ_download('taxonKey = 3119195', "year = 1976",
               user = "***", pwd = "***", email = "***"),
  occ_download('taxonKey = 3119195', "year = 2001", "month <= 8",
               user = "***", pwd = "***", email = "***"),
  occ_download("country = NZ", "year = 1999", "month = 3",
               user = "***", pwd = "***", email = "***"),
  occ_download("catalogNumber = Bird.27847588", "year = 1998", "month = 2",
               user = "***", pwd = "***", email = "***")
)
#download data (save the handles so the archives can be imported below)
downloads <- lapply(output, occ_download_get)
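#Read each archive into R (a sketch): occ_download_import() parses the objects
#returned by occ_download_get() directly into data frames.
data_list <- lapply(downloads, occ_download_import)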
####run gbif queueing for a really large spp. list####
#loop through many occ_download() calls, one queued request per chunk
#input: a list of short key vectors, i.e. chunkList above
#NB: output is overwritten each iteration, so collect the results as you go
queued <- list()
for (i in seq_along(chunkList)){
  output <- occ_download_queue(
    occ_download(paste0("taxonKey = ", paste0(chunkList[[i]], collapse = ",")),
                 "basisOfRecord = PRESERVED_SPECIMEN,LITERATURE",
                 "hasCoordinate = true",
                 "hasGeospatialIssue = false",
                 user = "***",
                 pwd = "***",
                 email = "***")
  )
  queued <- c(queued, output)
  print(output)
}
#check metadata for all queued requests
metas <- lapply(queued, occ_download_meta)
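#A sketch (the status field name is assumed from the GBIF API response): pull out
#just the status string to see which queued requests have finished.
sapply(metas, function(m) m$status) #e.g. "PREPARING", "RUNNING", "SUCCEEDED"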