library(dplyr)
library(readr)
library(rgbif)
library(taxize)
library(CoordinateCleaner)
# 4 Odonata scientific names
name_list = c(
  "Cordulegaster charpentieri Kolenati, 1846",
  "Cordulegaster talaria Tennessen, 2004",
  "Calopteryx splendens Harris, 1780",
  "Epiophlebia laidlawi Tillyard, 1921"
)
# match names to GBIF taxonkeys
gbif_taxon_keys = name_list %>%
  taxize::get_gbifid_(method = "backbone") %>% # match each name against the GBIF backbone
  bind_rows() %>%
  filter(matchtype == "EXACT" & status == "ACCEPTED") %>% # keep only exact matches to accepted names
  filter(order == "Odonata") %>% # remove anything that might have matched to a non-dragonfly
  pull(usagekey) # get the GBIF taxonkeys
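# Optional check (sketch): inspect the full match table before filtering, to
# spot names that only matched FUZZY or resolved to a synonym; the columns
# selected here come from the GBIF species-match API via get_gbifid_()
name_list %>%
  taxize::get_gbifid_(method = "backbone") %>%
  bind_rows() %>%
  select(scientificname, matchtype, status, order) %>%
  glimpse()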
user="jwaller" # your GBIF user name
pwd="" # your GBIF password
email="jwaller@gbif.org" # your email
gbif_download_key = occ_download(
  type = "and",
  pred_in("taxonKey", gbif_taxon_keys),
  pred("hasGeospatialIssue", FALSE),
  pred("hasCoordinate", TRUE),
  format = "SIMPLE_CSV",
  user = user, pwd = pwd, email = email
)
# <<gbif download>>
# Username: jwaller
# E-mail: jwaller@gbif.org
# Format: SIMPLE_CSV
# Download key: 0253330-200613084148143
## Wait ~10-15 min; the download must finish on GBIF's side before the next part will work
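# Optional (sketch): let the script wait for GBIF to finish preparing the
# download instead of pausing manually; occ_download_wait() polls the
# download status until it is done
rgbif::occ_download_wait(gbif_download_key)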
gbif_download_key = "0253330-200613084148143"
path_to_download = "C:/Users/ftw712/Desktop/"
# download the file to your machine
rgbif::occ_download_get(gbif_download_key, path = path_to_download, overwrite = FALSE)
# It is sometimes easier to just grab the download from your GBIF user profile:
# https://www.gbif.org/user/download
# You can do that "manually"; the goal here is just a script that runs without stopping
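# Related (sketch): check the status and metadata of a download from R rather
# than on the website; occ_download_meta() takes the download key
rgbif::occ_download_meta(gbif_download_key)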
zip_file = paste0(path_to_download, gbif_download_key, ".zip")
extract_dir = paste0(path_to_download, gbif_download_key)
unzip(zip_file, exdir = extract_dir) # unzip the download into its own folder
# read in the download. data.table::fread() is recommended to avoid parsing errors that sometimes occur with other csv readers
gbif_download = data.table::fread(paste0(path_to_download, gbif_download_key, "/", gbif_download_key, ".csv")) %>%
  glimpse()
# Post-processing the GBIF download
gbif_clean_data = gbif_download %>%
  setNames(tolower(names(.))) %>% # lowercase column names to work with CoordinateCleaner
  filter(occurrencestatus == "PRESENT") %>% # keep presence records only
  filter(!is.na(decimallongitude)) %>% # drop records without coordinates
  filter(!is.na(decimallatitude)) %>%
  filter(!basisofrecord %in% c("FOSSIL_SPECIMEN","LIVING_SPECIMEN")) %>% # drop fossils and captive/cultivated records
  filter(!establishmentmeans %in% c("MANAGED", "INTRODUCED", "INVASIVE", "NATURALISED")) %>% # drop non-wild records
  filter(year >= 1900) %>% # drop very old records
  filter(coordinateprecision < 0.01 | is.na(coordinateprecision)) %>% # keep precise coordinates (< ~1 km) or unknown precision
  filter(coordinateuncertaintyinmeters < 10000 | is.na(coordinateuncertaintyinmeters)) %>% # keep uncertainty < 10 km or unknown
  filter(!coordinateuncertaintyinmeters %in% c(301, 3036, 999, 9999)) %>% # drop known default/placeholder uncertainty values
  filter(!(decimallatitude == 0 & decimallongitude == 0)) %>% # drop points at exactly (0,0)
  cc_cen(buffer = 2000) %>% # remove country centroids within 2km
  cc_cap(buffer = 2000) %>% # remove capital centroids within 2km
  cc_inst(buffer = 2000) %>% # remove zoo and herbaria within 2km
  cc_sea() %>% # remove points in the ocean
  distinct(decimallongitude, decimallatitude, specieskey, datasetkey, .keep_all = TRUE) %>% # remove duplicates; this removes a lot of records!
  glimpse() # look at the results of the pipeline
# 168,593 records before cleaning
# 54,938 records after cleaning
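# Sketch: re-check the before/after counts and save the cleaned table;
# the output filename is an assumption, not part of the original workflow
nrow(gbif_download) # records in the raw download
nrow(gbif_clean_data) # records surviving the cleaning pipeline
readr::write_tsv(gbif_clean_data, paste0(path_to_download, "odonata_clean.tsv"))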