Skip to content

Instantly share code, notes, and snippets.

@raymondben
Last active September 14, 2023 03:23
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save raymondben/a6eb43b30e1194e40068 to your computer and use it in GitHub Desktop.
Save raymondben/a6eb43b30e1194e40068 to your computer and use it in GitHub Desktop.
Recreating regional soundscapes from GBIF data
## Many occurrences have associated media: sound, images, or video. GBIF provides access to these media files through its portal as well as the API. Here, we reconstruct the "soundscapes" of particular regions by compiling the bird and frog sounds from those regions.
## Ben Raymond, Peter Neish
## Last-Modified: <2015-09-04 10:40:47>
## change these paths to suit your needs
setwd("~/your/path/")
cache_directory="~/your/data/cache/path/"
## required packages
require(rgbif)
require(plyr)
require(R.cache)
require(sampling)
require(jsonlite)
## We use the `R.cache` package to cache our web calls to the GBIF API. This just means that, having run a certain query once, we can re-run it without needing an internet connection. Specify the cache directory and create a cached version of the `occ_search` and `name_usage` functions:
setCacheRootPath(cache_directory)
cached_occ_search=addMemoization(occ_search)
cached_name_usage=addMemoization(name_usage)
## Specify the taxon keys we are interested in: 212 (Aves; birds) and 952 (Anura; frogs and toads)
parentKeys=c(212,952)
## Our aim is to find the sounds associated with the taxa present in a region of interest.
## Ideally we could just search for occurrences in that region that also have media linked to the occurrence records.
## Unfortunately, though, most occurrence records *do not* have associated sound media.
## Our strategy instead is to find out which taxa are present in our region of interest, and then find sound files for those same taxa (regardless of location).
## These media will almost certainly have been recorded at locations other than our region of interest, but we assume that a given species makes the same sounds worldwide --- this might not necessarily be the case for all species!
## There aren't very many occurrences of birds and frogs with sound media, so the easiest thing is just to grab the lot and filter them later according to other criteria.
temp=cached_occ_search(taxonKey=parentKeys,mediatype='Sound',limit=30000)
## reformat this for convenience into a single data frame
all_sounds=list(data=ldply(temp,function(z)z$data))
all_sounds$media=unlist(sapply(temp,function(z)z$media),recursive=FALSE)
## define a number of regions of interest
all_regions=data.frame(region="Macquarie Island",lon_min=157.9,lon_max=159.9,lat_min=-55.1,lat_max=-54.1,stringsAsFactors=FALSE)
all_regions=rbind(all_regions,data.frame(region="Bavaria",lon_min=10.3,lon_max=13.6,lat_min=47.6,lat_max=50.6))
all_regions=rbind(all_regions,data.frame(region="Bavaria (summer)",lon_min=10.3,lon_max=13.6,lat_min=47.6,lat_max=50.6))
all_regions=rbind(all_regions,data.frame(region="Bavaria (winter)",lon_min=10.3,lon_max=13.6,lat_min=47.6,lat_max=50.6))
all_regions=rbind(all_regions,data.frame(region="Costa Rica",lon_min=-85.7,lon_max=-83,lat_min=8.3,lat_max=11.0))
all_regions=rbind(all_regions,data.frame(region="Ecuador (Podocarpus National Park)",lon_min=-79.1,lon_max=-78.8,lat_min=-4.5,lat_max=-4.0))
all_regions=rbind(all_regions,data.frame(region="Scotland (Loch Lomond)",lon_min=-4.8,lon_max=-4.2,lat_min=56,lat_max=56.5))
all_regions=rbind(all_regions,data.frame(region="South Africa (Kruger National Park)",lon_min=31,lon_max=32,lat_min=-25.4,lat_max=-22.4))
all_regions=rbind(all_regions,data.frame(region="Spain (Doñana National Park)",lon_min=-6.6,lon_max=-6.2,lat_min=36.8,lat_max=37.2))
all_regions=rbind(all_regions,data.frame(region="Sri Lanka",lon_min=79.4,lon_max=82.1,lat_min=5.8,lat_max=10))
all_regions=rbind(all_regions,data.frame(region="USA (Death Valley)",lon_min=-117.6,lon_max=-116.3,lat_min=35.6,lat_max=37.0))
all_regions=rbind(all_regions,data.frame(region="Adelaide Hills",lon_min=138.6,lon_max=138.9,lat_min=-35,lat_max=-34.9))
all_regions=rbind(all_regions,data.frame(region="Yukon Delta National Wildlife Refuge",lon_min=-167.4,lon_max=-160.2,lat_min=59.7,lat_max=63.5))
all_regions=rbind(all_regions,data.frame(region="Philippines",lon_min=118.7,lon_max=126.8,lat_min=6.6,lat_max=18.7))
all_regions=rbind(all_regions,data.frame(region="Sea of Okhotsk",lon_min=140.6,lon_max=156.6,lat_min=49.2,lat_max=59.3))
all_regions$region=as.character(all_regions$region)
## loop through each region in turn
for (roi in 1:nrow(all_regions)) {
this_region=all_regions$region[roi]
cat(sprintf("%s ...\n",this_region))
this_lon=with(all_regions[roi,],c(lon_min,lon_max))
this_lat=with(all_regions[roi,],c(lat_min,lat_max))
## Grab occurrence records (up to 5000 each of birds and frogs) in our region and find the list of unique taxa
## It would be nice to be able to request just the list of taxa that meet certain search criteria but the GBIF API does not support this.
## There is a risk that we won't get all the taxa present doing it this way (unless we download *all* matching occurrences, which seems excessive)
do_by_season=grepl("(winter|summer)",this_region)
if (!do_by_season) {
this=ldply(cached_occ_search(taxonKey=parentKeys,decimalLongitude=paste(this_lon,collapse=","),decimalLatitude=paste(this_lat,collapse=","),limit=5000,return="data"),function(z)if (is.data.frame(z)) z else data.frame())
} else {
if (grepl("winter",this_region)) {
## assume northern-hemisphere seasons
months_of_interest=c(12,1,2)
} else {
months_of_interest=c(6,7,8)
}
this=ldply(months_of_interest,function(z) ldply(cached_occ_search(taxonKey=parentKeys,decimalLongitude=paste(this_lon,collapse=","),decimalLatitude=paste(this_lat,collapse=","),month=z,limit=1500,return="data"),function(z)if (is.data.frame(z)) z else data.frame()))
## can check distribution of records by month and taxonomic class with
## table(this[,c("month","class")])
}
## Now we intersect the list of taxa present in our region with those for which we have sound media
taxa_with_media=intersect(this$taxonKey,all_sounds$data$taxonKey)
this_data=subset(all_sounds$data,taxonKey %in% taxa_with_media)
temp=all_sounds$media[all_sounds$data$taxonKey %in% taxa_with_media]
this_data$audio_url=sapply(temp,function(z)as.character(z[[1]][[1]]$identifier))
this_data$audio_reference=sapply(temp,function(z){ zz=z[[1]][[1]]$references; if (is.null(zz)) "" else zz })
cat(sprintf("Found %d media items for %d species in parentKeys %s and region %s\n",nrow(this_data),length(unique(this_data$taxonKey)),paste(parentKeys,collapse=","),this_region))
## filter out some files that we don't want to use:
## many of the files from http://web.corral.tacc.utexas.edu have human voices audible, so avoid these
idx=!grepl("tacc\\.utexas",this_data$audio_url)
this_data=subset(this_data,idx)
## prions apparently make horrible noises that rather detract from the final result, so avoid these too
this_data=subset(this_data,!grepl("Pachyptila",this_data$name))
## select one media item per taxon
this_data=arrange(this_data,taxonKey)
this_data=this_data[strata(this_data,"taxonKey",rep(1,length(unique(this_data$taxonKey))),"srswor")$ID_unit,]
## retrieve photos for each taxon
temp2=cached_occ_search(taxonKey=this_data$taxonKey,mediatype = 'StillImage',return = "media", limit=1) ## max of one per taxon
temp2=ldply(temp2,function(z)if(is.list(z)) as.data.frame(z[[1]][[1]],stringsAsFactors=FALSE) else data.frame())
temp2=subset(temp2,select=c(".id","identifier","references"))
names(temp2)=c("taxonKey","image_url","image_reference")
temp2$image_reference[is.na(temp2$image_reference)]=""
this_data=merge(this_data,temp2,by="taxonKey",all.x=TRUE)
this_data=arrange(this_data,class,name) ## order by name within class
## add vernacular names
this_data$vernacularName=""
for (k in 1:nrow(this_data)) {
temp=cached_name_usage(key=this_data$taxonKey[k],data="vernacularNames")
if (!is.null(temp$data)) {
counts=table(subset(temp$data,language=="eng")$vernacularName)
if (length(counts)>0) {
this_data$vernacularName[k]=names(which.max(counts))
}
}
}
## write json file
outfile=paste0(gsub(" ","_",this_region),".json")
out_content=list(locality=this_region,lon=this_lon,lat=this_lat)
temp=subset(this_data,class=="Aves")
out_content$birds=list()
if (nrow(temp)>0) {
for (k in 1:nrow(temp)) {
out_content$birds[[length(out_content$birds)+1]]=list(name=temp$name[k],vernacularName=temp$vernacularName[k],audio=temp$audio_url[k],audio_reference=temp$audio_reference[k],image=temp$image_url[k],image_reference=temp$image_reference[k])
}
}
temp=subset(this_data,class=="Amphibia")
out_content$frogs=list()
if (nrow(temp)>0) {
for (k in 1:nrow(temp)) {
out_content$frogs[[length(out_content$frogs)+1]]=list(name=temp$name[k],vernacularName=temp$vernacularName[k],audio=temp$audio_url[k],audio_reference=temp$audio_reference[k],image=temp$image_url[k],image_reference=temp$image_reference[k])
}
}
json=toJSON(out_content,auto_unbox=TRUE)
write(json,file=outfile,append=FALSE)
## optionally write html file just for a quick test
if (FALSE) {
outfile=paste0(gsub(" ","_",this_region),".html")
write("<!DOCTYPE html>\n<html>\n<body>\n",file=outfile,append=FALSE)
write("<script>function play_all() { var a=document.getElementsByClassName(\"noisy\"); for (var i=0;i<a.length;i++) a[i].play(); }</script>",file=outfile,append=TRUE)
write("<script>function pause_all() { var a=document.getElementsByClassName(\"noisy\"); for (var i=0;i<a.length;i++) a[i].pause(); }</script>",file=outfile,append=TRUE)
write("<button type=\"button\" onclick=\"play_all()\">Play all</button> <button type=\"button\" onclick=\"pause_all()\">Pause all</button><br />",file=outfile,append=TRUE)
last_class=""
for (k in 1:nrow(this_data)) {
if (! last_class==this_data$class[k]) {
if (! last_class=="") write("</p>",file=outfile,append=TRUE)
write(sprintf("<p><strong>%s</strong></p>",this_data$class[k]),file=outfile,append=TRUE)
}
last_class=this_data$class[k]
write(sprintf("<p>%s <em>%s</em> ",this_data$vernacularName[k],this_data$name[k]),file=outfile,append=TRUE)
if (FALSE) {##(!is.na(this_data$image[k])) {
write(sprintf("<img width=\"100px\" src=\"%s\" />",this_data$image[k]),file=outfile,append=TRUE)
}
write(sprintf("<audio class=\"noisy\" controls loop autoplay><source src=\"%s\" type=\"audio/mpeg\">Your browser does not support the audio element.</audio></p>",this_data$audio[k]),file=outfile,append=TRUE)
}
write("</body>\n</html>",file=outfile,append=TRUE)
##browseURL(outfile)
}
} ## end looping regions
## For the round 2 entry, we used a slightly different data file format.
## Combine and reformat the JSON files now
x=data.frame()
ff=dir(".",pattern = "^[A-Z].*.json$")
locs=list()
for (i in 1:length(ff)) {
thisx=fromJSON(ff[i])
this_region=thisx$locality
this_season=""
if (grepl("winter",this_region)) this_season="winter"
if (grepl("summer",this_region)) this_season="summer"
this_region_bare=sub(" \\((winter|summer)\\)","",this_region)
if (length(thisx$birds)>0) {
thisx$birds$region=this_region
thisx$birds$season=this_season
thisx$birds$type="bird"
} else {
thisx$birds=data.frame()
}
if (length(thisx$frogs)>0) {
thisx$frogs$region=this_region
thisx$frogs$season=this_season
thisx$frogs$type="frog"
} else {
thisx$frogs=data.frame()
}
this_data=rbind(thisx$birds,thisx$frogs)
if (i==1) {
x=this_data
} else {
x=rbind(x,this_data)
}
if (!any(sapply(locs,function(z)z$name==this_region_bare))) {
locs[[length(locs)+1]]=list(name=this_region_bare,bounds=list(c(thisx$lat[1],thisx$lon[1]),c(thisx$lat[2],thisx$lon[2])))
}
}
## Now consolidate to one entry per taxon
## JSON formatting is a bit tedious here, to ensure proper array/string formatting (see jsonlite::unbox)
nu=unique(x$name)
jx=list()
for (ni in 1:length(nu)) {
idx=x$name==nu[ni]
this_region=sort(unique(x$region[idx]))
this_season=rep("",length(this_region))
this_season[grepl("winter",this_region)]="winter"
this_season[grepl("summer",this_region)]="summer"
this_region=sub(" \\((winter|summer)\\)","",this_region)
temp=as.list(x[which(idx)[1],])
temp$region=this_region
temp$season=this_season
## convert to JSON now so we can control unboxing
jx[[ni]]=toJSON(list(name=unbox(temp$name),vernacularName=unbox(temp$vernacularName),audio=unbox(temp$audio),audio_reference=unbox(temp$audio_reference),image=unbox(temp$image),image_reference=unbox(temp$image_reference),type=unbox(temp$type),region=temp$region,season=temp$season))
}
json=toJSON(list(localities=locs),auto_unbox=TRUE)
json=substr(json,1,nchar(json)-1) ## drop trailing }
jx=paste(jx,collapse=",")
json=paste0(json,",\"taxa\":[",jx,"]}")
write(json,file="soundscape2.json",append=FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment