-
-
Save raymondben/a6eb43b30e1194e40068 to your computer and use it in GitHub Desktop.
Recreating regional soundscapes from GBIF data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Many occurrences have associated media: sound, images, or video. GBIF provides
## access to these media files through its portal as well as the API. Here, we
## reconstruct the "soundscapes" of particular regions by compiling the bird and
## frog sounds from those regions.
## Ben Raymond, Peter Neish
## Last-Modified: <2015-09-04 10:40:47>

## change these paths to suit your needs
setwd("~/your/path/")
cache_directory <- "~/your/data/cache/path/"

## required packages
## library() (not require()) so a missing package errors immediately instead of
## warning and letting the script fail later with a confusing message
library(rgbif)
library(plyr)
library(R.cache)
library(sampling)
library(jsonlite)

## We use the `R.cache` package to cache our web calls to the GBIF API. This just
## means that, having run a certain query once, we can re-run it without needing
## an internet connection. Specify the cache directory and create cached
## (memoized) versions of the `occ_search` and `name_usage` functions:
setCacheRootPath(cache_directory)
cached_occ_search <- addMemoization(occ_search)
cached_name_usage <- addMemoization(name_usage)

## Specify the taxon keys we are interested in:
## 212 (Aves; birds) and 952 (Anura; frogs and toads)
parentKeys <- c(212, 952)
## Our aim is to find the sounds associated with the taxa present in a region of interest.
## Ideally we could just search for occurrences in that region that also have media linked to the occurrence records.
## Unfortunately, though, most occurrence records *do not* have associated sound media.
## Our strategy instead is to find out which taxa are present in our region of interest, and then find sound files for those same taxa (regardless of location).
## These media will almost certainly have been recorded at locations other than our region of interest, but we assume that a given species makes the same sounds worldwide --- this might not necessarily be the case for all species!
## There aren't very many occurrences of birds and frogs with sound media, so the easiest thing is just to grab the lot and filter them later according to other criteria.
## NOTE(review): with a vector taxonKey, occ_search returns one list element per
## key, each with $data and $media components -- confirm against the installed
## rgbif version, as this structure has changed between releases.
temp=cached_occ_search(taxonKey=parentKeys,mediatype='Sound',limit=30000)
## reformat this for convenience into a single data frame
## (ldply row-binds the per-taxon $data data frames into one)
all_sounds=list(data=ldply(temp,function(z)z$data))
## keep the media records as one flat list, assumed parallel to the rows of
## all_sounds$data (they are subset with the same logical index later on)
all_sounds$media=unlist(sapply(temp,function(z)z$media),recursive=FALSE)
## Define the regions of interest: a region name plus a lon/lat bounding box
## (min/max longitude and latitude) for each. Built in one vectorized
## data.frame() call; the resulting frame is identical to rbind-ing the rows
## one at a time.
all_regions <- data.frame(
    region=c("Macquarie Island", "Bavaria", "Bavaria (summer)",
             "Bavaria (winter)", "Costa Rica",
             "Ecuador (Podocarpus National Park)", "Scotland (Loch Lomond)",
             "South Africa (Kruger National Park)",
             "Spain (Doñana National Park)", "Sri Lanka", "USA (Death Valley)",
             "Adelaide Hills", "Yukon Delta National Wildlife Refuge",
             "Philippines", "Sea of Okhotsk"),
    lon_min=c(157.9, 10.3, 10.3, 10.3, -85.7, -79.1, -4.8, 31, -6.6, 79.4,
              -117.6, 138.6, -167.4, 118.7, 140.6),
    lon_max=c(159.9, 13.6, 13.6, 13.6, -83, -78.8, -4.2, 32, -6.2, 82.1,
              -116.3, 138.9, -160.2, 126.8, 156.6),
    lat_min=c(-55.1, 47.6, 47.6, 47.6, 8.3, -4.5, 56, -25.4, 36.8, 5.8, 35.6,
              -35, 59.7, 6.6, 49.2),
    lat_max=c(-54.1, 50.6, 50.6, 50.6, 11.0, -4.0, 56.5, -22.4, 37.2, 10,
              37.0, -34.9, 63.5, 18.7, 59.3),
    stringsAsFactors=FALSE)
## ensure region is plain character (not factor), as the loop below relies on it
all_regions$region <- as.character(all_regions$region)
## Loop through each region in turn. For each region we:
##  1. find the taxa recorded there (via a bounded occurrence search),
##  2. match those taxa against our pool of sound media,
##  3. pick one sound and one photo per taxon,
##  4. look up English vernacular names,
##  5. write a per-region JSON file.
## seq_len() (not 1:nrow()) so an empty region table gives zero iterations.
for (roi in seq_len(nrow(all_regions))) {
    this_region <- all_regions$region[roi]
    cat(sprintf("%s ...\n", this_region))
    this_lon <- with(all_regions[roi,], c(lon_min, lon_max))
    this_lat <- with(all_regions[roi,], c(lat_min, lat_max))
    ## Grab occurrence records (up to 5000 each of birds and frogs) in our region and find the list of unique taxa
    ## It would be nice to be able to request just the list of taxa that meet certain search criteria but the GBIF API does not support this.
    ## There is a risk that we won't get all the taxa present doing it this way (unless we download *all* matching occurrences, which seems excessive)
    do_by_season <- grepl("(winter|summer)", this_region)
    if (!do_by_season) {
        this <- ldply(cached_occ_search(taxonKey=parentKeys, decimalLongitude=paste(this_lon, collapse=","), decimalLatitude=paste(this_lat, collapse=","), limit=5000, return="data"), function(z) if (is.data.frame(z)) z else data.frame())
    } else {
        if (grepl("winter", this_region)) {
            ## assume northern-hemisphere seasons
            months_of_interest <- c(12, 1, 2)
        } else {
            months_of_interest <- c(6, 7, 8)
        }
        this <- ldply(months_of_interest, function(m) ldply(cached_occ_search(taxonKey=parentKeys, decimalLongitude=paste(this_lon, collapse=","), decimalLatitude=paste(this_lat, collapse=","), month=m, limit=1500, return="data"), function(z) if (is.data.frame(z)) z else data.frame()))
        ## can check distribution of records by month and taxonomic class with
        ## table(this[,c("month","class")])
    }
    ## Now we intersect the list of taxa present in our region with those for which we have sound media
    taxa_with_media <- intersect(this$taxonKey, all_sounds$data$taxonKey)
    this_data <- subset(all_sounds$data, taxonKey %in% taxa_with_media)
    ## the media list is parallel to all_sounds$data, so subset it with the same mask
    temp <- all_sounds$media[all_sounds$data$taxonKey %in% taxa_with_media]
    this_data$audio_url <- sapply(temp, function(z) as.character(z[[1]][[1]]$identifier))
    this_data$audio_reference <- sapply(temp, function(z) { zz <- z[[1]][[1]]$references; if (is.null(zz)) "" else zz })
    cat(sprintf("Found %d media items for %d species in parentKeys %s and region %s\n", nrow(this_data), length(unique(this_data$taxonKey)), paste(parentKeys, collapse=","), this_region))
    ## filter out some files that we don't want to use:
    ## many of the files from http://web.corral.tacc.utexas.edu have human voices audible, so avoid these
    idx <- !grepl("tacc\\.utexas", this_data$audio_url)
    this_data <- subset(this_data, idx)
    ## prions apparently make horrible noises that rather detract from the final result, so avoid these too
    this_data <- subset(this_data, !grepl("Pachyptila", this_data$name))
    if (nrow(this_data) < 1) {
        ## no usable sound media for this region: skip it rather than erroring
        ## in the strata()/occ_search() calls below, which require >=1 row
        warning(sprintf("no usable sound media for region %s; skipping", this_region))
        next
    }
    ## select one media item per taxon: simple random sampling without
    ## replacement (srswor) of one row within each taxonKey stratum
    this_data <- arrange(this_data, taxonKey)
    this_data <- this_data[strata(this_data, "taxonKey", rep(1, length(unique(this_data$taxonKey))), "srswor")$ID_unit,]
    ## retrieve photos for each taxon
    temp2 <- cached_occ_search(taxonKey=this_data$taxonKey, mediatype='StillImage', return="media", limit=1) ## max of one per taxon
    temp2 <- ldply(temp2, function(z) if (is.list(z)) as.data.frame(z[[1]][[1]], stringsAsFactors=FALSE) else data.frame())
    temp2 <- subset(temp2, select=c(".id", "identifier", "references"))
    names(temp2) <- c("taxonKey", "image_url", "image_reference")
    temp2$image_reference[is.na(temp2$image_reference)] <- ""
    this_data <- merge(this_data, temp2, by="taxonKey", all.x=TRUE)
    this_data <- arrange(this_data, class, name) ## order by name within class
    ## add vernacular names: the most frequent English name, if any
    ## seq_len() so a zero-row frame yields no iterations (1:0 would give c(1,0)
    ## and fire an API call with an NA taxonKey)
    this_data$vernacularName <- ""
    for (k in seq_len(nrow(this_data))) {
        temp <- cached_name_usage(key=this_data$taxonKey[k], data="vernacularNames")
        if (!is.null(temp$data)) {
            counts <- table(subset(temp$data, language=="eng")$vernacularName)
            if (length(counts) > 0) {
                this_data$vernacularName[k] <- names(which.max(counts))
            }
        }
    }
    ## write json file: one object per taxon, split into birds and frogs
    outfile <- paste0(gsub(" ", "_", this_region), ".json")
    out_content <- list(locality=this_region, lon=this_lon, lat=this_lat)
    temp <- subset(this_data, class=="Aves")
    out_content$birds <- list()
    for (k in seq_len(nrow(temp))) {
        out_content$birds[[length(out_content$birds)+1]] <- list(name=temp$name[k], vernacularName=temp$vernacularName[k], audio=temp$audio_url[k], audio_reference=temp$audio_reference[k], image=temp$image_url[k], image_reference=temp$image_reference[k])
    }
    temp <- subset(this_data, class=="Amphibia")
    out_content$frogs <- list()
    for (k in seq_len(nrow(temp))) {
        out_content$frogs[[length(out_content$frogs)+1]] <- list(name=temp$name[k], vernacularName=temp$vernacularName[k], audio=temp$audio_url[k], audio_reference=temp$audio_reference[k], image=temp$image_url[k], image_reference=temp$image_reference[k])
    }
    json <- toJSON(out_content, auto_unbox=TRUE)
    write(json, file=outfile, append=FALSE)
    ## optionally write html file just for a quick test
    if (FALSE) {
        outfile <- paste0(gsub(" ", "_", this_region), ".html")
        write("<!DOCTYPE html>\n<html>\n<body>\n", file=outfile, append=FALSE)
        write("<script>function play_all() { var a=document.getElementsByClassName(\"noisy\"); for (var i=0;i<a.length;i++) a[i].play(); }</script>", file=outfile, append=TRUE)
        write("<script>function pause_all() { var a=document.getElementsByClassName(\"noisy\"); for (var i=0;i<a.length;i++) a[i].pause(); }</script>", file=outfile, append=TRUE)
        write("<button type=\"button\" onclick=\"play_all()\">Play all</button> <button type=\"button\" onclick=\"pause_all()\">Pause all</button><br />", file=outfile, append=TRUE)
        last_class <- ""
        for (k in seq_len(nrow(this_data))) {
            ## emit a class heading each time the taxonomic class changes
            if (! last_class==this_data$class[k]) {
                if (! last_class=="") write("</p>", file=outfile, append=TRUE)
                write(sprintf("<p><strong>%s</strong></p>", this_data$class[k]), file=outfile, append=TRUE)
            }
            last_class <- this_data$class[k]
            write(sprintf("<p>%s <em>%s</em> ", this_data$vernacularName[k], this_data$name[k]), file=outfile, append=TRUE)
            if (FALSE) {##(!is.na(this_data$image_url[k])) {
                ## NOTE: columns are audio_url/image_url (the original dead code
                ## referenced nonexistent $audio and $image columns)
                write(sprintf("<img width=\"100px\" src=\"%s\" />", this_data$image_url[k]), file=outfile, append=TRUE)
            }
            write(sprintf("<audio class=\"noisy\" controls loop autoplay><source src=\"%s\" type=\"audio/mpeg\">Your browser does not support the audio element.</audio></p>", this_data$audio_url[k]), file=outfile, append=TRUE)
        }
        write("</body>\n</html>", file=outfile, append=TRUE)
        ##browseURL(outfile)
    }
} ## end looping regions
## For the round 2 entry, we used a slightly different data file format.
## Combine and reformat the per-region JSON files now into:
##   x    - one data frame row per (taxon, region) pair
##   locs - list of distinct localities (season suffix stripped) with bounds
## regex: \\. so that only real .json files match (the original unescaped dot
## matched any character before "json")
x <- data.frame()
ff <- dir(".", pattern="^[A-Z].*\\.json$")
locs <- list()
for (i in seq_along(ff)) {  ## seq_along: safe when no files were found
    thisx <- fromJSON(ff[i])
    this_region <- thisx$locality
    ## season is encoded in the region name, e.g. "Bavaria (winter)"
    this_season <- ""
    if (grepl("winter", this_region)) this_season <- "winter"
    if (grepl("summer", this_region)) this_season <- "summer"
    this_region_bare <- sub(" \\((winter|summer)\\)", "", this_region)
    ## tag each taxon row with its region/season/type; empty groups become
    ## empty data frames so the rbind below is always valid
    if (length(thisx$birds) > 0) {
        thisx$birds$region <- this_region
        thisx$birds$season <- this_season
        thisx$birds$type <- "bird"
    } else {
        thisx$birds <- data.frame()
    }
    if (length(thisx$frogs) > 0) {
        thisx$frogs$region <- this_region
        thisx$frogs$season <- this_season
        thisx$frogs$type <- "frog"
    } else {
        thisx$frogs <- data.frame()
    }
    ## rbind to the (possibly empty) accumulator; no special-casing of i==1
    ## needed since rbind(data.frame(), df) is just df
    x <- rbind(x, thisx$birds, thisx$frogs)
    ## record each distinct bare locality once, with its bounding box.
    ## vapply (not sapply) so an empty locs list yields character(0) rather
    ## than an empty list, keeping the %in% test well-typed.
    if (!this_region_bare %in% vapply(locs, function(z) z$name, character(1))) {
        locs[[length(locs)+1]] <- list(name=this_region_bare, bounds=list(c(thisx$lat[1], thisx$lon[1]), c(thisx$lat[2], thisx$lon[2])))
    }
}
## Now consolidate to one entry per taxon: a taxon seen in several regions gets
## a single record whose region/season fields are JSON arrays.
## JSON formatting is a bit tedious here, to ensure proper array/string
## formatting (see jsonlite::unbox): scalar fields are unboxed to JSON strings,
## region/season stay as arrays.
nu <- unique(x$name)
jx <- vector("list", length(nu))  ## preallocate instead of growing in the loop
for (ni in seq_along(nu)) {  ## seq_along: safe if nu is empty
    idx <- x$name == nu[ni]
    ## sorted unique regions for this taxon, with the season split back out of
    ## the "(winter)"/"(summer)" suffix in the region name
    this_region <- sort(unique(x$region[idx]))
    this_season <- rep("", length(this_region))
    this_season[grepl("winter", this_region)] <- "winter"
    this_season[grepl("summer", this_region)] <- "summer"
    this_region <- sub(" \\((winter|summer)\\)", "", this_region)
    ## per-taxon fields (name, audio, image, ...) come from the first matching row
    temp <- as.list(x[which(idx)[1],])
    temp$region <- this_region
    temp$season <- this_season
    ## convert to JSON now so we can control unboxing
    jx[[ni]] <- toJSON(list(name=unbox(temp$name), vernacularName=unbox(temp$vernacularName), audio=unbox(temp$audio), audio_reference=unbox(temp$audio_reference), image=unbox(temp$image), image_reference=unbox(temp$image_reference), type=unbox(temp$type), region=temp$region, season=temp$season))
}
## splice the taxa array into the localities JSON by hand
json <- toJSON(list(localities=locs), auto_unbox=TRUE)
json <- substr(json, 1, nchar(json)-1) ## drop trailing }
jx <- paste(jx, collapse=",")
json <- paste0(json, ",\"taxa\":[", jx, "]}")
write(json, file="soundscape2.json", append=FALSE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment