Skip to content

Instantly share code, notes, and snippets.

@PhillRob
Last active December 31, 2015 07:24
Show Gist options
  • Save PhillRob/e50ad5100aec62ad880c to your computer and use it in GitHub Desktop.
Save PhillRob/e50ad5100aec62ad880c to your computer and use it in GitHub Desktop.
Scrapes seed dispersal data for a species from the Kew SID website (data.kew.org/sid).
# Look up the seed dispersal entry for one species on the Kew SID website
# (data.kew.org/sid) by scraping the search page and then the record page.
#
# Args:
#   x: species name as a single string, "Genus species". It is split on " ";
#      the first token is used as the genus, the second as the species.
#
# Returns:
#   A character vector. On success: c("Genus species", <dispersal text>).
#   Otherwise c(x, "Not_found"): the search reported zero records, reported
#   more than one record, reported no readable record count, or the record
#   page says the record does not exist. The last three search outcomes also
#   raise a warning so they can be told apart from a plain miss.
#
# Side effects: performs HTTP requests through getURL() (RCurl must be
# attached) and prints the query and the result as progress output.
disp.sp <- function(x) {
  print(x)
  name.parts <- unlist(strsplit(x, " "))
  x.genus <- name.parts[1]
  x.species <- name.parts[2]
  sp.test <- getURL(paste0("http://data.kew.org/sid/SidServlet?Clade=&Order=&Family=&APG=off&Genus=", x.genus, "&Species=", x.species, "&StorBehav=0&DsFlag=on"))

  # Read the hit count as a number. A plain substring test for
  # "1 records found." would also accept "11 records found." or
  # "21 records found.", and "0 records found." would accept "10 ...".
  count.match <- regmatches(sp.test, regexpr("[0-9]+ records? found", sp.test))
  n.records <- NA_integer_
  if (length(count.match) == 1) {
    n.records <- as.integer(sub(" .*$", "", count.match))
  }

  # Default result, so that every path below returns a defined value.
  results <- c(x, "Not_found")

  if (is.na(n.records)) {
    warning("No record count found in the search page for '", x, "'",
            call. = FALSE)
  } else if (n.records > 1) {
    warning(n.records, " records found for '", x,
            "'; cannot pick a unique record", call. = FALSE)
  } else if (n.records == 1) {
    flush.console()

    # The record link looks like SidServlet?ID=<id>&Num=<num>">Genus species.
    # The id sits between "SidServlet?ID=" (14 characters) and "&Num".
    sp.id.st <- unlist(gregexpr(pattern = "SidServlet?ID=", sp.test,
                                ignore.case = FALSE, fixed = TRUE))
    sp.id.end <- unlist(gregexpr(pattern = "&Num", sp.test,
                                 ignore.case = FALSE, fixed = TRUE))
    sp.id <- unique(substr(sp.test, sp.id.st + 14, sp.id.end - 1))

    # The "&Num=..." part runs up to two characters before ">Genus species",
    # which drops the closing quote of the href attribute.
    pattern.a <- paste0(">", x.genus, " ", x.species)
    num.st <- unlist(gregexpr(pattern = "&Num=", sp.test,
                              ignore.case = FALSE, fixed = TRUE))
    num.end <- unlist(gregexpr(pattern = pattern.a, sp.test,
                               ignore.case = FALSE, fixed = TRUE))
    num.id <- unique(substr(sp.test, num.st, num.end - 2))

    sp <- getURL(paste0("http://data.kew.org/sid/SidServlet?ID=", sp.id, num.id))
    if (!grepl("This record does not exist.", sp)) {
      disp.st <- unlist(gregexpr(pattern = "</A>Seed Dispersal</B></SPAN><br>",
                                 sp, ignore.case = FALSE, fixed = TRUE))
      disp.end <- unlist(gregexpr(pattern = "/sid/dispersal.html", sp,
                                  ignore.case = FALSE, fixed = TRUE))
      # NOTE(review): the offsets 50 and 69 are tied to the markup of the
      # record page (text starts 50 characters after the heading marker and
      # ends 69 characters before the first dispersal.html link) - confirm
      # against a live page if the site layout changes.
      dispersal <- unique(substr(sp, disp.st + 50, disp.end[1] - 69))
      results <- c(paste(x.genus, x.species), dispersal)
    }
  }

  print(results)
  return(results)
}
@PhillRob
Copy link
Author

Requires the RCurl package. Feed it the names from a species list, one "Genus species" name per call.
library("RCurl", lib.loc="~/R/x86_64-pc-linux-gnu-library/3.2")

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment