Create a gist now

Instantly share code, notes, and snippets.

Embed
What would you like to do?
# Best practices for web scraping in R
# The scrape() function below is meant to be used with plyr::ldply,
# which applies it to each URL and row-binds the resulting data frames.
# eg:
ldply(urls, scrape)
# Wrap scrape() in try() so broken links / unresponsive pages are
# skipped instead of aborting the whole run.
# NOTE: `next` is only valid inside a loop; inside a function passed to
# ldply it raises "no loop for break/next". Returning NULL makes ldply
# silently drop the failed URL instead.
# eg:
ldply(urls, function(url) {
  out <- try(scrape(url))
  if (inherits(out, "try-error")) {
    return(NULL)
  }
  out
})
# Insert a random sleep interval between requests to avoid being
# rate-limited or banned by the target server.
# eg:
ldply(urls, function(url) {
  out <- try(scrape(url))
  # Return NULL (not `next`, which errors outside a loop) so ldply
  # drops failed URLs from the combined result.
  if (inherits(out, "try-error")) {
    return(NULL)
  }
  # Pause 1–3 seconds (millisecond granularity) before the next request.
  Sys.sleep(sample(seq(1, 3, by = 0.001), 1))
  out
})
#' Scrape a single URL into a one-row data frame.
#'
#' Downloads the page, parses it into an HTML tree, and returns a
#' data.frame whose first column is the URL. Extracted fields should be
#' added as further columns where indicated below.
#'
#' @param url A single URL (character scalar).
#' @return A data.frame with column `url` (plus any extracted fields).
scrape <- function(url) {
  # Load required packages, installing any that are missing.
  # (library() errors on failure, unlike require().)
  for (pkg in c("XML", "RCurl", "plyr", "stringr")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
    library(pkg, character.only = TRUE)
  }
  df <- data.frame(url = url, stringsAsFactors = FALSE)
  # Download the page; fall back to readLines() if getURL() fails.
  html <- try(getURL(df$url))
  if (inherits(html, "try-error")) {
    html <- readLines(df$url, warn = FALSE)
  }
  tree <- htmlTreeParse(html, useInternalNodes = TRUE)
  #@@@@@@@@@@@@@@@@@@@@#
  # ENTER XPATH HERE, e.g.:
  #   df$title <- xpathSApply(tree, "//title", xmlValue)
  #$$$$$$$$$$$$$$$$$$$$#
  data.frame(df, stringsAsFactors = FALSE)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment