@rmarrotte
Created June 7, 2016 18:24
######################
## Web scraping ##
######################
# Clear workspace
rm(list=ls())
# Load libraries
require(RCurl)
require(XML)
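# Both packages are on CRAN; if either require() call returns FALSE, something like
# install.packages(c("RCurl", "XML")) should install them (assuming a CRAN mirror is configured)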
# Example articles
titles <- c("Landscape genetics and the spatial distribution of chronic wasting disease",
"Landscape‐genetic analysis of population structure in the Texas gray fox oral rabies vaccination zone",
"The subtle role of climate change on population genetic structure in Canada lynx",
"Gene flow and functional connectivity in the natterjack toad",
"Optimizing dispersal and corridor models using landscape genetics",
"Defining population structure for the Mojave desert tortoise",
"Ecological factors drive differentiation in wolves from British Columbia",
"Wolverine gene flow across a narrow climatic niche",
"Puerto Rico and Florida manatees represent genetically distinct groups",
"Circuit theory predicts gene flow in plant and animal populations")
hits <- c()
for(i in 1:length(titles)){
  print("________________")
  # Replace white space with + (Google also accepts %20)
  search.term <- gsub(pattern = ' ', replacement = '+', titles[i])
  # Add double quotes at the extremities for an exact-phrase search
  search.term <- paste('%22', search.term, '%22', sep='')
  # Restrict the search to PDFs hosted on government domains
  search.term <- paste(search.term,"filetype:pdf+site:*.gov.*+OR+site:.gov.+OR+site:*.gc.*+OR+site:*.gouv.*",sep="+")
  # Google search URL
  googleURL <- paste('http://www.google.ca/search?q=', search.term, sep='')
  print(googleURL)
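  # For the first title, the query built above looks like this (shown for illustration;
  # it follows directly from the gsub/paste calls, nothing else is added):
  # http://www.google.ca/search?q=%22Landscape+genetics+and+the+spatial+distribution+of+chronic+wasting+disease%22+filetype:pdf+site:*.gov.*+OR+site:.gov.+OR+site:*.gc.*+OR+site:*.gouv.*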
  # Make output var for the URLs
  URLs <- c()
  # Remember the base search URL so the &start= page offset can be appended cleanly below
  baseURL <- googleURL
  # Keep looping while the maximum number of results is larger than the number of URLs found,
  # i.e. keep turning Google search pages while there are still pages.
  while(length(URLs) < 200){ # Usually there is a limit to the number of URLs
    # A random wait time so Google doesn't get mad
    Sys.sleep(time = runif(n = 1, min = 5, max = 60))
    # Make search
    doc <- getURL(googleURL, httpheader = c("User-Agent" = "R(3.10.0)"))
    # If Google gets mad, change the local IP (Windows-only ipconfig commands),
    # wait 10 minutes and reconnect; give up after 10 attempts
    round <- 0
    while(grepl(x = doc, pattern = "302 Moved") & round < 10){
      print("Google is mad, waiting 10 mins")
      system("ipconfig /release", wait = TRUE)
      Sys.sleep(time = 600)
      system("ipconfig /renew", wait = TRUE)
      doc <- getURL(googleURL, httpheader = c("User-Agent" = "R(3.10.0)"))
      round <- round + 1
    }
    if(round == 10 & grepl(x = doc, pattern = "302 Moved")){
      stop("Google won't let us scrape")
    }
    rm(round)
    # If no results, break the loop; URLs stays empty and nothing is returned for this title
    if(grepl(x = doc, pattern = "No results found for ", ignore.case = T) |
       grepl(x = doc, pattern = "did not match any documents", ignore.case = T)){
      break
    }else{
      # Parse the Google results page and pull the link out of each result heading
      # (the raw hrefs are kept as-is here; see the href-cleaning sketch after the script)
      html <- htmlTreeParse(doc, useInternalNodes = TRUE, error = function(...){})
      nodes <- getNodeSet(html, "//h3[@class='r']//a")
      results <- sapply(nodes, function(x) xmlAttrs(x)[["href"]])
      # Check whether any of these new URLs were already found, but only if URLs is not empty
      if(!is.null(URLs)){
        test <- any(results %in% URLs)
        # If there are repeats, the pages have started to cycle, so stop adding
        if(test){
          break
        }
        rm(test)
      }
      # If nothing is abnormal, add the results to URLs
      URLs <- c(URLs, results)
    }
    # A full results page holds 10 links; fewer means this was the last page, so stop
    if(length(results) < 10){
      break
    }
    # Turn to the next Google search page
    googleURL <- paste(baseURL, "&start=", length(URLs), sep = "")
    # Clean up
    rm(results, doc, html, nodes)
  }
  # What we found
  print(URLs)
  # Add to output
  hits <- c(hits, length(URLs))
  # Clean up again
  rm(search.term, googleURL, baseURL, URLs)
}
rm(i)
data.frame(titles,hits)
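
# The hrefs scraped above are taken straight from Google's result page. In my experience
# they are often redirect links of the form "/url?q=<target>&..." rather than the target
# itself; that is an assumption about Google's markup, not something the script above
# checks. A minimal base-R sketch of how one might strip that wrapper from the scraped
# links (clean_google_href is a hypothetical helper, not part of the original script):
clean_google_href <- function(href){
  # Drop the leading "/url?q=" wrapper if it is present
  href <- sub("^/url\\?q=", "", href)
  # Drop any trailing Google tracking parameters after the first "&"
  href <- sub("&.*$", "", href)
  # Undo percent-encoding, e.g. %3F back to "?"
  URLdecode(href)
}
# Example: clean_google_href("/url?q=https://www.example.gov/paper.pdf&sa=U")
# should give "https://www.example.gov/paper.pdf"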