######################
##   Web scraping   ##
######################

# Clear workspace
rm(list = ls())

# Load libraries
require(RCurl)
require(XML)
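
# Both packages are on CRAN; if they are not already installed:
# install.packages(c("RCurl", "XML"))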
# Example articles
titles <- c("Landscape genetics and the spatial distribution of chronic wasting disease",
            "Landscape‐genetic analysis of population structure in the Texas gray fox oral rabies vaccination zone",
            "The subtle role of climate change on population genetic structure in Canada lynx",
            "Gene flow and functional connectivity in the natterjack toad",
            "Optimizing dispersal and corridor models using landscape genetics",
            "Defining population structure for the Mojave desert tortoise",
            "Ecological factors drive differentiation in wolves from British Columbia",
            "Wolverine gene flow across a narrow climatic niche",
            "Puerto Rico and Florida manatees represent genetically distinct groups",
            "Circuit theory predicts gene flow in plant and animal populations")
hits <- c()
for(i in 1:length(titles)){
  print("________________")
  # Replace whitespace with + (Google also accepts %20)
  search.term <- gsub(pattern = ' ', replacement = '+', titles[i])
  # Add double quotes (%22) at the extremities for an exact-phrase match
  search.term <- paste('%22', search.term, '%22', sep = '')
  # Restrict the search to PDFs hosted on government domains
  search.term <- paste(search.term, "filetype:pdf+site:*.gov.*+OR+site:.gov.+OR+site:*.gc.*+OR+site:*.gouv.*", sep = "+")
  # Google search URL; keep the base URL so we can page through results later
  baseURL <- paste('http://www.google.ca/search?q=', search.term, sep = '')
  googleURL <- baseURL
  print(googleURL)
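  # For illustration, the first title yields a URL of roughly this shape
  # (truncated here):
  # http://www.google.ca/search?q=%22Landscape+genetics+and+the+spatial+distribution+of+chronic+wasting+disease%22+filetype:pdf+...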
  # Output vector for the URLs we find
  URLs <- c()
  # Keep looping while the number of URLs found is below the cap;
  # in other words, keep turning Google result pages while there are still pages.
  while(length(URLs) < 200){ # There is usually a limit to the number of URLs served
    # A random wait time so Google doesn't get mad
    Sys.sleep(time = runif(n = 1, min = 5, max = 60))
    # Make the search request
    doc <- getURL(googleURL, httpheader = c("User-Agent" = "R(3.10.0)"))
    # If Google gets mad (serves a "302 Moved" redirect), release and renew the
    # local IP, wait 10 minutes, and retry (note: ipconfig is Windows-only)
    round <- 0
    while(grepl(x = doc, pattern = "302 Moved") & round < 10){
      print("Google is mad, waiting 10 mins")
      system("ipconfig /release", wait = T)
      Sys.sleep(time = 600)
      system("ipconfig /renew", wait = T)
      doc <- getURL(googleURL, httpheader = c("User-Agent" = "R(3.10.0)"))
      round <- round + 1
    }
    # Give up after 10 failed retries
    if(round == 10 & grepl(x = doc, pattern = "302 Moved")){
      stop("Google won't let us scrape")
    }
    rm(round)
    # If there are no results, break the loop; URLs stays empty
    if(grepl(x = doc, pattern = "No results found for ", ignore.case = T) |
       grepl(x = doc, pattern = "did not match any documents", ignore.case = T)){
      break
    }else{
      # Parse the Google results page
      html <- htmlTreeParse(doc, useInternalNodes = TRUE, error = function(...){})
      # Result links are anchors nested under <h3 class="r">
      nodes <- getNodeSet(html, "//h3[@class='r']//a")
      results <- sapply(nodes, function(x) xmlAttrs(x)[["href"]])
      # Check whether any of the new URLs were already collected (only if URLs is not empty)
      if(!is.null(URLs)){
        test <- any(unlist(lapply(X = results, FUN = function(X){any(X == URLs)})))
        # Repeats mean Google is recycling pages, so stop adding and break
        if(test){
          break
        }
        rm(test)
      }
      # If nothing is abnormal, add the new results to URLs
      URLs <- c(URLs, results)
    }
    # A full results page holds up to 10 hits; fewer than 10 means this was
    # the last page, so stop the search
    if(length(results) < 10){
      break
    }
    # Turn to the next Google results page
    googleURL <- paste(baseURL, "&start=", length(URLs), sep = "")
    # Clean up
    rm(results, doc, html, nodes)
  }
  # What we found
  print(URLs)
  # Add the hit count to the output
  hits <- c(hits, length(URLs))
  # Clean up again
  rm(search.term, baseURL, googleURL, URLs)
}
rm(i)
# Per-title hit counts
data.frame(titles, hits)
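
# A minimal follow-up sketch (the filename is an assumption, not from the
# original script): persist the per-title hit counts for later analysis
write.csv(data.frame(titles, hits), "gov_pdf_hits.csv", row.names = FALSE)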