@rmarrotte
Created June 7, 2016 18:24
######################
## Web scraping ##
######################
# Clear workspace
rm(list=ls())
# Load libraries
require(RCurl)
require(XML)
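# Both packages are on CRAN; if either require() call returns FALSE, something like
# install.packages(c("RCurl", "XML")) should install them (assuming a CRAN mirror is configured)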
# Example articles
titles <- c("Landscape genetics and the spatial distribution of chronic wasting disease",
"Landscape‐genetic analysis of population structure in the Texas gray fox oral rabies vaccination zone",
"The subtle role of climate change on population genetic structure in Canada lynx",
"Gene flow and functional connectivity in the natterjack toad",
"Optimizing dispersal and corridor models using landscape genetics",
"Defining population structure for the Mojave desert tortoise",
"Ecological factors drive differentiation in wolves from British Columbia",
"Wolverine gene flow across a narrow climatic niche",
"Puerto Rico and Florida manatees represent genetically distinct groups",
"Circuit theory predicts gene flow in plant and animal populations")
hits <- c()
for(i in 1:length(titles)){
  print("________________")
  # Replace white space with + (Google also accepts %20)
  search.term <- gsub(pattern = ' ', replacement = '+', titles[i])
  # Add double quotes at the extremities for an exact-phrase search
  search.term <- paste('%22', search.term, '%22', sep='')
  # Restrict the search to PDFs hosted on government domains
  search.term <- paste(search.term,"filetype:pdf+site:*.gov.*+OR+site:.gov.+OR+site:*.gc.*+OR+site:*.gouv.*",sep="+")
  # Google search URL
  googleURL <- paste('http://www.google.ca/search?q=', search.term, sep='')
  print(googleURL)
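  # For the first title, the query built above looks like this (shown for illustration;
  # it follows directly from the gsub/paste calls, nothing else is added):
  # http://www.google.ca/search?q=%22Landscape+genetics+and+the+spatial+distribution+of+chronic+wasting+disease%22+filetype:pdf+site:*.gov.*+OR+site:.gov.+OR+site:*.gc.*+OR+site:*.gouv.*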
  # Make output var for the URLs
  URLs <- c()
  # Remember the base search URL so the &start= page offset can be appended cleanly below
  baseURL <- googleURL
  # Keep looping while the maximum number of results is larger than the number of URLs found,
  # i.e. keep turning Google search pages while there are still pages.
  while(length(URLs) < 200){ # Usually there is a limit to the number of URLs
    # A random wait time so Google doesn't get mad
    Sys.sleep(time = runif(n = 1, min = 5, max = 60))
    # Make search
    doc <- getURL(googleURL, httpheader = c("User-Agent" = "R(3.10.0)"))
    # If Google gets mad, change the local IP (Windows-only ipconfig commands),
    # wait 10 minutes and reconnect; give up after 10 attempts
    round <- 0
    while(grepl(x = doc, pattern = "302 Moved") & round < 10){
      print("Google is mad, waiting 10 mins")
      system("ipconfig /release", wait = TRUE)
      Sys.sleep(time = 600)
      system("ipconfig /renew", wait = TRUE)
      doc <- getURL(googleURL, httpheader = c("User-Agent" = "R(3.10.0)"))
      round <- round + 1
    }
    if(round == 10 & grepl(x = doc, pattern = "302 Moved")){
      stop("Google won't let us scrape")
    }
    rm(round)
    # If no results, break the loop; URLs stays empty and nothing is returned for this title
    if(grepl(x = doc, pattern = "No results found for ", ignore.case = T) |
       grepl(x = doc, pattern = "did not match any documents", ignore.case = T)){
      break
    }else{
      # Parse the Google results page and pull the link out of each result heading
      # (the raw hrefs are kept as-is here; see the href-cleaning sketch after the script)
      html <- htmlTreeParse(doc, useInternalNodes = TRUE, error = function(...){})
      nodes <- getNodeSet(html, "//h3[@class='r']//a")
      results <- sapply(nodes, function(x) xmlAttrs(x)[["href"]])
      # Check whether any of these new URLs were already found, but only if URLs is not empty
      if(!is.null(URLs)){
        test <- any(results %in% URLs)
        # If there are repeats, the pages have started to cycle, so stop adding
        if(test){
          break
        }
        rm(test)
      }
      # If nothing is abnormal, add the results to URLs
      URLs <- c(URLs, results)
    }
    # A full results page holds 10 links; fewer means this was the last page, so stop
    if(length(results) < 10){
      break
    }
    # Turn to the next Google search page
    googleURL <- paste(baseURL, "&start=", length(URLs), sep = "")
    # Clean up
    rm(results, doc, html, nodes)
  }
  # What we found
  print(URLs)
  # Add to output
  hits <- c(hits, length(URLs))
  # Clean up again
  rm(search.term, googleURL, baseURL, URLs)
}
rm(i)
data.frame(titles,hits)
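
# The hrefs scraped above are taken straight from Google's result page. In my experience
# they are often redirect links of the form "/url?q=<target>&..." rather than the target
# itself; that is an assumption about Google's markup, not something the script above
# checks. A minimal base-R sketch of how one might strip that wrapper from the scraped
# links (clean_google_href is a hypothetical helper, not part of the original script):
clean_google_href <- function(href){
  # Drop the leading "/url?q=" wrapper if it is present
  href <- sub("^/url\\?q=", "", href)
  # Drop any trailing Google tracking parameters after the first "&"
  href <- sub("&.*$", "", href)
  # Undo percent-encoding, e.g. %3F back to "?"
  URLdecode(href)
}
# Example: clean_google_href("/url?q=https://www.example.gov/paper.pdf&sa=U")
# should give "https://www.example.gov/paper.pdf"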