#' Scrape Google Scholar search
#'
#' @description Web scrape pages of Google Scholar search results for a given search term.
#'
#' @param term String representing the Google Scholar search term. May include Boolean search operators (e.g., "AND"), word search operators (e.g., "intext:"), and symbol search operators (e.g., quotation marks " ").
#' @param pages Numeric vector of integer values representing the Google Scholar result page numbers to scrape.
#' @param crawl_delay Numeric vector of length one giving the base crawl delay in seconds (defaults to 1). This helps avoid being blocked with HTTP error 429 or served a CAPTCHA.
#' @param ... Any additional httr configs to use throughout the session. Using a proxy is advised so that your IP is not flagged and blocked.
#'
#' @return Data frame with the scraped Google Scholar search results for the given term.
#' @export
#'
#' @examples
#' # Run single query
#' test1 <- scrape_gs(term = 'intext:"psychotherapy" AND "PTSD"', pages = 1:5) # use boolean operators and exact phrasing
#' test2 <- scrape_gs(term = 'intitle:"psychotherapy" AND "PTSD"', pages = 1:5)
#' all.equal(test1, test2) # different results based on intext/intitle search operators
#'
#' # Run multiple queries using list
#' queries <- list('"psychotherapy" AND "PTSD"', '"psychotherapy" AND "trauma"')
#' pages <- list(1:2, 2:4)
#' test_list1 <- Map(list, term = queries, pages = pages)
#' # equivalent definition for multiple queries using list
#' first <- list(term = '"psychotherapy" AND "PTSD"', pages = 1:2)
#' second <- list(term = '"psychotherapy" AND "trauma"', pages = 2:4)
#' test_list2 <- list(first, second)
#' identical(test_list1, test_list2) # both definitions lead to the same structure
#'
#' test3 <- do.call(rbind, lapply(test_list2, function(x) do.call(scrape_gs, x)))
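#'
#' # Pass additional httr configs (e.g., a spoofed user agent) through `...`
#' # (illustrative sketch; the user-agent string is an arbitrary example)
#' ua <- httr::user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
#' test4 <- scrape_gs(term = '"psychotherapy" AND "PTSD"', pages = 1, crawl_delay = 2, ua)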
#'
#' @version "0.1"
#'
#' @author Claudiu C. Papasteri
#'
#' @license MIT License
scrape_gs <- function(term, pages, crawl_delay = 1, ...) {
  library(rvest)
  library(httr)

  gs_url_base <- "https://scholar.google.com/scholar"

  # Set httr configs outside of the function and pass them in through `...`; e.g.:
  # useragent <- httr::user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36")  # spoof user agent
  # proxy <- httr::use_proxy(url = "proxy.com", port = 8080, username = "dave", password = "pass", auth = "basic")
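  # These configs are then supplied in the call and forwarded via `...`, e.g. (illustrative call, not run):
  # scrape_gs(term = '"psychotherapy" AND "PTSD"', pages = 1:2, crawl_delay = 1, useragent, proxy)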
  result_list <- list()
  i <- 1

  for (n_page in (pages - 1) * 10) {  # GS page indexing starts at 0; there are 10 articles per page, see the "?start=" param
    gs_url <- paste0(gs_url_base, "?start=", n_page, "&q=", gsub("\\s+", "+", trimws(term)))

    t0 <- Sys.time()
    session <- rvest::session(gs_url, ...)  # session$config$options$useragent
    t1 <- Sys.time()
    response_delay <- as.numeric(difftime(t1, t0, units = "secs"))  # backing-off time in seconds

    wbpage <- rvest::read_html(session)

    # Avoid HTTP error 429 (too many requests) - use the crawl delay and back off proportionally to the server response time
    Sys.sleep(crawl_delay + 3 * response_delay + runif(n = 1, min = 0.5, max = 1))
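    # E.g., with crawl_delay = 1 and a response_delay of ~0.8 s, this sleeps roughly 3.9-4.4 s per page (illustrative numbers)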
    if ((i %% 10) == 0) {  # take a longer break every 10 pages
      message("taking a break")
      Sys.sleep(10 + 10 * response_delay + runif(n = 1, min = 0, max = 1))
    }
    i <- i + 1
    # Raw data
    titles <- rvest::html_text(rvest::html_elements(wbpage, ".gs_rt"))
    authors_years <- rvest::html_text(rvest::html_elements(wbpage, ".gs_a"))
    part_abstracts <- rvest::html_text(rvest::html_elements(wbpage, ".gs_rs"))
    bottom_row_nodes <- rvest::html_elements(wbpage, ".gs_fl")
    bottom_row_nodes <- bottom_row_nodes[!grepl("gs_ggs gs_fl", as.character(bottom_row_nodes), fixed = TRUE)]  # exclude nodes with this class, they are download links
    bottom_row <- rvest::html_text(bottom_row_nodes)

    # Processed data
    authors <- gsub("^(.*?)\\W+-\\W+.*", "\\1", authors_years, perl = TRUE)  # text before the first " - " separator in the byline
    years <- gsub("^.*(\\d{4}).*", "\\1", authors_years, perl = TRUE)  # last 4-digit number in the byline
    citations <- strsplit(gsub("(?!^)(?=[[:upper:]])", " ", bottom_row, perl = TRUE), " ")  # split on capital letters to isolate the "Cited by" link
    citations <- lapply(citations, "[", 3)
    n_citations <- suppressWarnings(as.numeric(sub("\\D*(\\d+).*", "\\1", citations)))  # citation count; NA if absent
    # Store in list
    result_list <- append(
      result_list,
      list(
        list(
          page = n_page / 10 + 1,
          term = term,
          title = titles,
          authors = authors,
          year = years,
          n_citations = n_citations,
          abstract = part_abstracts
        )
      )
    )
  }

  # Return as data frame
  result_df <- lapply(result_list, as.data.frame)
  result_df <- as.data.frame(do.call(rbind, result_df))
  result_df
}
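
# ------------------------------------------------------------------------------
# Post-processing sketch (illustrative, not run): the returned data frame has
# the columns page, term, title, authors, year, n_citations and abstract, so
# standard base R operations apply. The search term, page range and crawl delay
# below are arbitrary examples.
# ------------------------------------------------------------------------------
# results <- scrape_gs(term = '"psychotherapy" AND "PTSD"', pages = 1:3, crawl_delay = 2)
# results <- results[!duplicated(results$title), ]                      # drop duplicate hits across pages
# results <- results[order(results$n_citations, decreasing = TRUE), ]   # most-cited first (NAs sort last)
# head(results[, c("title", "year", "n_citations")])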