Created
September 30, 2019 10:00
-
-
Save mcguinlu/102b1d11a9f695ee0a89a4b2257f442f to your computer and use it in GitHub Desktop.
Script to download the PDF of each result of a search on medRxiv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download the PDF of each result of a medRxiv search.
#
# Remember to edit the search link to display all results on a single page,
# then save that results page as an HTML file and read it in below.
# Robots.txt disallows scraping of search/ paths, so saving the results page
# locally is a hacky fix; robots.txt has no problem with content/ paths.
library(stringr)
library(rvest)

# Parse the locally saved search-results page.
h <- read_html("../../Downloads/medRxiv_test.html")

# Relative link to each result's abstract page (character vector).
links <- h %>%
  html_nodes(".highwire-cite-linked-title") %>%
  html_attr("href")

# Visit each abstract page and download its PDF.
# (Was hard-coded to the first two results; now covers every result.)
for (x in seq_along(links)) {
  page <- read_html(paste0("https://www.medrxiv.org/", links[x]))

  # First author's surname — html_node() returns the first match only —
  # used to name the downloaded file.
  first_author <- page %>%
    html_node(".nlm-surname") %>%
    html_text()

  # Publication year: first 4-digit run in the posting-date pane.
  year <- page %>%
    html_node(".pane-1") %>%
    html_text() %>%
    trimws() %>%
    str_extract("[0-9]{4}")

  # Link to the full-text PDF; strip "-text" to get the plain PDF path.
  file_location <- page %>%
    html_node(".article-dl-pdf-link") %>%
    html_attr("href")
  file_location <- gsub(pattern = "-text", replacement = "", x = file_location)

  download.file(
    paste0("https://www.medrxiv.org", file_location),
    paste0(first_author, "-", year, ".pdf"),
    mode = "wb"
  )

  # Robots.txt requests a 10 s delay between requests; add a little jitter.
  Sys.sleep(runif(1, 10, 11))
}
Sign up for free to join this conversation on GitHub.
Already have an account?
Sign in to comment
Need to change this to make use of the
/archive/
folder: https://www.medrxiv.org/archive?field_highwire_a_epubdate_value%5Bvalue%5D&page=2