Created
September 30, 2019 10:00
-
-
Save mcguinlu/102b1d11a9f695ee0a89a4b2257f442f to your computer and use it in GitHub Desktop.
Script to download the PDF of each result of a search on medRxiv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download the PDF of each result of a medRxiv search.
#
# Remember to edit the search link to display all results on a single page,
# then save that results page as an HTML file and read it in below.
# Robots.txt disallows scraping of search/ paths, so saving the results page
# locally is a hacky fix; robots.txt has no problem with content/ paths.
library(stringr)
library(rvest)

# Parse the locally saved search-results page.
h <- read_html("../../Downloads/medRxiv_test.html")

# Relative link to each result's abstract page (character vector).
links <- h %>%
  html_nodes(".highwire-cite-linked-title") %>%
  html_attr("href")

# Visit each abstract page and download its PDF.
# (Was hard-coded to the first two results; now covers every result.)
for (x in seq_along(links)) {
  page <- read_html(paste0("https://www.medrxiv.org/", links[x]))

  # First author's surname — html_node() returns the first match only —
  # used to name the downloaded file.
  first_author <- page %>%
    html_node(".nlm-surname") %>%
    html_text()

  # Publication year: first 4-digit run in the posting-date pane.
  year <- page %>%
    html_node(".pane-1") %>%
    html_text() %>%
    trimws() %>%
    str_extract("[0-9]{4}")

  # Link to the full-text PDF; strip "-text" to get the plain PDF path.
  file_location <- page %>%
    html_node(".article-dl-pdf-link") %>%
    html_attr("href")
  file_location <- gsub(pattern = "-text", replacement = "", x = file_location)

  download.file(
    paste0("https://www.medrxiv.org", file_location),
    paste0(first_author, "-", year, ".pdf"),
    mode = "wb"
  )

  # Robots.txt requests a 10 s delay between requests; add a little jitter.
  Sys.sleep(runif(1, 10, 11))
}
Sign up for free to join this conversation on GitHub.
Already have an account?
Sign in to comment
Need to change this to make use of the
/archive/
folder: https://www.medrxiv.org/archive?field_highwire_a_epubdate_value%5Bvalue%5D&page=2