Created
July 2, 2020 14:29
-
-
Save mcguinlu/f77568128720e61bac1380b228bec33b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Attach the packages this script relies on: dplyr supplies the `%>%` pipe and
# data-frame verbs, medrxivr supplies mx_search().
library(dplyr)
library(medrxivr)
#' Download bioRxiv preprint metadata via the bioRxiv API
#'
#' Modified version of `mx_api_content()`. Queries the bioRxiv "details"
#' endpoint for the requested date interval, walks through the result pages
#' and combines them into a single data frame.
#'
#' @param from.date Start of the date interval; pasted into the request URL
#'   as-is (default `"2013-01-01"`).
#' @param to.date End of the date interval; pasted into the request URL as-is
#'   (default: today's date).
#' @param clean If `TRUE`, add a running `node` id, drop the `type` and
#'   `server` columns, add `link` and `pdf` paths built from `doi` and
#'   `version`, and title-case `category`, `authors` and
#'   `author_corresponding`.
#' @param include.info If `TRUE`, append the API's `messages` metadata of the
#'   first request to every row of the result.
#'
#' @return A data frame with one row per record returned by the API.
bx_api_content <- function(from.date = "2013-01-01",
                           to.date = Sys.Date(),
                           clean = TRUE,
                           include.info = FALSE) {
  # The paging arithmetic assumes the API serves 100 records per request.
  page_size <- 100

  # Render numbers without scientific notation: pasting the double 100000
  # directly would yield "1e+05" and corrupt the request URL.
  as_plain <- function(x) sprintf("%.0f", x)

  # Create baseline link
  base_link <- paste0(
    "https://api.biorxiv.org/details/biorxiv/",
    from.date,
    "/",
    to.date
  )

  # Download and parse the page starting at record offset `cursor`, using the
  # same retry/timeout settings for every request.
  fetch_page <- function(cursor) {
    response <- httr::RETRY(
      verb = "GET",
      url = paste0(base_link, "/", as_plain(cursor)),
      times = 3,
      httr::timeout(30)
    )
    httr::stop_for_status(response, task = "query the bioRxiv API")
    jsonlite::fromJSON(
      httr::content(response, as = "text", encoding = "UTF-8")
    )
  }

  details <- fetch_page(0)

  # Check the API actually reported a record count. The total is looked up by
  # name, falling back to the sixth column that was originally read by
  # position.
  info <- details$messages
  total <- info[["total"]]
  if (is.null(total) && is.data.frame(info) && ncol(info) >= 6) {
    total <- info[[6]]
  }
  count <- if (is.null(total)) {
    NA_real_
  } else {
    suppressWarnings(as.numeric(total[1]))
  }
  if (is.na(count)) {
    stop(
      "The bioRxiv API did not report a total record count (status: ",
      paste(info[["status"]], collapse = "; "),
      ")",
      call. = FALSE
    )
  }
  message("Total number of records found: ", as_plain(count))

  # One offset per page. ceiling() avoids a trailing empty request when the
  # total is an exact multiple of the page size.
  n_pages <- max(1, ceiling(count / page_size))
  cursors <- (seq_len(n_pages) - 1) * page_size

  # Get data
  message("Starting extraction from API")
  pages <- lapply(seq_along(cursors), function(i) {
    cursor <- cursors[i]
    message(
      "Extracting records ", as_plain(cursor + 1),
      " to ", as_plain(min(cursor + page_size, count)),
      " of ", as_plain(count)
    )
    # The first page was already downloaded to read the record count.
    if (i == 1) {
      details$collection
    } else {
      fetch_page(cursor)$collection
    }
  })

  # Combine all pages once instead of growing a data frame inside the loop.
  df <- dplyr::bind_rows(pages)

  # Clean data
  if (isTRUE(clean)) {
    df$node <- seq_len(nrow(df))
    df <- dplyr::select(df, -dplyr::any_of(c("type", "server")))
    df$link <- paste0("/content/", df$doi, "v", df$version, "?versioned=TRUE")
    df$pdf <- paste0("/content/", df$doi, "v", df$version, ".full.pdf")
    df$category <- stringr::str_to_title(df$category)
    df$authors <- stringr::str_to_title(df$authors)
    df$author_corresponding <- stringr::str_to_title(df$author_corresponding)
  }

  if (isTRUE(include.info)) {
    # Repeat the metadata row(s) so they line up with every record in `df`.
    info_rows <- dplyr::slice(
      details$messages,
      rep(seq_len(dplyr::n()), each = nrow(df))
    )
    df <- cbind(df, info_rows)
  }

  df
}
# Fetch a copy of the bioRxiv repository using the default arguments.
# Be patient: this takes a long time!
bx_data <- bx_api_content()

# Hand the downloaded data to mx_search(), which will work as normal.
results <- mx_search(query = "test", data = bx_data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment