Skip to content

Instantly share code, notes, and snippets.

@mcguinlu
Created July 2, 2020 14:29
Show Gist options
  • Save mcguinlu/f77568128720e61bac1380b228bec33b to your computer and use it in GitHub Desktop.
Save mcguinlu/f77568128720e61bac1380b228bec33b to your computer and use it in GitHub Desktop.
library(dplyr)
library(medrxivr)
#' Download bioRxiv preprint metadata from the bioRxiv API
#'
#' Modified version of `mx_api_content()` that queries the
#' `details/biorxiv` endpoint of `api.biorxiv.org` and pages through the
#' results 100 records at a time.
#'
#' @param from.date Start of the date range. Pasted verbatim into the request
#'   URL; the default uses the "YYYY-MM-DD" form.
#' @param to.date End of the date range, pasted verbatim into the request
#'   URL. Defaults to today's date.
#' @param clean If `TRUE`, add a `node` row index, drop the `type` and
#'   `server` columns, add `link` and `pdf` columns built from `doi` and
#'   `version`, and title-case `category`, `authors` and
#'   `author_corresponding`.
#' @param include.info If `TRUE`, append the API's `messages` metadata
#'   (taken from the first response) to every row of the result.
#'
#' @return A data frame with one row per record returned by the API. If the
#'   API reports zero records, an empty data frame is returned.
bx_api_content <- function(from.date = "2013-01-01",
                           to.date = Sys.Date(),
                           clean = TRUE,
                           include.info = FALSE) {
  # Number of records the loop advances by per request.
  page_size <- 100

  # Render numbers without scientific notation. paste0(100000) yields
  # "1e+05", which would produce an invalid cursor in the request URL.
  fmt <- function(x) format(x, scientific = FALSE, trim = TRUE)

  # Create baseline link
  base_link <- paste0(
    "https://api.biorxiv.org/details/biorxiv/",
    from.date,
    "/",
    to.date
  )

  # Fetch and parse one page of results starting at record offset `offset`.
  # Every request uses the same retry/timeout policy and fails loudly on an
  # HTTP error instead of trying to parse an error page as JSON.
  fetch_page <- function(offset) {
    response <- httr::RETRY(
      verb = "GET",
      url = paste0(base_link, "/", fmt(offset)),
      times = 3,
      httr::timeout(30)
    )
    httr::stop_for_status(response)
    jsonlite::fromJSON(
      httr::content(response, as = "text", encoding = "UTF-8")
    )
  }

  first_page <- fetch_page(0)
  info <- first_page$messages

  # The total record count is read from the 6th column of the first row of
  # `messages` (as in the original implementation).
  # NOTE(review): positional lookup; confirm the column order against the
  # current API response.
  if (!is.data.frame(info) || nrow(info) < 1 || ncol(info) < 6) {
    stop(
      "Unexpected response from the bioRxiv API: record count not found.",
      call. = FALSE
    )
  }
  count <- suppressWarnings(as.numeric(info[1, 6]))
  if (is.na(count)) {
    stop(
      "Unexpected response from the bioRxiv API: record count is not numeric.",
      call. = FALSE
    )
  }
  message("Total number of records found: ", fmt(count))

  if (count <= 0 || !is.data.frame(first_page$collection)) {
    return(data.frame())
  }

  # ceiling() rather than floor() + 1 so that a total that is an exact
  # multiple of the page size does not trigger an extra, empty request.
  offsets <- seq(
    from = 0,
    by = page_size,
    length.out = ceiling(count / page_size)
  )

  # Get data
  message("Starting extraction from API")
  pages <- lapply(seq_along(offsets), function(i) {
    offset <- offsets[[i]]
    message(
      "Extracting records ", fmt(offset + 1),
      " to ", fmt(min(offset + page_size, count)),
      " of ", fmt(count)
    )
    # The first page was already downloaded to obtain the record count.
    if (i == 1L) {
      first_page$collection
    } else {
      fetch_page(offset)$collection
    }
  })

  # Drop pages that came back without a tabular collection, then bind once
  # instead of growing the data frame inside the loop.
  pages <- Filter(is.data.frame, pages)
  df <- do.call(rbind, pages)

  # Clean data
  if (isTRUE(clean)) {
    df$node <- seq_len(nrow(df))
    df <- df[setdiff(names(df), c("type", "server"))]
    df$link <- paste0("/content/", df$doi, "v", df$version, "?versioned=TRUE")
    df$pdf <- paste0("/content/", df$doi, "v", df$version, ".full.pdf")
    df$category <- stringr::str_to_title(df$category)
    df$authors <- stringr::str_to_title(df$authors)
    df$author_corresponding <- stringr::str_to_title(df$author_corresponding)
  }

  if (isTRUE(include.info)) {
    # Repeat the first metadata row once per record so it can be bound
    # column-wise; row names are reset so they do not leak into the result.
    info_rows <- info[rep(1L, nrow(df)), , drop = FALSE]
    rownames(info_rows) <- NULL
    df <- cbind(df, info_rows)
  }

  df
}
# Fetch a copy of the bioRxiv repository using the default arguments.
# Expect this step to run for a long time!
bx_data <- bx_api_content()

# The downloaded data can then be handed to mx_search(), which will work
# as normal.
results <- mx_search(data = bx_data, query = "test")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment