Skip to content

Instantly share code, notes, and snippets.

@Gabryxx7
Last active August 8, 2021 01:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Gabryxx7/1a35eb57d93b48f41ce1a62ce6f61b66 to your computer and use it in GitHub Desktop.
Save Gabryxx7/1a35eb57d93b48f41ce1a62ce6f61b66 to your computer and use it in GitHub Desktop.
Tidy up BibTeX references with R: adds missing DOIs (looked up from the titles) and converts each citation to the desired format (ACM in this example).
# Install the two helper packages used to pull the GitHub-hosted dependencies.
# Each helper is checked under its own name so that devtools is installed even
# when remotes is already present.
if (!requireNamespace("remotes", quietly = TRUE)) {
  install.packages("remotes")
}
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}

# GitHub-hosted packages used by the script below.
remotes::install_github("wkmor1/doi2bib", upgrade = "never")
remotes::install_github("ropensci/fulltext", upgrade = "never")
remotes::install_github("ropensci/bibtex", upgrade = "never")
devtools::install_github("quanteda/readtext", upgrade = "never")
remotes::install_github("ropensci/rcrossref", upgrade = "never")
remotes::install_github("ropensci/rplos")
remotes::install_github("ropensci/aRxiv")
# RecordLinkage is installed with the base installer; `install.packages()`
# lives in utils, not in remotes.
install.packages("RecordLinkage")
devtools::install_github("ropensci/bib2df")
library(fulltext)
library(bib2df)
# library(readtext)
library(doi2bib)
library(dplyr)
library(curl)
library(stringr)
library(RecordLinkage)
library(rcrossref)
library(aRxiv)
library(foreach)
library(doParallel)
# Input/output locations. The file names below are appended to `base_folder`
# with paste0(), so it has to end with a path separator.
base_folder <- "/Users/marinig/Documents/GitHub/R_utils/"
filename <- paste0(base_folder, "example.bib")
# bib.data <- read.bib(paste0(base_folder, filename))
out_file <- paste0(base_folder, "curltest_2.bib")
temp_file <- paste0(base_folder, "curltest_2.tmp.bib")
# print(bib.data)
file.name <- system.file("Bib", filename, package = "RefManageR")
# RefManageR is never attached by this script, so ReadBib() is called through
# its namespace, and only when the package is actually installed. `test_bib`
# is NULL otherwise.
test_bib <- if (requireNamespace("RefManageR", quietly = TRUE)) {
  RefManageR::ReadBib(filename, check = FALSE) # best way to parse
} else {
  NULL
}
#' Merge a freshly downloaded bibliography row into an existing one
#'
#' For every column of `new_bib`, the value is copied into `old_bib` when the
#' old value is missing (column absent, empty, or all `NA`). `TITLE` and
#' `AUTHOR` are additionally overwritten when `upd_title` / `upd_authors` are
#' `TRUE`. Curly braces and `|` characters are stripped from copied character
#' values; non-character values are copied unchanged so column types are kept.
#' A missing new value never replaces an existing old value.
#'
#' @param old_bib One-row data frame (or tibble) holding the existing entry.
#' @param new_bib One-row data frame (or tibble) holding the looked-up entry.
#' @param upd_title Overwrite `TITLE` even when the old one is present?
#' @param upd_authors Overwrite `AUTHOR` even when the old one is present?
#' @return `old_bib` with the selected columns filled in or replaced.
merge_bibs <- function(old_bib, new_bib, upd_title = FALSE, upd_authors = TRUE) {
  # Remove the characters matched by "[{|}]" from character data only.
  strip_braces <- function(value) {
    clean <- function(v) if (is.character(v)) gsub("[{|}]", "", v) else v
    # List columns keep their list shape; each element is cleaned separately.
    if (is.list(value)) lapply(value, clean) else clean(value)
  }

  # A value counts as missing when it is absent, empty, or entirely NA.
  is_missing <- function(value) {
    flat <- unlist(value)
    is.null(flat) || length(flat) == 0 || all(is.na(flat))
  }

  for (col in colnames(new_bib)) {
    force_update <- (col == "TITLE" && upd_title) ||
      (col == "AUTHOR" && upd_authors)

    # `[[` extracts the column itself rather than a one-column data frame.
    old_value <- if (col %in% colnames(old_bib)) old_bib[[col]] else NULL
    if (!force_update && !is_missing(old_value)) {
      next
    }

    new_value <- new_bib[[col]]
    if (is_missing(new_value)) {
      next
    }
    old_bib[[col]] <- strip_braces(new_value)
  }
  old_bib
}
#' Look up and refresh a single bibliography entry
#'
#' Takes row `index` of `bib_df`, collects candidate DOIs (the entry's own DOI,
#' or, when it has none, the results of a CrossRef and an arXiv title search),
#' downloads the BibTeX record of each candidate through `cr_cn()` and accepts
#' the first one whose title is at least 80% similar (Levenshtein similarity)
#' to the entry's title. Depending on `only_add_dois`, either just the DOI is
#' copied or the whole record is merged in with `merge_bibs()`.
#'
#' @param bib_df Data frame of bibliography entries as returned by `bib2df()`.
#' @param index Row of `bib_df` to process.
#' @param out_file Unused; kept so existing calls keep working.
#' @param temp_file Path of a scratch file the downloaded BibTeX is written to.
#' @param only_add_dois If `TRUE`, only the DOI of the matching record is
#'   copied into the entry.
#' @param is_cluster If `TRUE`, the required packages are attached first
#'   (intended for calls made on parallel workers).
#' @return The (possibly updated) one-row entry, with an extra `r_updated`
#'   column set to `"YES"` when a full record was merged in and `"NO"`
#'   otherwise.
tidyUpBib <- function(bib_df, index, out_file, temp_file, only_add_dois = FALSE, is_cluster = FALSE) {
  if (is_cluster) {
    # Attach the packages here because the caller's library() calls may not
    # have run in the process executing this function.
    library(fulltext)
    library(bib2df)
    library(doi2bib)
    library(dplyr)
    library(curl)
    library(stringr)
    library(RecordLinkage)
    library(rcrossref)
    library(aRxiv)
    library(foreach)
    library(doParallel)
  }

  # Use the `index` argument rather than whatever `i` happens to be defined in
  # the calling environment.
  bib_entry <- bib_df[index, ]
  cat("\n---- Exporting", index, " / ", nrow(bib_df), "\t", bib_entry$BIBTEXKEY, ": ", bib_entry$TITLE, "\n")
  title <- str_replace_all(bib_entry$TITLE, "[:|{|}]", "")

  # Candidate DOIs start with the entry's own DOI (if present and non-empty).
  dois <- character(0)
  if ("DOI" %in% colnames(bib_entry)) {
    dois <- str_replace_all(tolower(as.character(bib_entry$DOI)), "[{|}]", "")
    dois <- dois[!is.na(dois) & dois != ""]
  }

  if (length(dois) == 0) {
    cat(paste0("- No DOI looking for it on CrossRef and ARXIV \n"))
    resCR <- cr_works(query = title, format = "text", style = "acm", limit = 10) # https://docs.ropensci.org/rcrossref/reference/cr_works.html
    # resPlos <- ft_search(query = title, from="plos")
    resArxiv <- arxiv_search(query = noquote(paste0('ti:\"', title, '\"')), limit = 10)
    dois <- c(resArxiv$doi, resCR$data$doi)
    dois <- dois[!is.na(dois) & dois != ""]
    cat("ARXIV DOIS: ", length(resArxiv$doi), "CR DOIS: ", length(resCR$data$doi), "Total: ", length(dois))
  }

  similarity_threshold <- 0.8
  is_new <- FALSE

  # Try each candidate in turn and stop at the first sufficiently similar one.
  for (j in seq_along(dois)) {
    doi <- dois[j]
    cat("\nGetting data for DOI ", j, " of ", length(dois), ":\t", doi)
    matched <- tryCatch(
      {
        # The record is round-tripped through a file because bib2df() reads
        # from a path.
        cr_cn(dois = doi, format = "bibtex", style = "acm", locale = "en-US") %>%
          write(file = temp_file, append = FALSE)
        new_bib <- bib2df(temp_file)[1, ]
        new_bib$BIBTEXKEY <- bib_entry$BIBTEXKEY
        new_title <- str_replace_all(tolower(new_bib$TITLE), "[:|{|}]", "")
        old_title <- tolower(title)
        titleSimilarity <- levenshteinSim(new_title, old_title)
        cat("\nNew Title: ", new_bib$TITLE, "\nTitle similarity: ", titleSimilarity, "\n")

        # isTRUE() treats an NA similarity (e.g. missing title) as "no match".
        is_match <- isTRUE(titleSimilarity >= similarity_threshold)
        if (is_match) {
          if (only_add_dois) {
            cat("\nUpdating DOI...\n")
            bib_entry$DOI <- new_bib$DOI
          } else {
            bib_entry <- merge_bibs(bib_entry, new_bib)
            # bib_entry <- new_bib[1,]
            is_new <- TRUE
          }
        }
        is_match
      },
      error = function(e) {
        # A failing candidate is reported and skipped; the next one is tried.
        cat("\nError: ", conditionMessage(e))
        FALSE
      }
    )
    if (matched) {
      break
    }
  }

  # Always record the outcome so every returned row has the same columns.
  if (is_new) {
    cat("\nWriting NEW bib...\n")
    bib_entry$r_updated <- "YES"
  } else {
    cat("\nWriting OLD bib...\n")
    bib_entry$r_updated <- "NO"
  }
  bib_entry
}
# Driver: tidy every entry of the input file, either sequentially or on a
# local cluster, and write the result next to the input.
USE_MULTI_THREAD <- FALSE
if (!USE_MULTI_THREAD) {
  start_time <- Sys.time()
  bib_df <- bib2df(filename)
  cat("\nTotal refs: ", nrow(bib_df))
  # Process every row (seq_len() is also safe for an empty file) instead of a
  # hard-coded row range.
  bib_list_result <- foreach(i = seq_len(nrow(bib_df))) %do% {
    tidyUpBib(bib_df, i, out_file, temp_file, FALSE, FALSE)
  }
  # data.table is not attached by this script, so call rbindlist() through its
  # namespace. fill = TRUE pads columns that only some rows have.
  bib_df_result <- data.table::rbindlist(bib_list_result, fill = TRUE)
  df2bib(bib_df_result, file = paste0(base_folder, "new_bib_single.bib"))
  end_time <- Sys.time()
  # format() keeps the time unit that difftime picked (secs, mins, ...).
  cat("Single threaded execution time: ", format(end_time - start_time), "\n")
} else {
  start_time <- Sys.time()
  bib_df <- bib2df(filename)
  # setup parallel backend to use many processors: leave one core free, but
  # always start at least one worker (detectCores() may return NA or 1).
  cores <- detectCores()
  n_workers <- if (is.na(cores)) 1L else max(1L, cores - 1L)
  cl <- makeCluster(n_workers)
  registerDoParallel(cl)
  bib_list_result <- tryCatch(
    # No .combine: the per-row results are collected in a list and bound
    # below, which tolerates rows with differing columns. merge_bibs() is
    # only referenced inside tidyUpBib(), so it is exported explicitly.
    foreach(i = seq_len(nrow(bib_df)), .export = "merge_bibs") %dopar% {
      # Each task gets its own scratch file so workers do not overwrite each
      # other's downloads.
      worker_temp_file <- tempfile(fileext = ".bib")
      updated_bib <- tidyUpBib(bib_df, i, out_file, worker_temp_file, FALSE, TRUE) # calling a function
      unlink(worker_temp_file)
      cat("Finished ", i, " / ", nrow(bib_df))
      # do other things if you want
      updated_bib
    },
    # stop cluster, even when a task failed
    finally = stopCluster(cl)
  )
  bib_df_result <- data.table::rbindlist(bib_list_result, fill = TRUE)
  df2bib(bib_df_result, file = paste0(base_folder, "new_bib_multi.bib"))
  end_time <- Sys.time()
  cat("Multi threaded execution time: ", format(end_time - start_time), "\n")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment