Last active
August 8, 2021 01:41
-
-
Save Gabryxx7/1a35eb57d93b48f41ce1a62ce6f61b66 to your computer and use it in GitHub Desktop.
Tidy up BibTeX references with R: adds missing DOIs looked up from the titles and converts each citation to the desired format (I'm using ACM).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ---- Install dependencies ----
# Bootstrap the two installer packages. Each guard checks for the package it
# is about to install, so devtools is installed even when remotes is present.
if (!requireNamespace("remotes", quietly = TRUE)) {
  install.packages("remotes")
}
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}

# GitHub-hosted packages.
remotes::install_github("wkmor1/doi2bib", upgrade = "never")
remotes::install_github("ropensci/fulltext", upgrade = "never")
remotes::install_github("ropensci/bibtex", upgrade = "never")
devtools::install_github("quanteda/readtext", upgrade = "never")
remotes::install_github("ropensci/rcrossref", upgrade = "never")
remotes::install_github("ropensci/rplos")
remotes::install_github("ropensci/aRxiv")

# install.packages() lives in utils, not in remotes.
if (!requireNamespace("RecordLinkage", quietly = TRUE)) {
  install.packages("RecordLinkage")
}
devtools::install_github("ropensci/bib2df")
library(fulltext) | |
library(bib2df) | |
# library(readtext) | |
library(doi2bib) | |
library(dplyr) | |
library(curl) | |
library(stringr) | |
library(RecordLinkage) | |
library(rcrossref) | |
library(aRxiv) | |
library(foreach) | |
library(doParallel) | |
# ---- Paths and input ----
# All input/output files live in `base_folder` (note the trailing slash, which
# the paste0() calls below rely on).
base_folder <- "/Users/marinig/Documents/GitHub/R_utils/"
filename <- paste0(base_folder, "example.bib")
# bib.data <- read.bib(paste0(base_folder, filename))
out_file <- paste0(base_folder, "curltest_2.bib")
temp_file <- paste0(base_folder, "curltest_2.tmp.bib")
# print(bib.data)

# system.file() looks the path up inside the installed RefManageR package and
# yields "" when nothing matches; the value is not used further down.
file.name <- system.file("Bib", filename, package = "RefManageR")

# RefManageR is never attached with library(), so ReadBib() has to be called
# through its namespace. The parsed object is only a sanity check of the input
# file, so a missing package degrades to a warning instead of stopping the run.
test_bib <- if (requireNamespace("RefManageR", quietly = TRUE)) {
  RefManageR::ReadBib(filename, check = FALSE) # best way to parse
} else {
  warning("Package 'RefManageR' is not installed; skipping ReadBib() check.",
          call. = FALSE)
  NULL
}
#' Merge a freshly fetched bibliography row into an existing one.
#'
#' For every column of `new_bib`, the value is copied into `old_bib` when the
#' old value is missing (column absent, empty, or all `NA`). TITLE and AUTHOR
#' can additionally be force-overwritten via the flags. Copied character
#' values have the characters `{`, `|` and `}` removed.
#'
#' @param old_bib One-row data frame holding the current entry.
#' @param new_bib One-row data frame holding the fetched entry.
#' @param upd_title If `TRUE`, always overwrite TITLE with the new value.
#' @param upd_authors If `TRUE`, always overwrite AUTHOR with the new value.
#' @return `old_bib` with the selected columns replaced or added.
merge_bibs <- function(old_bib, new_bib, upd_title = FALSE, upd_authors = TRUE) {
  # Remove brace/pipe characters from character data only. List columns are
  # cleaned element-wise so multi-value cells are not deparsed into a single
  # string, and non-character columns keep their type.
  strip_braces <- function(value) {
    if (is.list(value)) {
      return(lapply(value, strip_braces))
    }
    if (is.character(value)) {
      return(gsub("[{|}]", "", value))
    }
    value
  }

  # A cell counts as missing when it holds nothing but NA (or nothing at all).
  is_missing <- function(value) {
    if (is.list(value)) {
      value <- unlist(value)
    }
    length(value) == 0L || all(is.na(value))
  }

  for (col in colnames(new_bib)) {
    force_update <- (col == "TITLE" && upd_title) ||
      (col == "AUTHOR" && upd_authors)
    # `[[` extracts the column itself; a column absent from old_bib is
    # treated as missing instead of raising a subsetting error.
    current <- if (col %in% colnames(old_bib)) old_bib[[col]] else NULL
    if (force_update || is_missing(current)) {
      old_bib[[col]] <- strip_braces(new_bib[[col]])
    }
  }
  old_bib
}
#' Refresh one bibliography entry from its DOI.
#'
#' Takes row `index` of `bib_df`. When the row has no DOI, candidate DOIs are
#' searched by title through `cr_works()` and `arxiv_search()`. Each candidate
#' is then fetched as BibTeX with `cr_cn()`, parsed with `bib2df()`, and
#' accepted when the fetched title is at least 80% similar (`levenshteinSim`)
#' to the original one. The first accepted candidate wins.
#'
#' @param bib_df Data frame of bibliography entries (as produced by `bib2df()`).
#' @param index Row of `bib_df` to process.
#' @param out_file Not used by this function; kept for interface compatibility.
#' @param temp_file Scratch file the fetched BibTeX is written to before it is
#'   parsed. Ignored when `is_cluster` is `TRUE` (see below).
#' @param only_add_dois If `TRUE`, only the DOI field is copied from the
#'   accepted candidate; otherwise the rows are merged with `merge_bibs()`.
#' @param is_cluster Set to `TRUE` when running on a parallel worker: the
#'   required packages are attached and a private scratch file is used.
#' @return The (possibly updated) one-row entry. When at least one DOI was
#'   available, the column `r_updated` is set to "YES" or "NO".
tidyUpBib <- function(bib_df, index, out_file, temp_file, only_add_dois = FALSE, is_cluster = FALSE) {
  if (is_cluster) {
    library(fulltext)
    library(bib2df)
    library(doi2bib)
    library(dplyr)
    library(curl)
    library(stringr)
    library(RecordLinkage)
    library(rcrossref)
    library(aRxiv)
    library(foreach)
    library(doParallel)
    # Workers run concurrently, so a shared scratch file would be overwritten
    # by one worker while another is parsing it. Use a per-call file instead.
    temp_file <- tempfile(fileext = ".bib")
    on.exit(unlink(temp_file), add = TRUE)
  }

  # Use the `index` argument rather than a loop variable from the caller.
  bib_entry <- bib_df[index, ]
  cat("\n---- Exporting", index, " / ", nrow(bib_df), "\t", bib_entry$BIBTEXKEY, ": ", bib_entry$TITLE, "\n")

  raw_doi <- if ("DOI" %in% colnames(bib_entry)) bib_entry$DOI else NA_character_
  dois <- str_replace_all(tolower(raw_doi), "[{|}]", "")
  # Drop NA/empty values so that "no DOI" is simply a zero-length vector.
  dois <- dois[!is.na(dois) & dois != ""]
  title <- str_replace_all(bib_entry$TITLE, "[:|{|}]", "")

  if (length(dois) == 0L) {
    cat(paste0("- No DOI looking for it on CrossRef and ARXIV \n"))
    # A failed lookup for one entry should not abort the whole run.
    lookup_failed <- function(e) {
      cat("\nError: ", conditionMessage(e))
      NULL
    }
    # https://docs.ropensci.org/rcrossref/reference/cr_works.html
    resCR <- tryCatch(
      cr_works(query = title, format = "text", style = "acm", limit = 10),
      error = lookup_failed
    )
    # resPlos <- ft_search(query = title, from="plos")
    resArxiv <- tryCatch(
      arxiv_search(query = noquote(paste0('ti:\"', title, '\"')), limit = 10),
      error = lookup_failed
    )
    dois <- c(resArxiv$doi, resCR$data$doi)
    dois <- dois[!is.na(dois) & dois != ""]
    cat("ARXIV DOIS: ", length(resArxiv$doi), "CR DOIS: ", length(resCR$data$doi), "Total: ", length(dois))
  }

  similarity_threshold <- 0.8
  is_new <- FALSE
  if (length(dois) > 0L) {
    for (j in seq_along(dois)) {
      doi <- dois[j]
      cat("\nGetting data for DOI ", j, " of ", length(dois), ":\t", doi)
      # The braced expression runs in this function's frame, so assignments to
      # bib_entry / is_new made inside it are visible after tryCatch returns.
      # Its value says whether this candidate was accepted.
      accepted <- tryCatch({
        cr_cn(dois = doi, format = "bibtex", style = "acm", locale = "en-US") %>%
          write(file = temp_file, append = FALSE)
        new_bib <- bib2df(temp_file)[1, ]
        new_bib$BIBTEXKEY <- bib_entry$BIBTEXKEY
        new_title <- str_replace_all(tolower(new_bib$TITLE), "[:|{|}]", "")
        old_title <- tolower(title)
        titleSimilarity <- levenshteinSim(new_title, old_title)
        cat("\nNew Title: ", new_bib$TITLE, "\nTitle similarity: ", titleSimilarity, "\n")
        is_match <- isTRUE(titleSimilarity >= similarity_threshold)
        if (is_match) {
          if (only_add_dois) {
            cat("\nUpdating DOI...\n")
            bib_entry$DOI <- new_bib$DOI
            is_new <- FALSE
          } else {
            bib_entry <- merge_bibs(bib_entry, new_bib)
            # bib_entry <- new_bib[1,]
            is_new <- TRUE
          }
        }
        is_match
      },
      error = function(e) {
        cat("\nError: ", conditionMessage(e))
        FALSE
      })
      if (accepted) {
        break
      }
    }

    if (is_new) {
      cat("\nWriting NEW bib...\n")
      bib_entry$r_updated <- "YES"
    } else {
      cat("\nWriting OLD bib...\n")
      bib_entry$r_updated <- "NO"
    }
  }
  bib_entry
}
# ---- Driver ----
# Reads `filename`, runs tidyUpBib() on every entry (sequentially or on a
# local cluster) and writes the combined result next to the input file.
USE_MULTI_THREAD <- FALSE
if (!USE_MULTI_THREAD) {
  start_time <- Sys.time()
  bib_df <- bib2df(filename)
  cat("\nTotal refs: ", nrow(bib_df))
  # Process every entry, like the parallel branch does; a fixed index range
  # would read past the end of shorter files.
  bib_list_result <- foreach(i = seq_len(nrow(bib_df))) %do% {
    tidyUpBib(bib_df, i, out_file, temp_file, FALSE, FALSE)
  }
  # rbindlist() belongs to data.table, which is not attached by this script.
  # fill = TRUE tolerates rows that lack the `r_updated` column.
  bib_df_result <- data.table::rbindlist(bib_list_result, fill = TRUE)
  df2bib(bib_df_result, file = paste0(base_folder, "new_bib_single.bib"))
  end_time <- Sys.time()
  # format() keeps the time unit, which cat() would otherwise drop.
  cat("Single threaded execution time: ", format(end_time - start_time), "\n")
} else {
  start_time <- Sys.time()
  bib_df <- bib2df(filename)
  # Set up the parallel backend, leaving one core free but never asking for
  # fewer than one worker (detectCores() may return 1 or NA).
  cores <- detectCores()
  n_workers <- if (is.na(cores)) 1L else max(1L, cores - 1L)
  cl <- makeCluster(n_workers)
  registerDoParallel(cl)
  bib_list_result <- tryCatch(
    # No .combine: the rows are kept as a list and bound once below.
    # merge_bibs() is only referenced inside tidyUpBib(), so it has to be
    # exported to the workers explicitly.
    foreach(i = seq_len(nrow(bib_df)), .export = "merge_bibs") %dopar% {
      updated_bib <- tidyUpBib(bib_df, i, out_file, temp_file, FALSE, TRUE)
      cat("Finished ", i, " / ", nrow(bib_df))
      updated_bib
    },
    # Release the workers even when the loop fails.
    finally = stopCluster(cl)
  )
  bib_df_result <- data.table::rbindlist(bib_list_result, fill = TRUE)
  df2bib(bib_df_result, file = paste0(base_folder, "new_bib_multi.bib"))
  end_time <- Sys.time()
  cat("Multi threaded execution time: ", format(end_time - start_time), "\n")
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment