|
##' Format author list, i.e. for TOPMed banner authorship |
|
##' |
|
##' @param input Either a .csv filename containing author information OR the file already read into memory |
|
##' @param namecol_idx Column index of author names, formatted as "Last, First Middle" |
|
##' @param affilcol_idxs Column indices of affiliation elements, in the order they should be listed (see \code{details}) |
|
##' @param excludecol_idx Column index of column indicating authors to exclude from output. If left NULL (default), no authors will be excluded. |
|
##' @param affil_sep Character that should separate affiliation elements |
|
##' @param affil_numstart Starting affiliation number, defaults to 1 |
|
##' @param out_filename Output filename (".html" file extension will be appended) |
|
##' @param alpha_sort if TRUE, sort author list alphabetically by name column; otherwise retain input order |
|
##' @param verbose if TRUE, print to screen the excluded author names |
|
##' |
|
##' @return Write out an html-formatted file with formatted author list and affiliations. |
|
##' |
|
##' @details In the \code{input}, an exclude column with any non-NA character will cause the author on that line to be excluded from the output line. I.e., indicate a paper's main authors with a "T" or "Y" in the column specified with the \code{excludecol_idx} argument. Output will be an html-formatted file containing author names with numeric superscripts corresponding to institutional details below. The html can then be copy/pasted into MS Word, or printed straight to pdf. Authors with multiple affiliations will be handled correctly if each affiliation is listed on a separate line. |
|
##' |
|
##' @examples |
|
##' \dontrun{ |
|
##' system("head banner_authors.csv") |
|
##' Name,Institution(s),City,State,Country,Zip,exclude, |
|
##' "Kaplan, Robert",Albert Einstein College of Medicine,New York,NY,US,10461,T, |
|
##' "Smoller, Sylvia",Albert Einstein College of Medicine,New York,NY,US,10461,, |
|
##' "Sheehan, Vivien",Baylor College of Medicine,Houston,TX,US,77030,, |
|
##' "Custer, Brian",Blood Systems Research Institute UCSF,San Francisco,CA,US,94118,T, |
|
##' "Kelly, Shannon",Blood Systems Research Institute UCSF,San Francisco,CA,US,94118,, |
|
##' "Konkle, Barbara",Blood Works Northwest,Seattle,WA,US,98104,, |
|
##' "Huston, Haley",Blood Works Northwest,Seattle,WA,US,98105,, |
|
##' "Johnsen, Jill",Blood Works Northwest,Seattle,WA,US,98106,, |
|
##' "Ruuska, Sarah",Blood Works Northwest,Seattle,WA,US,98107,, |
|
##' |
|
##' format_authors(input="banner_authors.csv", affilcol_idx=c(2,3,4,5,6), |
|
##' excludecol_idx=7, affil_numstart=20, out_filename="test") |
|
##' } |
|
##' |
|
##' @author Sarah Nelson, TOPMed DCC, \email{sarahcn@@uw.edu} |
|
##' @rdname format_author |
|
##' @export |
|
|
|
format_authors <- function(input, namecol_idx=1, affilcol_idxs, excludecol_idx=NULL, affil_sep=", ", affil_numstart=1, out_filename="authors_formatted", alpha_sort=TRUE, verbose=TRUE){ |
|
|
|
# required packages |
|
require(readr) |
|
require(tools) |
|
require(dplyr) |
|
require(magrittr) |
|
require(stringr) |
|
require(tidyr) |
|
|
|
# check whether input arg is filename or data.frame already in memory |
|
if(is.character(input)){ |
|
# read in author list - required to be csv |
|
stopifnot(tools::file_ext(input) %in% "csv") |
|
message("Reading in ", input) |
|
tab <- readr::read_csv(file=input, col_types=cols(.default = "c")) # character col types |
|
} else { |
|
tab <- input |
|
} |
|
|
|
if(!is.null(excludecol_idx)){ |
|
# identify excluded author names |
|
names(tab)[excludecol_idx] <- "exclude" |
|
nexclude <- sum(!is.na(tab$exclude)) |
|
message("Total of ", prettyNum(nexclude, big.mark=","), " excluded authors") |
|
if(verbose){ |
|
exclude_names <- paste(tab[!is.na(tab$exclude), namecol_idx]) |
|
message("List of excluded authors: ", exclude_names) |
|
} |
|
|
|
# remove excluded authors |
|
tab <- tab %>% |
|
dplyr::filter(is.na(exclude)) |
|
} |
|
|
|
names(tab)[namecol_idx] <- "name" |
|
|
|
if(alpha_sort) { |
|
# sort alphabetically |
|
tab.use <- tab %>% |
|
arrange(name) |
|
} else { |
|
tab.use <- tab |
|
} |
|
|
|
# combine affiliation elements together into one string |
|
naffil_elems <- length(affilcol_idxs) |
|
affil_colnames <- paste0("affil",1:naffil_elems) |
|
names(tab.use)[affilcol_idxs] <- affil_colnames |
|
|
|
# make table of unique affiliations |
|
tab.use$affil_str <- apply(tab.use[, affil_colnames], 1, paste , collapse=affil_sep) |
|
|
|
# if any affil elements were missing, remove the ", NA" or "NA, " strings |
|
# to do: figure out how to search and replace both strings at once |
|
tab.use <- tab.use %>% |
|
mutate(affil_str = stringr::str_replace_all(affil_str, ", NA", "")) %>% |
|
mutate(affil_str = stringr::str_replace_all(affil_str, "NA, ", "")) %>% |
|
dplyr::filter(!is.na(name)) # remove any rows w/o authors |
|
|
|
# make unique set of affiliations and add number |
|
aff <- tab.use %>% |
|
select(affil_str) %>% |
|
unique() %>% |
|
mutate(number=affil_numstart:((affil_numstart-1) + n())) %>% |
|
mutate(aff_write = paste(number, affil_str, sep = ' - ')) |
|
|
|
# preserve current order - account for where there are > 1 row per person |
|
tmp <- tab.use %>% |
|
select(name) %>% |
|
distinct() %>% |
|
mutate(srted=1:n()) |
|
|
|
tab.use$srted <- tmp$srted[match(tab.use$name, tmp$name)] |
|
|
|
# add affiliation number to author list |
|
aut <- tab.use %>% |
|
left_join(aff, by="affil_str") %>% |
|
# separate first and last author names |
|
tidyr::separate(name, into=c("last","first"), sep=", ", remove=FALSE) %>% |
|
select(first, last, number, srted) %>% |
|
# combine number for auts w/ mult institutions |
|
group_by(first, last, srted) %>% |
|
# order affil number from least to greatest in the superscript |
|
summarise(number_comb = paste(sort(number), collapse=",")) %>% |
|
mutate(number_sup = stringr::str_c("<sup>", number_comb, "</sup>")) %>% |
|
mutate(aut_final = stringr::str_c(first, " ", last, number_sup, sep="")) |
|
|
|
if(alpha_sort){ |
|
aut <- aut %>% |
|
arrange(last, first) |
|
} else { |
|
aut <- aut %>% |
|
arrange(srted) |
|
} |
|
|
|
# write out authors string |
|
write.table(paste(aut$aut_final, collapse=", "), |
|
file=paste0(out_filename,".html"), row.names=FALSE, |
|
col.names=FALSE, quote=FALSE) |
|
|
|
# write out affiliations string |
|
write.table(c("<br /><br />", paste(aff$aff_write, collapse="; ")), |
|
file=paste0(out_filename,".html"), row.names=FALSE, |
|
col.names=FALSE, quote=FALSE, append=TRUE) |
|
|
|
message("\nPlease see output file ", paste0(out_filename,".html"),"; the html can be copy/pasted into Microsoft Word, or printed directly to pdf. Both methods should preserve the superscript formatting.\n") |
|
} |
Add feature to take input as object read into memory vs. just file name read from disk