@IdoBar
Last active October 12, 2023 05:37
A set of utilities to make working in R easier
# util.R:
# Utilities to make R a happier place
# Originally created by Brendan O'Connor, brenocon.com/code
# Edited by Ido Bar.
# Source the file directly from this gist using `devtools::source_gist("7f63547158ecdbacf31b54a58af0d1cc", filename = "util.R")`
########################################
## Put everything into an environment, to not pollute global namespace
util = new.env()
########################################
## Better I/O routines
util$repath <- function(x, to=.Platform$OS.type) {
xa <- ifelse(to=="windows", gsub('/', '\\\\', x), gsub('\\\\', '/', x))
return(xa)
}
# Search for a file recursively across a set of paths (defaults to the directories in the RPATH environment variable)
util$search.path <- function(fn,
paths = strsplit(chartr("\\", "/", Sys.getenv("RPATH")), split =
switch(.Platform$OS.type, windows = ";", ":"))[[1]]) {
for(d in paths)
if (file.exists(f <- file.path(d, fn)))
return(f)
return(NULL)
}
# Find the most recent file in a folder that matches a pattern
# Usage: recent_file("./data/intermediate_files", glue::glue(".*{stacks_name}.+\\.clean.vcf"))
util$recent_file <- function(filedir, pattern, sort_by="modified"){
files <- file.info(dir(filedir, pattern, full.names = TRUE))
files$filename <- rownames(files)
# files[[paste0(substr(sort_by,1,1), "time")]] <- as.POSIXct(files[[paste0(substr(sort_by,1,1), "time")]])
files[order(files[[paste0(substr(sort_by,1,1), "time")]] , decreasing = TRUE),"filename"][1]
}
# deprecated in favour of readr::read_tsv() function from tidyverse
# util$read.tsv <- function(..., header=F, sep='\t', quote='', comment='', na.strings='', stringsAsFactors=FALSE) {
# read.table() wrapper with default settings for no-nonsense, pure TSV
# Typical use case is output from another program.
# (R's defaults are more geared for human-readable datafiles, which is less
# feasible for large-scale data anyway.)
# These options are substantially faster than read.table() defaults.
# (see e.g. LINK)
# stringsAsFactors is the devil.
# args = list(...)
# args$header = header
# if (!is.null(args$col.names)) {
# read.delim() is not smart about this. Yikes.
# args$header = FALSE
# }
# args$sep = sep
# args$quote = quote
# args$comment = comment
# args$stringsAsFactors = stringsAsFactors
# args$na.strings = na.strings
# do.call(read.delim, args)
# }
# deprecated in favour of readr::write_tsv() function from tidyverse
# util$write.tsv <- function(..., header=NA, col.names=F, row.names=F, sep='\t', na='', quote=F) {
# 'header' to 'col.names' naming consistency with read.table()
# if (is.finite(header)) col.names = header
# write.table(..., col.names=col.names, row.names=row.names, sep=sep, na=na, quote=quote)
# }
# Write a two-column table of ids and sequences to a FASTA file
util$write_fasta <- function(id_seq_table, line_width=60, filename){
file.create(filename)
apply(id_seq_table, 1, function(entry) cat(sprintf(">%s\n%s\n\n", entry[1],
gsub(sprintf("([GTAC]{%i})",line_width), "\\1\n", entry[2])),
file = filename, append = TRUE))
if (file.exists(filename)) return(TRUE)
else {
message(sprintf("Unable to create the file %s, please check that the path exists and is accessible", filename))
return(FALSE)
}
}
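# Example usage (illustrative sketch, not part of the original gist; 'seqs' is a hypothetical
# two-column data frame of ids and sequences):
# seqs <- data.frame(id = c("seq1", "seq2"), seq = c("ATGCATGCATGC", "GGGTTTAAACCC"))
# write_fasta(seqs, line_width = 60, filename = "sequences.fasta")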
# Improved implementation of xlsx::write.xlsx, built on openxlsx (allows overwriting sheets and returns the data as a tibble, so it can be used inside tidyverse pipes)
util$write_xlsx <- function(df, excel_file, sheet="Sheet1", overwritesheet=FALSE, overwritefile = TRUE,
replacefile = FALSE, asTable=TRUE, return.tibble=TRUE,...){
if (!'openxlsx' %in% row.names(installed.packages())) stop("Package 'openxlsx' is required and not available, please install it and try again")
df <- as.data.frame(df)
if (!file.exists(excel_file) | isTRUE(replacefile)) {
unlink(excel_file)
openxlsx::write.xlsx(df, excel_file, sheetName = sheet, rowNames=FALSE, asTable=asTable, ...)
if (isTRUE(return.tibble) & ('tibble' %in% row.names(installed.packages()))) df <- tibble::as_tibble(df)
return(invisible(df))
}
if (!isTRUE(overwritefile)) stop(sprintf("File %s exists and 'overwritefile' set to FALSE.", excel_file))
wb <- openxlsx::loadWorkbook(excel_file)
sheets <- openxlsx::getSheetNames(excel_file)
if (sheet %in% sheets){
if (overwritesheet==FALSE) stop(sprintf("Sheet %s exists in file and 'overwritesheet' set to FALSE, please try again with a new sheet name or with 'overwritesheet=TRUE' option", sheet))
openxlsx::removeWorksheet(wb, sheet)
}
openxlsx::addWorksheet(wb, sheet)
if (asTable==TRUE){
openxlsx::writeDataTable(wb, sheet, df)
} else {
openxlsx::writeData(wb, sheet, df)
}
openxlsx::saveWorkbook(wb, excel_file, overwrite = overwritefile)
# openxlsx::write.xlsx(df, excel_file, sheetName = sheet, row.names=FALSE, ...)
if (isTRUE(return.tibble) & ('tibble' %in% row.names(installed.packages()))) df <- tibble::as_tibble(df)
return(invisible(df))
}
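# Example usage (illustrative sketch; assumes the 'openxlsx' package is installed and the
# magrittr/dplyr pipe is available):
# mtcars %>% dplyr::filter(cyl == 6) %>%
#   write_xlsx("cars.xlsx", sheet = "six_cyl", overwritesheet = TRUE) %>%
#   dplyr::summarise(mean_mpg = mean(mpg))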
# extract hyperlinks from excel sheets (see https://stackoverflow.com/a/70676505/5346827)
util$hyperlinks_from_excel <- function(aExcelFile, ExcelSheet="all",
aRefOutputFile = NULL){
req_pkgs <- c("XML", "purrr", "dplyr", "readxl")
if (!'rlang' %in% row.names(installed.packages())) stop("'rlang' package is not installed, please install it and try again")
# if (!"rlang" %in% installed.packages() require(XML)) stop("XML package is not installed")
rlang::check_installed(req_pkgs,
reason = "Package XML is required to extract the urls from the sheets, while purrr and dplyr are needed to process the results")
# ExcelSheet="submission_info"
# aExcelFile="sample_info/AGRF_NGS_yield.xlsx"
sheets <- readxl::excel_sheets(aExcelFile)
read_relationships <- function(aSheetIndex, tmp_dir){
# tmp_dir= tmp_base_dir
# aSheetIndex = 2
filename <- file.path(tmp_dir, 'xl', 'worksheets', '_rels', paste0('sheet', aSheetIndex, '.xml.rels'))
rel <- XML::xmlParse(filename) %>% XML::xmlToList() %>% purrr::map_dfr(as.list) %>%
dplyr::filter(grepl("hyperlink$", Type)) %>%
dplyr::select(id=Id, target=Target) #%>% setNames()
if(nrow(rel) == 0){
return(NULL)
}
filename <- file.path(tmp_dir, 'xl', 'worksheets', paste0('sheet', aSheetIndex, '.xml'))
pos <- XML::xmlParse(filename) %>% XML::xmlToList()
if(is.null(pos$hyperlinks)){
return(NULL)
}
ret <- purrr::map_dfr(pos$hyperlinks, as.list) %>%
# dplyr::select(-uid) %>%
dplyr::inner_join(rel, by = 'id') %>%
dplyr::mutate(tab_idx=aSheetIndex)
link_text <- ret %>% dplyr::select(text=display, target) %>%
dplyr::filter(!is.na(text)) %>%
dplyr::distinct()
return(ret %>% dplyr::left_join(link_text))
}
EXCEL_TEMP_NAME <- 'unzipped_excel'
tmp_base_dir <- file.path(tempdir(),
paste0('tmpexcl',
as.character(round(runif(1, 1000000000000, 9999999999999)))))
dir.create(tmp_base_dir)
on.exit(unlink(tmp_base_dir))
zipfile <- file.path(tmp_base_dir, paste0(EXCEL_TEMP_NAME, '.zip'))
file.copy(from = aExcelFile, to = zipfile)
unzip(zipfile, exdir = tmp_base_dir)
sheet_idx <- NULL
# ExcelSheet=sheets[1]
if (is.numeric(ExcelSheet) & all(ExcelSheet %in% seq_along(sheets))) {
sheet_idx <- ExcelSheet
} else if (is.character(ExcelSheet) & all(ExcelSheet %in% sheets)) {
sheet_idx <- which(sheets %in% ExcelSheet)
} else if (tolower(ExcelSheet)=="all") {
sheet_idx <- seq_along(sheets)
}
if (length(sheet_idx)==0) stop("Sheets provided to 'ExcelSheet' argument not present in the Excel file")
res <- purrr::map_dfr(sheet_idx, ~read_relationships(.x, tmp_base_dir)) %>%
dplyr::mutate(sheet = sheets[tab_idx]) %>%
dplyr::mutate(ref = paste0("'", sheet, "'!", ref)) %>%
dplyr::select(id,tab_idx, sheet, ref, target, text)
if(!is.null(aRefOutputFile)){
readr::write_csv(res, aRefOutputFile)
}
return(res)
}
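# Example usage (illustrative sketch; 'links.xlsx' is a hypothetical workbook containing hyperlinks):
# link_table <- hyperlinks_from_excel("links.xlsx", ExcelSheet = "all")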
########################################
## Misc small routines
# calculates standard error
util$se <- function(x, na.rm=TRUE) {
if(na.rm) return(sqrt(var(x, na.rm=TRUE)/length(x[!is.na(x)]))) else return(sqrt(var(x)/length(x)))
}
# calculate geometric mean (see https://stackoverflow.com/a/25555105/5346827)
util$gm_mean = function(x, na.rm=TRUE){
exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
}
util$as.c <- as.character
# split vector into chunks (see https://stackoverflow.com/questions/3318333/split-a-vector-into-chunks)
util$chunk <- function(x, n) {
  starts <- seq.int(from = 1, to = length(x), by = n)
  ends <- pmin(starts + (n - 1), length(x))
  mapply(function(a, b) x[a:b], starts, ends, SIMPLIFY = FALSE)
}
util$chunk2 <- function(x,n) split(x, cut(seq_along(x), n, labels = FALSE))
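# Example usage (illustrative sketch; assumes 'purrr' is installed):
# chunk(1:10, 3)                          # list(1:3, 4:6, 7:9, 10)
# purrr::map_int(chunk(1:10, 3), length)  # 3 3 3 1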
util$unwhich <- function(indices, len=length(indices)) {
# reverse of which(): from indices to boolean mask.
ret = rep(F,len)
ret[indices] = T
ret
}
util$nna <- function(...) !is.na(...) # i type this a lot, i think its worth 3 characters + shift key
util$shuffle <- function(...) UseMethod("shuffle")
util$shuffle.default <- function(x) x[order(runif(length(x)))]
util$shuffle.data.frame <- function(x) x[order(runif(nrow(x))),]
util$sample_df <- function(d, size=10, ...) {
samp = sample(1:nrow(d), size=size, ...)
d[samp,]
}
# split a dataframe into subsets by size
util$split_df <- function(df, group_size){
split(df, gl(ceiling(nrow(df)/group_size), group_size, nrow(df)))
}
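# Example usage (illustrative sketch):
# split_df(mtcars, group_size = 10)  # a list of 4 data frames with 10, 10, 10 and 2 rows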
util$present_levels <- function(x) intersect(levels(x), x)
util$trim_levels <- function(...) UseMethod("trim_levels")
util$trim_levels.factor <- function(x) factor(x, levels=present_levels(x))
util$trim_levels.data.frame <- function(x) {
for (n in names(x))
if (is.factor(x[,n]))
x[,n] = trim_levels(x[,n])
x
}
util$unfactor <- function(df){
id <- sapply(df, is.factor)
df[id] <- lapply(df[id], as.character)
df
}
# grep() returns indices of matches. Variants:
util$bgrep <- function(pat,x, ...) {
# "boolean" grep: return a logical vector ready for vector ops
# like & | and others
unwhich(grep(pat,x,...), length(x))
}
util$ngrep <- function(pat,x, ...)
# "normal" grep: return values, not indices
x[grep(pat,x,...)]
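# Example usage (illustrative sketch):
# fruit <- c("apple", "banana", "cherry")
# bgrep("an", fruit)  # FALSE TRUE FALSE
# ngrep("an", fruit)  # "banana"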
########################################
## Other data manipulation routines
util$tidy_transpose <- function(df) {
df %>% t() %>% # Transpose (t() works on matrices, so this returns a matrix)
as.data.frame() %>% # Coerce back to a data frame
tibble::rownames_to_column(var = "rowname") %>% # Recover the first column from the row names
janitor::row_to_names(row_number = 1) # Recover the column headers from the first row
}
util$tapply2 <- function(x, ...) {
# like tapply but preserves factors
if (is.factor(x)) {
r = factor(tapply(as.character(x), ...), levels=levels(x))
} else {
r = tapply(x, ...)
}
r
}
util$inject <- function(collection, start, fn) {
# like lisp reduce. (named after ruby)
acc = start
for (x in collection)
acc = fn(acc, x)
acc
}
##### Bioinformatics Annotation #######
# Shorten and prettify long BLAST descriptions
util$pretty_term <- function(term, prefix="Predicted:", comma=FALSE, dot=FALSE, paren=FALSE, cap=TRUE, firstCapOnly=FALSE,
spDot=TRUE, charNum=0, showTrimmed=FALSE) {
term <- sapply(term, function (s) {
s_ <- s
if (is.character(prefix) && length(prefix)>0) s_ <- sub(sprintf("^%s\\s*",prefix) ,"", s_, ignore.case = TRUE, perl=TRUE)
if (cap) s_ <- sub("(^.)", "\\U\\1", s_, perl=TRUE)
if (firstCapOnly) s_ <- paste(c(unlist(strsplit(s_, " ", fixed = TRUE))[1], sapply(unlist(strsplit(s_, " ", fixed = TRUE))[-1], function(n) sub("(^.)", "\\L\\1", n, perl=TRUE), USE.NAMES = FALSE)), collapse = " ")
if (spDot) s_ <- sub("sp\\. (.+?)\\.(.+)", "sp. \\1_\\2", s_, perl=TRUE)
if (paren) s_ <- sub(" \\(.+\\)", "", s_)
if (dot) s_ <- unlist(strsplit(s_, ".", fixed = TRUE))[1]
if (comma) s_ <- unlist(strsplit(s_, ",", fixed = TRUE))[1]
# Trim to the end of the word if character number limit is reached and remove any trailing commas or dots
if (charNum>0) s_ <- sub("[,|\\.]$", "", sub(sprintf("(^.{%s}[\\S]*)[ ]*.*", charNum), "\\1", s_, perl=TRUE), perl = TRUE)
# Add a ^ at the end if the term was trimmed anyhow
if (nchar(s_)<nchar(s) && !grepl("\\*$", s_) && showTrimmed) return(paste0(s_, "^"))
else return(s_)
}, USE.NAMES = FALSE)
return(term)
}
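# Example usage (illustrative sketch; the input string is made up):
# pretty_term("PREDICTED: serine/threonine-protein kinase, putative isoform X1",
#             comma = TRUE, charNum = 30, showTrimmed = TRUE)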
# Find unique Tax_name
util$find_uniq_tax <- function(desc, taxName, prettify=FALSE) {
if (is.na(taxName) || taxName=="N/A") uniqTax <- ifelse((is.na(desc) || !grepl(".+\\[(.*)\\]", desc)), NA, sub(".+\\[(.*)\\]","\\1", desc))
else {
if (grepl(";", taxName)) {
taxList <- unlist(strsplit(taxName, ";"))
taxDesc <- sub(".+\\[(.*)\\]","\\1", desc)
uniqTax <- ifelse(taxDesc %in% taxList, taxDesc, taxList[length(taxList)])
} else uniqTax <- taxName
}
if (prettify) uniqTax <- pretty_term(uniqTax, prefix="Undetermined", comma=FALSE, dot=FALSE, paren=FALSE, cap=TRUE, spDot=TRUE, charNum=0, showTrimmed=FALSE)
return(uniqTax)
}
# Resolve scientific name (with taxize package)
util$resolve_scientific <- function(tax_names) {
tax_count=0
resolved <- lapply(tax_names, function(x) {
tax_count <<- tax_count + 1
perc <- tax_count/length(tax_names)*100
message(sprintf("Processing %s (%.1f%% completed)", x, perc))
res_name <- try(taxize::gnr_resolve(names = x, data_source_ids=4, canonical = TRUE,best_match_only=FALSE ), TRUE)
if (!inherits(res_name, "try-error")) return(res_name$results[which.max(res_name$results$score),])
else {
message(sprintf("Warning: Could not process %s correctly, returning NAs", x))
return(data.frame(submitted_name=x,matched_name=NA,data_source_title=NA,score=NA))
}
} )
return(resolved)
}
# Analyse a single DE contrast (a pair of group levels, typically one row of a contrast matrix) from a DESeq2 dataset; note it expects an 'analysis_date' object in the calling environment
util$get_deseq2_res <- function(dds_deseq2, mat_col, group_col = "sample_group") {
contrast <- c(group_col,rev(mat_col))
comparison <- paste0(rev(mat_col), collapse="_vs_")
res <- DESeq2::results(dds_deseq2, contrast=contrast, tidy = TRUE) %>% as_tibble() %>%
dplyr::rename(gene_id=row) %>%
dplyr::mutate(contrast=comparison, analysis_date=analysis_date) %>%
dplyr::filter(!is.na(padj)) %>% dplyr::arrange(padj, desc(abs(log2FoldChange)))
return(res)
}
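# Example usage (illustrative sketch; assumes a DESeqDataSet 'dds' already run through DESeq2::DESeq(),
# an 'analysis_date' object in the session, the tidyverse loaded, and a made-up 2-column contrast matrix):
# contrast_mat <- matrix(c("treated", "control", "mutant", "control"), ncol = 2, byrow = TRUE)
# de_res_list <- apply(contrast_mat, 1, function(m) get_deseq2_res(dds, m, group_col = "condition"))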
# Modified DESeq2::plotPCA that computes the first 4 PCs and lets you choose which pair to plot
util$plotPCA4 <- function (dds_deseq2, intgroup = "condition", ntop = 500, PCs=3:4, returnData = FALSE){
rv <- genefilter::rowVars(SummarizedExperiment::assay(dds_deseq2))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop,
length(rv)))]
pca <- prcomp(t(SummarizedExperiment::assay(dds_deseq2)[select, ]))
percentVar <- pca$sdev^2/sum(pca$sdev^2)
if (!all(intgroup %in% names(SummarizedExperiment::colData(dds_deseq2)))) {
stop("the argument 'intgroup' should specify columns of colData(dds)")
}
intgroup.df <- as.data.frame(SummarizedExperiment::colData(dds_deseq2)[, intgroup,
drop = FALSE])
group <- if (length(intgroup) > 1) {
factor(apply(intgroup.df, 1, paste, collapse = " : "))
}
else {
SummarizedExperiment::colData(dds_deseq2)[[intgroup]]
}
d <- data.frame(PC1 = pca$x[, 1], PC2 = pca$x[, 2], PC3 = pca$x[, 3], PC4 = pca$x[, 4], group = group,
intgroup.df, name = colnames(dds_deseq2))
if (returnData) {
attr(d, "percentVar") <- percentVar[1:4]
return(d)
}
ggplot2::ggplot(data = d, aes_string(x = paste0("PC", PCs[1]), y = paste0("PC", PCs[2]), color = "group")) +
ggplot2::geom_point(size = 3) +
ggplot2::labs(x = sprintf("PC%d: %.2f%% variance", PCs[1], percentVar[PCs[1]]*100),
y = sprintf("PC%d: %.2f%% variance", PCs[2], percentVar[PCs[2]]*100)) + ggplot2::coord_fixed()
# + ylab(paste0("PC4: ", round(percentVar[2] * 100), "% variance"))
}
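# Example usage (illustrative sketch; assumes 'vsd' is a variance-stabilised DESeqTransform object):
# plotPCA4(vsd, intgroup = "condition", PCs = 3:4)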
########################################
## Printing, viewing
## see also: str()
## Transparent colors
## Mark Gardener 2015
## www.dataanalytics.org.uk
util$t_col <- function(color, percent = 50, name = NULL) {
# color = color name, percent = % transparency, name = an optional name for the color
## Get RGB values for named color
rgb.val <- col2rgb(color)
## Make new color using input color as base and alpha set by transparency
t.col <- rgb(rgb.val[1], rgb.val[2], rgb.val[3],
max = 255,
alpha = (100-percent)*255/100,
names = name)
## Save the color
invisible(t.col)
}
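# Example usage (illustrative sketch):
# semi_red <- t_col("red", percent = 70)
# plot(1:10, col = semi_red, pch = 19, cex = 3)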
util$printf <- function(...) cat(sprintf(...))
util$listprint <- function(x) {
s = paste(sapply(names(x), function(n) sprintf("%s=%s", n,x[[n]])), collapse=' ')
printf("%s\n", s)
}
util$msg <- function(...) cat(..., "\n", file=stderr())
util$LogMsg <- function(text, dateformat="%d-%m-%Y %I:%M:%S", ...) {
if (is.null(dateformat) || isFALSE(dateformat)) return(cat(paste0(text, "\n")))
msg <- sprintf(paste0(format(Sys.time(), dateformat), ": ", text, "\n"), ...)
cat(msg)
}
util$h = utils::head
util$ppy <- function(x, column.major=FALSE, ...) {
# pretty-print as yaml. intended for rows with big textual cells.
# a la mysql's \G operator
library(yaml)
cat(as.yaml(x, column.major=column.major), ...)
cat("\n", ...)
}
util$table_html = function(...) {
# Intended for inside dosink()
columns = list(...)
ncol = length(columns)
nrow = length(columns[[1]])
# assume columns are in parallel
printf("\n<table cellpadding=3 border=1 cellspacing=0 bordercolor=gray>")
for (i in 1:nrow) {
printf("\n<tr>")
for (j in 1:ncol)
printf("\n <td>%s", columns[[j]][i])
}
printf("\n</table>\n")
}
########################################
## Workspace management
# Fix devtools
util$.fixdevtools <- function() {
dev_reqs <- c("httr", "devtools")
install.deps(dev_reqs)
httr::set_config( httr::config( ssl_verifypeer = 0L ) )
options(buildtools.check = function(action) TRUE )
}
# Silently load a package, can change mode to "require"
util$silent.load <- function(a.package, mode="library"){
eval(parse(text=sprintf("suppressWarnings(suppressPackageStartupMessages(
%s(a.package, character.only=TRUE)))", mode)))
}
# install and load packages from CRAN, Bioconductor or git, see usage below
util$install.deps <- function(p, repo="cran", reposite=getOption("repos"), silent=FALSE, force=FALSE){
# repos=getOption("repos")
if (is.null(reposite)) reposite <- "http://cloud.r-project.org/"
call_install <- switch(repo,
cran=sprintf("install.packages(package, repos=\"%s\"", reposite),
bioc="biocLite(package, suppressUpdates=TRUE, ask=FALSE",
git="install_github(package")
if (repo=="git") .fixdevtools()
if (repo=="bioc" & !all(p %in% utils::installed.packages())) {
#source("http://bioconductor.org/biocLite.R")
if (!"RCurl" %in% utils::installed.packages()) {
install.packages("RCurl", repos=reposite)
}
eval(parse(text = RCurl::getURL("http://bioconductor.org/biocLite.R",
ssl.verifypeer=FALSE)))
biocLite(suppressUpdates=TRUE, ask=FALSE)
}
for (package in p) {
gitNames <- unlist(strsplit(package, "/"))
pName <- gitNames[length(gitNames)]
if (!pName %in% utils::installed.packages() | isTRUE(force)) {
cat(sprintf("Please wait, installing and loading missing package %s and its dependencies from %s.\n",
package, repo), file=stderr())
suppressWarnings(eval(parse(text = sprintf("%s, quiet = TRUE, lib=.libPaths()[1])",call_install))))
if (!pName %in% utils::installed.packages()) {
ifelse(!interactive(),q("no", 2, TRUE),
stop(sprintf("Unable to install missing package %s from %s.\n",
package, repo), call. = FALSE))
}
}
if (isTRUE(silent)) silent.load(pName, mode="require")
else require(pName, character.only=T, quietly=TRUE, warn.conflicts=TRUE)
}
}
# CRAN_packages <- c("RCurl", "dplyr", "tidyr", "ggplot2", "RColorBrewer", "RSQLite")
# install.deps(CRAN_packages)
# improved list of objects
# http://stackoverflow.com/questions/1358003/tricks-to-manage-the-available-memory-in-an-r-session
util$list_objects = function (pos = 1, pattern) {
napply = function(names, fn) sapply(names, function(x)
fn(get(x, pos = pos)))
names = ls(pos = pos, pattern = pattern)
N = length(names)
obj_class = napply(names, function(x) as.character(class(x))[1])
obj_mode = napply(names, mode)
obj_type = ifelse(is.na(obj_class), obj_mode, obj_class)
obj_prettysize = napply(names, function(x) {
capture.output(print(object.size(x), units = "auto")) })
obj_size = napply(names, object.size)
obj_prettysize[obj_size < 1e6] = ""
obj_length = napply(names, function(x) length(x))
obj_dim = t(napply(names, function(x)
as.numeric(dim(x))[1:2]))
is_flat = is.na(obj_dim)[, 1]
is_vector = napply(names, function(x) is.vector(x) & class(x) != 'list')
info_width = max(20, options('width')$width - 60)
small_str = function(x) {
out = capture.output(
str(x, max.level=0, give.attr=F, give.head=F, width=info_width, strict.width='cut')
)
out = str_c(out,collapse=' ')
out = cutoff(str_replace(out,"\n"," "))
if (str_detect(out, "^List of"))
out = str_c("[Names] $ ", str_c(names(x),collapse=' '))
cutoff(out)
}
cutoff = function(s) {
if (str_length(s) >= info_width) {
str_c(str_sub(s,1,info_width-2),'..')
} else {
s
}
}
pad = function(s) sprintf(" %s", s)
out <- data.frame(
Type = obj_type,
Size = obj_prettysize,
Dim = ifelse(is_vector | is_flat, obj_length,
sprintf("(%s, %s)", obj_dim[,1], obj_dim[,2])),
Value = napply(names, function(x)
if (class(x) %in% c('data.frame','list') && !is.null(names(x)))
cutoff(str_c("[Names] $ ",str_c(names(x), collapse=' ')))
else small_str(x)
),
stringsAsFactors=F)
row.names(out) = names
out$Dim = sprintf(" %s", out$Dim)
out$Value = sprintf(str_c(" %-", info_width, "s"), out$Value)
out = rbind(subset(out, Type!='function'), subset(out, Type=='function'))
out
}
util$lsos = function() {
d = list_objects()
d$name = row.names(d)
d = subset(d, name != 'util')
row.names(d)=d$name
d$name=NULL
d
}
########################################
## For performance optimization and long-running jobs
util$timeit <- function(expr, name=NULL) {
# print how long the expression takes, and return its value too.
# So you can interpose timeit({ blabla }) around any chunk of code "blabla".
start = Sys.time()
ret = eval(expr)
finish = Sys.time()
if (!is.null(name)) cat(name,": ")
print(finish-start)
invisible(ret)
}
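# Example usage (illustrative sketch):
# x <- timeit({ sum(rnorm(1e6)) }, name = "random sum")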
util$dotprogress <- function(callback, interval=10) {
# intended to wrap the anonymous callback for sapply() or somesuch.
# ALTERNATIVE: plyr *ply(.progress='text')
count = 0
return(function(...) {
if ((count <<- count+1) %% interval == 0)
cat(".")
callback(...)
})
}
########################################
## External programs for interactivity
########################################
## Graphics output wrappers
## For easy one-liners, like:
## dopdf("tmp.pdf",width=5,height=5,cmd=plot(x,y))
util$dopdf <- function(filename,..., cmd) {
pdf(filename, ...)
eval(cmd)
dev.off()
if (exists('OPEN') && OPEN)
system(sprintf("open %s", filename))
}
util$dopng <- function(filename,..., cmd) {
png(filename, ...)
eval(cmd)
dev.off()
if ((exists('OPEN') && OPEN))
system(sprintf("open %s", filename))
}
util$dosink <- function(filename,cmd, open=NULL) {
# like capture.output() but follows open/OPEN conventions here
sink(filename)
eval(cmd)
sink(NULL)
if (prio_check(open, exists('OPEN') && OPEN))
system(sprintf("open %s", filename))
}
util$dosvg <- function(filename, ..., cmd, open=NULL) {
require("RSvgDevice")
devSVG(filename, ...)
eval(cmd)
dev.off()
if (prio_check(open, exists('OPEN') && OPEN))
system(sprintf("open %s", filename))
}
# A default plotting theme
util$plot_theme <- function(def_theme="grey", baseSize=22, strip_fill="lightskyblue"){
require("ggplot2")
my_theme <- eval(parse(text = sprintf("theme_%s(base_size=%d)", def_theme,baseSize))) +
theme(axis.title.y=element_text(face="bold", vjust = 1.5, size=rel(0.8)),
axis.title.x=element_text(face="bold", vjust = 0.1, size=rel(0.8)),
legend.title=element_text(size=rel(0.85), hjust=0.5),
legend.text=element_text(size = rel(0.75),lineheight = 1.5),
axis.text.y=element_text(size=rel(0.85)),
axis.text.x=element_text(size=rel(0.85)),
#panel.grid.minor=element_blank(),
strip.text=element_text(size=rel(0.75)),
strip.background=element_rect(fill = strip_fill))
return(my_theme)
}
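# Example usage (illustrative sketch; assumes 'ggplot2' is loaded):
# ggplot(mtcars, aes(wt, mpg)) + geom_point() + plot_theme("bw", baseSize = 18)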
# plot correlation matrix (https://statsandr.com/blog/correlation-coefficient-and-correlation-test-in-r/)
# do not edit
util$corrplot2 <- function(data,
method = "pearson",
sig.level = 0.05,
order = "original",
diag = FALSE,
type = "upper",
tl.srt = 90,
number.font = 1,
number.cex = 1,
mar = c(0, 0, 0, 0)) {
require("corrplot")
data_incomplete <- data
data <- data[complete.cases(data), ]
mat <- cor(data, method = method)
cor.mtest <- function(mat, method) {
mat <- as.matrix(mat)
n <- ncol(mat)
p.mat <- matrix(NA, n, n)
diag(p.mat) <- 0
for (i in 1:(n - 1)) {
for (j in (i + 1):n) {
tmp <- cor.test(mat[, i], mat[, j], method = method)
p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
}
}
colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
p.mat
}
p.mat <- cor.mtest(data, method = method)
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(mat,
method = "color", col = col(200), number.font = number.font,
mar = mar, number.cex = number.cex,
type = type, order = order,
addCoef.col = "black", # add correlation coefficient
tl.col = "black", tl.srt = tl.srt, # rotation of text labels
# combine with significance level
p.mat = p.mat, sig.level = sig.level, insig = "blank",
# hide correlation coefficients on the diagonal
diag = diag
)
}
# A pretty ggplot theme adapted from the Coding Club data-viz tutorial (https://ourcodingclub.github.io/tutorials/dataviz-beautification-synthesis/#distributions)
# Unlike the original niwot_theme(), this version takes an optional base font size (baseSize)
util$theme_niwot <- function(baseSize=16){
theme_bw(base_size=baseSize) +
theme(text = element_text(family = "Helvetica Light"),
axis.text = element_text(size = rel(0.85)),
# axis.title = element_text(size = 18),
axis.line.x = element_line(color="black"),
axis.line.y = element_line(color="black"),
panel.border = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.y = element_blank(),
plot.margin = unit(c(1, 1, 1, 1), units = "cm"),
plot.title = element_text(vjust = 1, hjust = 0),
legend.text = element_text(size = rel(0.85)),
legend.title = element_blank(),
legend.position = c(0.95, 0.15),
legend.key = element_blank(),
legend.background = element_rect(color = "black",
fill = "transparent",
size = 2, linetype = "blank"))
}
# Create a filename with folder and current date/time before the extension
util$filedate <- function(filename, ext="pdf", outdir=".", dateformat="%d_%m_%Y"){
if (outdir!="." & !file.exists(outdir)) dir.create(outdir, recursive = TRUE)
ext <- sub("\\.*(.+)", "\\1", ext)
if (is.null(dateformat)) return(file.path(outdir, sprintf("%s.%s", filename, ext)))
if (dateformat==FALSE) return(file.path(outdir, sprintf("%s.%s", filename, ext)))
return(file.path(outdir, sprintf("%s_%s.%s", filename, format(Sys.time(), dateformat), ext)))
}
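# Example usage (illustrative sketch; creates the 'plots' folder if needed):
# pdf(filedate("pca_plot", ext = "pdf", outdir = "plots"))  # opens plots/pca_plot_<date>.pdf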
########################################
## Plotting routines
# Create nice log axis breaks (see package "scales")
util$log_breaks <- function(n = 10, minor=TRUE, major=TRUE, pretty=FALSE){
function(x) {
b <- axisTicks(log10(range(x, na.rm = TRUE)), log = TRUE, n = n)
if (!minor) b <- b[grepl("1", prettyNum(b))]
if (!major) b <- b[grepl("5", prettyNum(b))]
if (pretty) b <- prettyNum(b)
return(b)
}
}
# Create log axis breaks (all ticks, no thinning)
util$base_breaks <- function(n = 10){
function(x) {
axisTicks(log10(range(x, na.rm = TRUE)), log = TRUE, n = n)
}
}
# A function factory for getting integer y-axis values.
util$integer_breaks <- function(n = 5, ...) {
fxn <- function(x) {
breaks <- floor(pretty(x, n, ...))
names(breaks) <- attr(breaks, "labels")
breaks
}
return(fxn)
}
util$linelight <- function(x,y, lty='dashed', col='lightgray', ...) {
# highlight a point with lines running to the axes.
left = par('usr')[1]
bot = par('usr')[3]
segments(left,y, x,y, lty=lty, col=col, ...)
segments(x,bot, x,y, lty=lty, col=col, ...)
}
util$hintonplot <- function(mat, max_value=max(abs(mat)), mid_value=0, ...) {
# Plots a matrix as colored, size-varying boxes
# I dunno who started calling this a "Hinton plot", but anyways
# Example:
# hintonplot(matrix(rnorm(100),10))
# Example, for counts:
# table(cyl=mtcars$cyl, mpg=cut(mtcars$mpg,3))
# mpg
# cyl (10.4,18.2] (18.2,26.1] (26.1,33.9]
# 4 0 6 5
# 6 2 5 0
# 8 12 2 0
# hintonplot(table(cyl=mtcars$cyl, mpg=cut(mtcars$mpg,3)))
plot.new()
plot.window(xlim=c(0.5,ncol(mat)+0.5), ylim=c(0.5,nrow(mat)+0.5))
x_mid = 1:ncol(mat)
y_mid = 1:nrow(mat)
area = abs(mat) / max_value
side = sqrt(area)
for (x in 1:ncol(mat)) {
for (y in nrow(mat):1) {
# ym = (nrow(mat):1)[y]
ym = y
d = side[ym,x] / 2
rect(x-d, y-d, x+d, y+d, col=if (mat[ym,x]>0) 'darkblue' else 'darkred')
}
}
axis(1, 1:ncol(mat), labels=colnames(mat))
# axis(2, nrow(mat):1, labels=row.names(mat))
axis(2, 1:nrow(mat), labels=row.names(mat))
title(xlab=names(dimnames(mat))[2], ylab=names(dimnames(mat))[1], ...)
}
########################################
## Rmarkdown functions
# make captions for DT tables
util$DT_caption <- function(ref, caption, tab_width="100%"){
# Do not use underscores or spaces in references!
if (grepl("[_ ]", ref)) stop("Do not use underscores (_) or spaces ( ) in figure or table reference!")
cat(sprintf("<table width=%s>",tab_width) ,
sprintf("<caption>(#tab:%s)%s</caption>", ref, caption),"</table>",
sep ="\n")
}
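# Example usage (illustrative sketch; intended for an R Markdown chunk with results='asis',
# followed by printing a DT::datatable(); the reference uses bookdown-style (#tab:...) labels):
# DT_caption("diff-genes", "Differentially expressed genes in each contrast.")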
########################################
## Has to be last in file
while("util" %in% search())
detach("util")
attach(util)
IdoBar commented Jun 27, 2018

Fixed filedate() to recursively create folder names

IdoBar commented Jun 4, 2019

Removed the dependency on the xlsx package (which requires Java) in favour of openxlsx in write_xlsx()

IdoBar commented Nov 23, 2019

Improved functionality of write_xlsx() (check dependencies, optional conversion to tibble)

IdoBar commented May 29, 2022

Added Correlation plot function corrplot2() (see source)

IdoBar commented Apr 27, 2023

Added split_df() to split a dataframe into equal-sized subsets (returns a list of dataframes).
See source on SO

IdoBar commented Sep 1, 2023

Added chunk() to split a vector into chunks (very useful for working with purrr).
See source on SO
