Skip to content

Instantly share code, notes, and snippets.

@alekrutkowski
Last active June 15, 2016 12:45
Show Gist options
  • Save alekrutkowski/6a5fa9b610f260c9199e61b571dfdd0c to your computer and use it in GitHub Desktop.
Save alekrutkowski/6a5fa9b610f260c9199e61b571dfdd0c to your computer and use it in GitHub Desktop.
A simple caching system for R (persistent, disk-based)
library(magrittr)
stopifnot('digest' %in% installed.packages()[,"Package"])
if (!dir.exists('.cache.db')) dir.create('.cache.db') # in the current working directory
# Public API --------------------------------------------------------------
cachedCall <-
function(`fun*`,
..., # file paths/names in args in ... need to be wrapped in File()
`*cache_dir*`=paste0(getwd(),'/.cache.db/'),
`*load_cached*`=FALSE) {
# Can both create/return CachedResult and
# absorb in ... (as plain/unwrapped arguments) previous CachedResult(s).
# `fun*` should be a pure function that always returns the same result value given the same arguments
# (see https://en.wikipedia.org/wiki/Pure_function)
signat <- list(`fun*`, ...) %>%
lapply(extractSignat) %>%
digest::digest()
fname <- paste0(`*cache_dir*`,
signat,
'.Rds')
`if`(file.exists(fname),
`if`(`*load_cached*`,
list(signat=signat,
val=readRDS(fname)),
list(signat=signat)) %>%
addClass('CachedResult'),
list(...) %>%
lapply(extractVal) %>%
do.call(`fun*`, .) %T>%
saveRDS(fname) %>%
list(signat=signat,
val=.) %>%
addClass('CachedResult'))
}
extractVal <- function(arg,
`*cache_dir*`=paste0(getwd(),'/.cache.db/'))
# CachedResult(s) need to be wrapped in this function
# if used outside cachedCall as arguments in other functions
`if`(arg %>% inherits('CachedResult'),
`if`(arg %>% containsVal,
arg$val,
readRDS(paste0(`*cache_dir*`,
arg$signat,
'.Rds'))),
arg)
# Helpers/private ---------------------------------------------------------
extractSignat <- function(elem)
cond(elem %>% inherits('CachedResult'),
elem$signat,
elem %>% inherits('File'), # file paths/names in args in ... in cachedCall need to be wrapped in File()
digest::digest(elem, file=TRUE),
digest::digest(elem))
File <- function(path)
path %>%
addClass('File')
addClass <- function(obj, ClassName)
`class<-`(obj,
c(ClassName,
class(obj)))
containsVal <- function(CachedResult)
'val' %in% names(CachedResult)
cond <- function(...) {
e <- parent.frame()
substitute(list(...)) %>%
as.list %T>%
{if (length(.) < 4)
stop('\ncond requires at least 3 arguments!',
call.=FALSE)} %>%
tail(-1) %T>%
{if (length(.) %% 2 != 1)
stop('\ncond requires an uneven number of arguments!',
call.=FALSE)} %>%
split(((seq_along(.) + 1)/2) %>%
floor) %>%
rev %>%
{c(.[[1]], tail(., -1))} %>%
Reduce(function(x,y)
list(`if`, y[[1]], y[[2]], x) %>%
as.call, .) %>%
eval(envir=e)
}
# Demo --------------------------------------------------------------------
wd <- getwd()
tmpdir <- tempdir()
setwd(tmpdir)
if (dir.exists('.cache.db')) unlink('.cache.db', recursive=TRUE)
dir.create('.cache.db')
# Let's pretend we have 3 complicated pure functions
# each consuming some time when re-evaluated:
f1 <- function(vec, val) {
Sys.sleep(1)
vec + val
}
f2 <- function(vec, val) {
Sys.sleep(1)
mean(vec)
}
f3 <- function(val1, val2) {
Sys.sleep(1)
val1/val2
}
system.time(Res1 <- 1:100 %>%
cachedCall(f1, vec=., val=3) %>%
cachedCall(f2, .) %>%
cachedCall(f3, val1=., val2=50) %>%
extractVal)
system.time(Res2 <- 1:100 %>%
cachedCall(f1, vec=., val=3) %>%
cachedCall(f2, .) %>%
cachedCall(f3, val1=., val2=50) %>%
extractVal)
Res1 == Res2
# Just that function (f3) is re-evaluated due to a change in
# the value of one of the args (if there were further steps
# beyond f3, they would be also re-evaluated):
system.time(1:100 %>%
cachedCall(f1, vec=., val=3) %>%
cachedCall(f2, .) %>%
cachedCall(f3, val1=., val2=100) %>%
extractVal)
# Of course, a modification of a function also triggers re-evaluation
# of the modified and the subsequent (dependent) step(s):
f2 <- function(vec, val) {
Sys.sleep(1)
mean(vec)/3
}
system.time(1:100 %>%
cachedCall(f1, vec=., val=3) %>%
cachedCall(f2, .) %>%
cachedCall(f3, val1=., val2=100) %>%
extractVal)
# Paths to files need to be wrapped in File()
# when used as arguments inside cachedCall
# (so that possible changes in the contents
# of the files are assessed instead of the
# changes in the paths):
tmpf <- tempfile()
cat(letters,
file=tmpf, sep='\n')
f4 <- function(filepath) {
Sys.sleep(1)
readLines(filepath)
}
system.time(ResA <-
cachedCall(f4, File(tmpf)) %>%
extractVal)
system.time(ResB <-
cachedCall(f4, File(tmpf)) %>%
extractVal)
identical(ResA, ResB)
# Re-evaluated when the file modified:
cat(c(letters,1:10),
file=tmpf, sep='\n')
system.time(cachedCall(f4, File(tmpf)) %>%
extractVal)
setwd(wd)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment