Last active
June 15, 2016 12:45
-
-
Save alekrutkowski/6a5fa9b610f260c9199e61b571dfdd0c to your computer and use it in GitHub Desktop.
A simple caching system for R (persistent, disk-based)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(magrittr) | |
stopifnot('digest' %in% installed.packages()[,"Package"]) | |
if (!dir.exists('.cache.db')) dir.create('.cache.db') # in the current working directory | |
# Public API -------------------------------------------------------------- | |
cachedCall <- | |
function(`fun*`, | |
..., # file paths/names in args in ... need to be wrapped in File() | |
`*cache_dir*`=paste0(getwd(),'/.cache.db/'), | |
`*load_cached*`=FALSE) { | |
# Can both create/return CachedResult and | |
# absorb in ... (as plain/unwrapped arguments) previous CachedResult(s). | |
# `fun*` should be a pure function that always returns the same result value given the same arguments | |
# (see https://en.wikipedia.org/wiki/Pure_function) | |
signat <- list(`fun*`, ...) %>% | |
lapply(extractSignat) %>% | |
digest::digest() | |
fname <- paste0(`*cache_dir*`, | |
signat, | |
'.Rds') | |
`if`(file.exists(fname), | |
`if`(`*load_cached*`, | |
list(signat=signat, | |
val=readRDS(fname)), | |
list(signat=signat)) %>% | |
addClass('CachedResult'), | |
list(...) %>% | |
lapply(extractVal) %>% | |
do.call(`fun*`, .) %T>% | |
saveRDS(fname) %>% | |
list(signat=signat, | |
val=.) %>% | |
addClass('CachedResult')) | |
} | |
extractVal <- function(arg, | |
`*cache_dir*`=paste0(getwd(),'/.cache.db/')) | |
# CachedResult(s) need to be wrapped in this function | |
# if used outside cachedCall as arguments in other functions | |
`if`(arg %>% inherits('CachedResult'), | |
`if`(arg %>% containsVal, | |
arg$val, | |
readRDS(paste0(`*cache_dir*`, | |
arg$signat, | |
'.Rds'))), | |
arg) | |
# Helpers/private --------------------------------------------------------- | |
extractSignat <- function(elem) | |
cond(elem %>% inherits('CachedResult'), | |
elem$signat, | |
elem %>% inherits('File'), # file paths/names in args in ... in cachedCall need to be wrapped in File() | |
digest::digest(elem, file=TRUE), | |
digest::digest(elem)) | |
File <- function(path) | |
path %>% | |
addClass('File') | |
addClass <- function(obj, ClassName) | |
`class<-`(obj, | |
c(ClassName, | |
class(obj))) | |
containsVal <- function(CachedResult) | |
'val' %in% names(CachedResult) | |
cond <- function(...) { | |
e <- parent.frame() | |
substitute(list(...)) %>% | |
as.list %T>% | |
{if (length(.) < 4) | |
stop('\ncond requires at least 3 arguments!', | |
call.=FALSE)} %>% | |
tail(-1) %T>% | |
{if (length(.) %% 2 != 1) | |
stop('\ncond requires an uneven number of arguments!', | |
call.=FALSE)} %>% | |
split(((seq_along(.) + 1)/2) %>% | |
floor) %>% | |
rev %>% | |
{c(.[[1]], tail(., -1))} %>% | |
Reduce(function(x,y) | |
list(`if`, y[[1]], y[[2]], x) %>% | |
as.call, .) %>% | |
eval(envir=e) | |
} | |
# Demo -------------------------------------------------------------------- | |
wd <- getwd() | |
tmpdir <- tempdir() | |
setwd(tmpdir) | |
if (dir.exists('.cache.db')) unlink('.cache.db', recursive=TRUE) | |
dir.create('.cache.db') | |
# Let's pretend we have 3 complicated pure functions | |
# each consuming some time when re-evaluated: | |
f1 <- function(vec, val) { | |
Sys.sleep(1) | |
vec + val | |
} | |
f2 <- function(vec, val) { | |
Sys.sleep(1) | |
mean(vec) | |
} | |
f3 <- function(val1, val2) { | |
Sys.sleep(1) | |
val1/val2 | |
} | |
system.time(Res1 <- 1:100 %>% | |
cachedCall(f1, vec=., val=3) %>% | |
cachedCall(f2, .) %>% | |
cachedCall(f3, val1=., val2=50) %>% | |
extractVal) | |
system.time(Res2 <- 1:100 %>% | |
cachedCall(f1, vec=., val=3) %>% | |
cachedCall(f2, .) %>% | |
cachedCall(f3, val1=., val2=50) %>% | |
extractVal) | |
Res1 == Res2 | |
# Just that function (f3) is re-evaluated due to a change in | |
# the value of one of the args (if there were further steps | |
# beyond f3, they would be also re-evaluated): | |
system.time(1:100 %>% | |
cachedCall(f1, vec=., val=3) %>% | |
cachedCall(f2, .) %>% | |
cachedCall(f3, val1=., val2=100) %>% | |
extractVal) | |
# Of course, a modification of a function also triggers re-evaluation | |
# of the modified and the subsequent (dependent) step(s): | |
f2 <- function(vec, val) { | |
Sys.sleep(1) | |
mean(vec)/3 | |
} | |
system.time(1:100 %>% | |
cachedCall(f1, vec=., val=3) %>% | |
cachedCall(f2, .) %>% | |
cachedCall(f3, val1=., val2=100) %>% | |
extractVal) | |
# Paths to files need to be wrapped in File() | |
# when used as arguments inside cachedCall | |
# (so that possible changes in the contents | |
# of the files are assessed instead of the | |
# changes in the paths): | |
tmpf <- tempfile() | |
cat(letters, | |
file=tmpf, sep='\n') | |
f4 <- function(filepath) { | |
Sys.sleep(1) | |
readLines(filepath) | |
} | |
system.time(ResA <- | |
cachedCall(f4, File(tmpf)) %>% | |
extractVal) | |
system.time(ResB <- | |
cachedCall(f4, File(tmpf)) %>% | |
extractVal) | |
identical(ResA, ResB) | |
# Re-evaluated when the file modified: | |
cat(c(letters,1:10), | |
file=tmpf, sep='\n') | |
system.time(cachedCall(f4, File(tmpf)) %>% | |
extractVal) | |
setwd(wd) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment