Skip to content

Instantly share code, notes, and snippets.

@russellpierce
Created May 29, 2015 09:05
Show Gist options
  • Save russellpierce/2f69fc2ac8d37ed724d9 to your computer and use it in GitHub Desktop.
Save russellpierce/2f69fc2ac8d37ed724d9 to your computer and use it in GitHub Desktop.
Provided the right tools are installed, i.e. xz and pigz, will offload the compression handling to an external program and leave R free to do the data import. This ends up being quite a bit more efficient for large files. Some tweaks may be needed for operating systems other than Ubuntu; there may be additional dependencies on the github repo dr…
library(parallel)
# Save `object` as an xz-compressed RDS file, delegating the compression to the
# external multi-threaded `pxz` program when it is usable.
#
# Arguments:
#   object  - any R object accepted by base::saveRDS().
#   file    - path of the file to write.
#   threads - number of threads handed to `pxz -T`; defaults to the number of
#             detected cores (a missing/NA value is treated as 1).
#
# Returns (invisibly) the status reported by close() on the pipe, or NULL when
# the base-R fallback is used.
saveRDS.xz <- function(object, file, threads = parallel::detectCores()) {
  # parallel::detectCores() is documented to possibly return NA; never paste
  # "NA" into the command line.
  if (length(threads) != 1L || is.na(threads)) {
    threads <- 1L
  }

  # Only probe the version when the binary is actually on the PATH: running a
  # missing command warns on Unix and errors on Windows.
  pxzAvail <- nzchar(Sys.which("pxz")) &&
    any(grepl("(XZ Utils)", system("pxz -V", intern = TRUE)))

  if (!pxzAvail) {
    # Fall back to R's built-in (single-threaded) xz compression.
    return(invisible(base::saveRDS(object, file = file, compress = "xz")))
  }

  # shQuote() keeps paths with spaces or shell metacharacters intact.
  con <- pipe(paste0("pxz -T", threads, " > ", shQuote(file)), "wb")
  closed <- FALSE
  # Guarantee the pipe is released even if serialization fails part-way.
  on.exit(if (!closed) close(con), add = TRUE)

  base::saveRDS(object, file = con)

  closed <- TRUE
  invisible(close(con))
}
#pxz does not appear to decompress in parallel and reading appears to be compute limited in R, but offloading the decompression to a separate process provides some speed benefit
#this code should work for any regular RDS file saved using xz compression
# Read an xz-compressed RDS file, letting an external `pxz` process do the
# decompression while R deserializes the stream coming out of the pipe.
#
# Arguments:
#   file    - path of the xz-compressed RDS file.
#   threads - number of threads handed to `pxz -T`; defaults to the number of
#             detected cores (a missing/NA value is treated as 1).
#
# Returns the deserialized R object.
readRDS.xz <- function(file, threads = parallel::detectCores()) {
  # parallel::detectCores() is documented to possibly return NA.
  if (length(threads) != 1L || is.na(threads)) {
    threads <- 1L
  }

  # Without pxz on the PATH the pipe would only yield an obscure read error;
  # base R can decompress xz RDS files on its own, so use that instead.
  if (!nzchar(Sys.which("pxz"))) {
    return(base::readRDS(file))
  }

  # -d decompress, -k keep the input file, -c write to stdout.
  # shQuote() keeps paths with spaces or shell metacharacters intact.
  con <- pipe(paste0("pxz -d -k -c -T", threads, " ", shQuote(file)))
  # Release the pipe even when deserialization fails.
  on.exit(close(con), add = TRUE)

  base::readRDS(file = con)
}
# Save `object` as a gzip-compressed RDS file, delegating the compression to
# the external multi-threaded `pigz` program when it is installed.
#
# Arguments:
#   object            - any R object accepted by base::saveRDS().
#   file              - path of the file to write.
#   threads           - number of threads handed to `pigz -p`; defaults to the
#                       number of detected cores (missing/NA is treated as 1).
#   compression_level - compression level passed to pigz (default 6).
#
# Returns (invisibly) whatever close() reports for the connection used.
saveRDS.gz <- function(object, file, threads = parallel::detectCores(),
                       compression_level = 6) {
  # parallel::detectCores() is documented to possibly return NA.
  if (length(threads) != 1L || is.na(threads)) {
    threads <- 1L
  }

  if (nzchar(Sys.which("pigz"))) {
    # "-c<level>" is the combination of -c (write to stdout) and -<level>.
    # shQuote() keeps paths with spaces or shell metacharacters intact.
    con <- pipe(
      paste0("pigz -c", compression_level, " -p", threads, " > ", shQuote(file)),
      "wb"
    )
  } else {
    # pigz is not installed: use R's own gzip connection instead of failing.
    # gzfile() only accepts levels up to 9, so clamp larger pigz-only levels.
    con <- gzfile(file, "wb", compression = min(as.integer(compression_level), 9L))
  }

  closed <- FALSE
  # Guarantee the connection is released even if serialization fails.
  on.exit(if (!closed) close(con), add = TRUE)

  base::saveRDS(object, file = con)

  closed <- TRUE
  invisible(close(con))
}
#pigz does not decompress in parallel and reading appears to be compute limited in R, but offloading the decompression to a separate process provides some speed benefit
#this code should work for any regular RDS file saved using gzip compression
# Read a gzip-compressed RDS file, letting an external `pigz` process do the
# decompression while R deserializes the stream coming out of the pipe.
#
# Arguments:
#   file    - path of the gzip-compressed RDS file.
#   threads - number of threads handed to `pigz -p`; defaults to the number of
#             detected cores (a missing/NA value is treated as 1).
#
# Returns the deserialized R object.
readRDS.gz <- function(file, threads = parallel::detectCores()) {
  # parallel::detectCores() is documented to possibly return NA.
  if (length(threads) != 1L || is.na(threads)) {
    threads <- 1L
  }

  # Without pigz on the PATH the pipe would only yield an obscure read error;
  # base R can decompress gzip RDS files on its own, so use that instead.
  if (!nzchar(Sys.which("pigz"))) {
    return(base::readRDS(file))
  }

  # -d decompress, -c write to stdout.
  # shQuote() keeps paths with spaces or shell metacharacters intact.
  con <- pipe(paste0("pigz -d -c -p", threads, " ", shQuote(file)))
  # Release the pipe even when deserialization fails.
  on.exit(close(con), add = TRUE)

  base::readRDS(file = con)
}
# Read an RDS file, picking the parallel-friendly reader that matches the
# file's compression format.
#
# The format is detected from the file's leading magic bytes rather than from
# the output of the external `file` utility: that output echoes the file name,
# so a path containing "gzip" or "XZ" could be misclassified (or match both
# patterns and break the `if`), and the utility is not available everywhere.
#
# Arguments:
#   file    - path of the RDS file to read.
#   threads - number of threads forwarded to readRDS.gz() / readRDS.xz().
#
# Returns the deserialized R object. Stops with "<file> does not exist!" when
# the path is missing.
readRDS.p <- function(file, threads = parallel::detectCores()) {
  if (!file.exists(file)) {
    stop(paste0(file, " does not exist!"))
  }

  # Read just enough bytes to recognise either signature, and close the
  # connection again before any reader opens the file.
  con <- file(file, "rb")
  magic <- tryCatch(
    readBin(con, what = "raw", n = 6L),
    finally = close(con)
  )

  gzip_magic <- as.raw(c(0x1f, 0x8b))
  xz_magic <- as.raw(c(0xfd, 0x37, 0x7a, 0x58, 0x5a, 0x00))

  # head() copes with files shorter than the signature being tested.
  if (identical(head(magic, length(gzip_magic)), gzip_magic)) {
    readRDS.gz(file, threads = threads)
  } else if (identical(head(magic, length(xz_magic)), xz_magic)) {
    readRDS.xz(file, threads = threads)
  } else {
    # Uncompressed or another format (e.g. bzip2): let base R sort it out.
    base::readRDS(file)
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment