Last active
April 12, 2017 21:08
-
-
Save fstpackage/e710cf2adaa02f96bc8dbee9318ba733 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
############################################################################################ | |
# Benchmark fst against saveRDS, fread/fwrite, and feather # | |
############################################################################################ | |
require(fst) # v0.7.2 | |
require(data.table) # v1.10.0 | |
require(feather) # v0.3.1 | |
require(ggplot2) | |
require(microbenchmark) | |
# Helper function for creating a random data frame
#
# nrOfRows: number of rows to generate (0 yields an empty data frame).
#
# Returns a data.frame with four columns of different types:
#   Integers   - the row index 1..nrOfRows (integer)
#   Logicals   - random draws from TRUE / FALSE / NA (logical)
#   Text       - random US state names stored as a factor
#   Numericals - uniform random doubles between 0 and 100
SampleSet <- function(nrOfRows)
{
  data.frame(
    # seq_len() instead of 1:nrOfRows, which would give c(1, 0) for zero rows
    Integers = seq_len(nrOfRows),
    Logicals = sample(c(TRUE, FALSE, NA), nrOfRows, replace = TRUE),
    Text = factor(sample(state.name, nrOfRows, replace = TRUE)),
    Numericals = runif(nrOfRows, 0.0, 100),
    stringsAsFactors = FALSE)
}
# Generate a random data frame with 10 million rows | |
x <- SampleSet(1e7) | |
# Add a single observation to benchmark
#
# bench:       data.table with the observations collected so far, or NULL to
#              start a new table (rbindlist skips NULL items).
# package:     label of the benchmarked package / file format.
# compression: compression setting used for the measurement.
# size:        size of the file on disk.
# time:        measured timing(s); one row is added per element.
#
# Prints a "." as progress indicator and returns the extended data.table.
Observation <- function(bench, package, compression, size, time)
{
  cat(".")

  # Use the 'time' argument; the previous version read the global 'res$time'
  # and therefore only worked when the caller happened to define 'res'.
  rbindlist(list(bench, data.table(Package = package, Compression = compression,
    Size = size, Time = time)))
}
############################################################################################
# Benchmark loop: saveRDS / readRDS, fwrite / fread, fst and feather                       #
############################################################################################

iterations <- 10  # number of iterations for each benchmark

# Run until manually stopped. Every pass measures all packages once and appends
# the results of that pass to 'speeds.fst'.
while (TRUE)
{
  # Start each pass with empty result tables. They used to be initialised only
  # once before the loop, so every pass re-appended all earlier observations
  # to 'speeds.fst' and the stored results contained duplicates.
  benchRead <- NULL
  benchWrite <- NULL

  ##########################################################################################
  # Benchmark saveRDS and readRDS                                                          #
  ##########################################################################################

  # Small write before the measurements (presumably to wake up the disk)
  saveRDS("warmup disk", "warmup.rds")

  # Uncompressed write with saveRDS
  for (id in seq_len(iterations))
  {
    rdsName <- paste0("dataset", id, "_", "0.rds")

    # Only a single iteration is used to avoid disk caching effects
    # Due to caching measured speeds are higher and create an unrealistic benchmark
    res <- microbenchmark(
      {
        saveRDS(x, rdsName, compress = FALSE)
      }, times = 1)

    benchWrite <- Observation(benchWrite, "rds", 0, file.info(rdsName)$size, res$time)
  }

  # The compressed files share the id of the last uncompressed file, so that
  # "dataset<id>_0.rds" ... "dataset<id>_9.rds" form one complete series.
  # (The original relied on the loop variable keeping its final value.)
  id <- iterations

  # Compressed write with saveRDS, gzip levels 1 to 9 mapped onto a 0-100 scale
  for (compression in 1:9)
  {
    rdsName <- paste0("dataset", id, "_", compression, ".rds")

    res <- microbenchmark(
      {
        fFile <- gzfile(rdsName, "wb", compression = compression)
        saveRDS(x, fFile)
        close(fFile)
      }, times = 1)

    benchWrite <- Observation(benchWrite, "rds", 100 * compression / 9,
      file.info(rdsName)$size, res$time)
  }

  # Read the series of rds files (level 0 is the uncompressed file of the last id)
  for (compression in 0:9)
  {
    rdsName <- paste0("dataset", id, "_", compression, ".rds")

    res <- microbenchmark(
      {
        readRDS(rdsName)
      }, times = 1)

    benchRead <- Observation(benchRead, "rds", 100 * compression / 9,
      file.info(rdsName)$size, res$time)
  }

  # Read all uncompressed rds files
  for (id in seq_len(iterations))
  {
    rdsName <- paste0("dataset", id, "_", 0, ".rds")

    res <- microbenchmark(
      {
        readRDS(rdsName)
      }, times = 1)

    benchRead <- Observation(benchRead, "rds", 0, file.info(rdsName)$size, res$time)
  }

  ##########################################################################################
  # Benchmark data.table's fread and fwrite                                                #
  ##########################################################################################

  saveRDS("warmup disk", "warmup.rds")

  for (id in seq_len(iterations))
  {
    # Write x to csv file
    csvName <- paste0("dataset", id, "_", 0, ".csv")
    saveRDS("warmup disk", "warmup.rds")

    # Write with ';' as separator so the file matches the fread() call below;
    # fwrite's default ',' separator made the read benchmark parse the file
    # with the wrong separator.
    res <- microbenchmark(
      {
        fwrite(x, csvName, sep = ";", row.names = FALSE)
      }, times = 1)

    benchWrite <- Observation(benchWrite, "csv", 0, file.info(csvName)$size, res$time)
  }

  for (id in seq_len(iterations))
  {
    # Read x from csv file
    csvName <- paste0("dataset", id, "_", 0, ".csv")

    res <- microbenchmark(
      {
        fread(csvName, header = TRUE, sep = ";", na.strings = "NA")
      }, times = 1)

    benchRead <- Observation(benchRead, "csv", 0, file.info(csvName)$size, res$time)
  }

  ##########################################################################################
  # Benchmark fst                                                                          #
  ##########################################################################################

  saveRDS("warmup disk", "warmup.rds")

  for (id in seq_len(iterations))
  {
    # Write x to fst file (no compression argument, recorded as compression 0)
    fstName <- paste0("dataset", id, ".fst")

    res <- microbenchmark(
      {
        write.fst(x, fstName)
      }, times = 1)

    benchWrite <- Observation(benchWrite, "fst", 0, file.info(fstName)$size, res$time)
  }

  for (id in seq_len(iterations))
  {
    # Read x from fst file
    fstName <- paste0("dataset", id, ".fst")

    res <- microbenchmark(
      {
        read.fst(fstName)
      }, times = 1)

    benchRead <- Observation(benchRead, "fst", 0, file.info(fstName)$size, res$time)
  }

  # Write benchmark for compression settings 100, 98, ..., 0.
  # NOTE: these files use the same "dataset<number>.fst" naming as the id-based
  # files above, so some of those are overwritten here; they have already been
  # read at this point.
  saveRDS("warmup disk", "warmup.rds")

  for (compression in 2 * (50:0))
  {
    fstName <- paste0("dataset", compression, ".fst")

    res <- microbenchmark(
      write.fst(x, fstName, compress = compression)
      , times = 1)

    benchWrite <- Observation(benchWrite, "fst", compression, file.info(fstName)$size, res$time)
  }

  # Read benchmark for the same compression settings
  for (compression in 2 * (50:0))
  {
    fstName <- paste0("dataset", compression, ".fst")

    res <- microbenchmark(
      read.fst(fstName)
      , times = 1)

    benchRead <- Observation(benchRead, "fst", compression, file.info(fstName)$size, res$time)
  }

  ##########################################################################################
  # Benchmark feather                                                                      #
  ##########################################################################################

  saveRDS("warmup disk", "warmup.rds")

  for (id in seq_len(iterations))
  {
    # Write x to feather file
    featherName <- paste0("dataset", id, "_", 0, ".fea")

    res <- microbenchmark(
      {
        write_feather(x, featherName)
      }, times = 1)

    benchWrite <- Observation(benchWrite, "feather", 0, file.info(featherName)$size, res$time)
  }

  for (id in seq_len(iterations))
  {
    # Read x from feather file
    featherName <- paste0("dataset", id, "_", 0, ".fea")

    res <- microbenchmark(
      {
        read_feather(featherName)
      }, times = 1)

    benchRead <- Observation(benchRead, "feather", 0, file.info(featherName)$size, res$time)
  }

  ##########################################################################################
  # Convert the timings of this pass to speeds and store them                              #
  ##########################################################################################

  # Size of the first (uncompressed) rds file is used as reference size of the data
  memSize <- benchWrite[Package == "rds", Size[1]]

  # Speed = 1e3 * bytes / nanoseconds, i.e. megabytes (1e6 bytes) per second
  speeds <- rbindlist(list(
    benchRead[, list(Package, Compression, Size, Mode = "Read", Speed = 1e3 * memSize / Time)],
    benchWrite[, list(Package, Compression, Size, Mode = "Write", Speed = 1e3 * memSize / Time)]))

  if (file.exists("speeds.fst"))
  {
    # Add to existing benchmark results
    oldSpeeds <- read.fst("speeds.fst")
    speeds <- rbindlist(list(oldSpeeds, speeds))
  }

  write.fst(speeds, "speeds.fst")
}
############################################################################################
# fstpackage.org benchmark figures                                                         #
############################################################################################

# This section only needs the stored results in 'speeds.fst'. The benchmark loop
# above runs until it is stopped manually, so the figures are created afterwards.
library(fst)
library(data.table)
library(ggplot2)

speeds <- read.fst("speeds.fst", as.data.table = TRUE)
speeds <- copy(speeds)  # avoid data.table warning message 'Invalid selfref'

# Replace the short package labels used during benchmarking by display labels
speeds[Package == "csv", Package := "write.csv2 / fread"]
speeds[Package == "rds", Package := "saveRDS / readRDS"]

# Reference size: the uncompressed rds file. This lookup has to come after the
# relabelling above; before it no row carries the label "saveRDS / readRDS",
# which made memSize (and with it the whole Factor column) NA.
memSize <- speeds[Package == "saveRDS / readRDS" & Compression == 0, Size[1]]

# File size relative to the reference size
speeds[, Factor := Size / memSize]

# Violin plot of the read and write speeds without compression, one panel per package
ggplot(speeds[Compression == 0]) +
  geom_violin(aes(Mode, Speed, colour = Mode)) +
  geom_jitter(aes(Mode, Speed, colour = Mode), size = 1.5, width = 0.1) +
  facet_wrap(~ Package, nrow = 1) +
  theme(legend.justification = c(0, 0), legend.position = c(0.8, 0.7))

# Compression plot: file size as percentage of the reference size per compression setting
ggplot(speeds) +
  geom_line(aes(Compression, 100 * Factor, colour = Package)) +
  geom_point(aes(Compression, 100 * Factor, colour = Package, shape = Package), size = 2) +
  ylab("Percentage of in-memory size")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment