Skip to content

Instantly share code, notes, and snippets.

@fstpackage
Last active April 12, 2017 21:08
Show Gist options
  • Save fstpackage/e710cf2adaa02f96bc8dbee9318ba733 to your computer and use it in GitHub Desktop.
Save fstpackage/e710cf2adaa02f96bc8dbee9318ba733 to your computer and use it in GitHub Desktop.
############################################################################################
# Benchmark fst against saveRDS, fread/fwrite, and feather #
############################################################################################
require(fst) # v0.7.2
require(data.table) # v1.10.0
require(feather) # v0.3.1
require(ggplot2)
require(microbenchmark)
# Helper function for creating a random data frame.
#
# nrOfRows: number of rows to generate (0 yields an empty data frame).
#
# Returns a data.frame with four columns:
#   Integers   - the sequence 1..nrOfRows
#   Logicals   - random draws from TRUE, FALSE and NA
#   Text       - random US state names, stored as a factor
#   Numericals - uniform random doubles between 0 and 100
SampleSet <- function(nrOfRows)
{
  data.frame(
    # seq_len() instead of 1:nrOfRows, because 1:0 would produce c(1, 0)
    # and make the column lengths disagree for an empty request
    Integers = seq_len(nrOfRows),
    Logicals = sample(c(TRUE, FALSE, NA), nrOfRows, replace = TRUE),
    # Explicitly a factor; stringsAsFactors below does not affect it
    Text = factor(sample(state.name, nrOfRows, replace = TRUE)),
    Numericals = runif(nrOfRows, 0.0, 100),
    stringsAsFactors = FALSE)
}
# Generate a random data frame with 10 million rows; this single data set is
# the object written by every write benchmark below
x <- SampleSet(1e7)
# Add a single observation to a benchmark table.
#
# bench:       table of earlier observations, or NULL when there are none yet
# package:     label of the package being measured (e.g. "rds", "fst")
# compression: compression setting used for this measurement
# size:        size of the file on disk
# time:        measured time, as reported by microbenchmark
#
# Returns bench with one extra row (columns Package, Compression, Size, Time).
# A dot is printed for every observation as a progress indicator.
Observation <- function(bench, package, compression, size, time)
{
  cat(".")
  # Use the 'time' argument; reading the caller's global 'res' instead would
  # silently record the wrong timing whenever no such global is in sync
  newRow <- data.table(Package = package, Compression = compression,
    Size = size, Time = time)
  rbindlist(list(bench, newRow))
}
############################################################################################
# Benchmark saveRDS and readRDS #
############################################################################################
# Accumulators for the read and write observations; both start out as NULL
# and are extended through calls to Observation()
benchRead <- NULL
benchWrite <- NULL
iterations <- 10 # number of iterations for each benchmark
# Run until manually stopped
while (TRUE)
{
# Tiny write to wake up the disk before the first timed operation
saveRDS("warmup disk", "warmup.rds")

# Uncompressed write with saveRDS, one new file per iteration.
# Each file is timed with a single evaluation to avoid disk caching effects:
# with caching, measured speeds are higher and the benchmark is unrealistic
for (id in seq_len(iterations))
{
  rdsName <- paste0("dataset", id, "_0.rds")
  res <- microbenchmark(saveRDS(x, rdsName, compress = FALSE), times = 1)
  benchWrite <- Observation(benchWrite, "rds", 0, file.info(rdsName)$size, res$time)
}

# The compressed files are named dataset<iterations>_<level>.rds. The prefix
# is built explicitly rather than from the loop variable 'id' left behind by
# the loop above
rdsPrefix <- paste0("dataset", iterations, "_")

# Compressed write with saveRDS through a gzfile connection, levels 1 to 9
for (compression in 1:9)
{
  rdsName <- paste0(rdsPrefix, compression, ".rds")
  res <- microbenchmark(
  {
    fFile <- gzfile(rdsName, "wb", compression = compression)
    saveRDS(x, fFile)
    close(fFile)
  }, times = 1)
  benchWrite <- Observation(benchWrite, "rds", 100 * compression / 9, file.info(rdsName)$size, res$time)
}

# Read the compressed rds files. Level 0 is deliberately not included here:
# the uncompressed files are all read in the loop below, and reading
# dataset<iterations>_0.rds in both loops would record that file twice
for (compression in 1:9)
{
  rdsName <- paste0(rdsPrefix, compression, ".rds")
  res <- microbenchmark(readRDS(rdsName), times = 1)
  benchRead <- Observation(benchRead, "rds", 100 * compression / 9, file.info(rdsName)$size, res$time)
}

# Read the uncompressed rds files, one per iteration
for (id in seq_len(iterations))
{
  rdsName <- paste0("dataset", id, "_0.rds")
  res <- microbenchmark(readRDS(rdsName), times = 1)
  benchRead <- Observation(benchRead, "rds", 0, file.info(rdsName)$size, res$time)
}
############################################################################################
# Benchmark data.table's fread and fwrite #
############################################################################################
# Tiny write to wake up the disk before the first timed operation
saveRDS("warmup disk", "warmup.rds")

# Write x to csv files with fwrite, one new file per iteration
for (id in seq_len(iterations))
{
  csvName <- paste0("dataset", id, "_0.csv")
  # Repeat the small warmup write right before each timed csv write
  saveRDS("warmup disk", "warmup.rds")
  res <- microbenchmark(fwrite(x, csvName, row.names = FALSE), times = 1)
  benchWrite <- Observation(benchWrite, "csv", 0, file.info(csvName)$size, res$time)
}

# Read the csv files back with fread
for (id in seq_len(iterations))
{
  csvName <- paste0("dataset", id, "_0.csv")
  # The files were written with fwrite's default "," separator, so fread must
  # be given the same separator; with sep = ";" the columns are not split and
  # the timing does not reflect a real parse of the data set
  res <- microbenchmark(
    fread(csvName, header = TRUE, sep = ",", na.strings = "NA"),
    times = 1)
  benchRead <- Observation(benchRead, "csv", 0, file.info(csvName)$size, res$time)
}
############################################################################################
# Benchmark fst #
############################################################################################
# Tiny write to wake up the disk before the first timed operation
saveRDS("warmup disk", "warmup.rds")

# Write x to fst files without a compression argument, one file per iteration
for (id in 1:iterations)
{
  fstName <- paste0("dataset", id, ".fst")
  res <- microbenchmark(write.fst(x, fstName), times = 1)
  benchWrite <- Observation(benchWrite, "fst", 0, file.info(fstName)$size, res$time)
}

# Read those fst files back
for (id in 1:iterations)
{
  fstName <- paste0("dataset", id, ".fst")
  res <- microbenchmark(read.fst(fstName), times = 1)
  benchRead <- Observation(benchRead, "fst", 0, file.info(fstName)$size, res$time)
}

# Write x to fst files for the even compression settings 100, 98, ..., 0
saveRDS("warmup disk", "warmup.rds")
for (compression in seq(100, 0, by = -2))
{
  fstName <- paste0("dataset", compression, ".fst")
  res <- microbenchmark(write.fst(x, fstName, compress = compression), times = 1)
  benchWrite <- Observation(benchWrite, "fst", compression, file.info(fstName)$size, res$time)
}

# Read the compressed fst files back, in the same order as they were written
for (compression in seq(100, 0, by = -2))
{
  fstName <- paste0("dataset", compression, ".fst")
  res <- microbenchmark(read.fst(fstName), times = 1)
  benchRead <- Observation(benchRead, "fst", compression, file.info(fstName)$size, res$time)
}
############################################################################################
# Benchmark feather #
############################################################################################
# Tiny write to wake up the disk before the first timed operation
saveRDS("warmup disk", "warmup.rds")

# Write x to feather files, one new file per iteration
for (id in 1:iterations)
{
  featherName <- paste0("dataset", id, "_0.fea")
  res <- microbenchmark(write_feather(x, featherName), times = 1)
  benchWrite <- Observation(benchWrite, "feather", 0, file.info(featherName)$size, res$time)
}

# Read the feather files back
for (id in 1:iterations)
{
  featherName <- paste0("dataset", id, "_0.fea")
  res <- microbenchmark(read_feather(featherName), times = 1)
  benchRead <- Observation(benchRead, "feather", 0, file.info(featherName)$size, res$time)
}
# Convert the timings of this pass into speeds.
# memSize is the size of the first recorded "rds" write observation and is
# used as the reference data size for every package.
# NOTE(review): assumes Time is in nanoseconds (microbenchmark's unit) and
# Size in bytes, which makes Speed megabytes per second - confirm
memSize <- benchWrite[Package == "rds", Size[1]]
speeds <- rbindlist(list(
  benchRead[, list(Package, Compression, Size, Mode = "Read", Speed = 1e3 * memSize / Time)],
  benchWrite[, list(Package, Compression, Size, Mode = "Write", Speed = 1e3 * memSize / Time)]))

# Append to the results of earlier runs, if there are any
if (file.exists("speeds.fst"))
{
  speeds <- rbindlist(list(read.fst("speeds.fst"), speeds))
}
write.fst(speeds, "speeds.fst")

# The observations of this pass are now stored on disk. Clear the
# accumulators so that the next pass does not append them to speeds.fst again
benchRead <- NULL
benchWrite <- NULL
}
############################################################################################
# fstpackage.org benchmark figures #
############################################################################################
require(fst)
require(data.table)
require(ggplot2)
# Load all stored benchmark results
speeds <- read.fst("speeds.fst", as.data.table = TRUE)
speeds <- copy(speeds) # avoid data.table warning message 'Invalid selfref'

# Replace the short package codes with display labels. This has to happen
# before memSize is looked up, because that lookup filters on the label.
# The csv label names fwrite, which is the writer the benchmark actually uses
speeds[Package == "csv", Package := "fwrite / fread"]
speeds[Package == "rds", Package := "saveRDS / readRDS"]

# Reference size: the first uncompressed rds observation. Factor expresses
# every file size relative to that reference
memSize <- speeds[Package == "saveRDS / readRDS" & Compression == 0, Size[1]]
speeds[, Factor := Size / memSize]

# Violin plot of read and write speeds for the uncompressed observations,
# one panel per package on a single row
ggplot(speeds[Compression == 0]) +
  geom_violin(aes(Mode, Speed, colour = Mode)) +
  geom_jitter(aes(Mode, Speed, colour = Mode), size = 1.5, width = 0.1) +
  facet_wrap(~ Package, nrow = 1) +
  theme(legend.justification = c(0, 0), legend.position = c(0.8, 0.7))

# Compression plot: file size as a percentage of the reference size,
# against the compression setting
ggplot(speeds) +
  geom_line(aes(Compression, 100 * Factor, colour = Package)) +
  geom_point(aes(Compression, 100 * Factor, colour = Package, shape = Package), size = 2) +
  ylab("Percentage of in-memory size")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment