Skip to content

Instantly share code, notes, and snippets.

@fstpackage
Last active March 24, 2018 03:58
Show Gist options
  • Save fstpackage/83bdf10ae33018d8ea2c02d08fcedfc8 to your computer and use it in GitHub Desktop.
Save fstpackage/83bdf10ae33018d8ea2c02d08fcedfc8 to your computer and use it in GitHub Desktop.
############################################################################################
# Benchmark fst against saveRDS, fread/fwrite, and feather #
############################################################################################
# get data.table dev version from GitHub (multithreaded):
# install.packages("data.table", type = "source", repos = "http://Rdatatable.github.io/data.table")
require(fst) # v0.7.2
require(data.table) # dev version v1.10.5 (2017/4/14)
require(feather) # v0.3.1
require(ggplot2)
require(microbenchmark)
# Helper function for creating a random data frame
SampleSet <- function(nrOfRows)
{
data.frame(
Integers = 1:nrOfRows, # integer
Logicals = sample(c(TRUE, FALSE, NA), nrOfRows, replace = TRUE), # logical
Factor = factor(sample(state.name, nrOfRows, replace = TRUE)), # text
Text = sample(state.name, nrOfRows, replace = TRUE), # text
Numericals = runif(nrOfRows, 0.0, 100), # numericals
stringsAsFactors = FALSE)
}
# Generate a random data frame with 10 million rows
x <- SampleSet(1e7)
# Add a single observation to benchmark
Observation <- function(bench, package, compression, mode, size, time, id)
{
cat(".")
rbindlist(list(bench, data.table(Package = package, Compression = compression, Mode= mode, ID = id,
Size = size, Time = time)))
}
############################################################################################
# Benchmark specific package #
############################################################################################
# Benchmark a single read/write method combination
PackageBench <- function(bench, x, iterations, measureID, extension, writeMethod, readMethod)
{
saveRDS("warmup disk", "warmup.rds")
for (id in 1:iterations)
{
rdsName <- paste0("dataset", id, "_", measureID, ".", extension)
# Only a single iteration is used to avoid disk caching effects
# Due to caching measured speeds are higher and create a unrealistic benchmark
res = microbenchmark(
{
writeMethod(x, rdsName)
}, times = 1)
bench <- Observation(bench, extension, 0, "write", file.info(rdsName)$size, res$time, measureID)
}
for (id in 1:iterations)
{
rdsName <- paste0("dataset", id, "_", measureID, ".", extension)
res = microbenchmark(
{
readMethod(rdsName)
}, times = 1)
bench <- Observation(bench, extension, 0, "read", file.info(rdsName)$size, res$time, measureID)
}
bench
}
# We need special arguments for these methods
rdsWrite <- function(x, fileName){ saveRDS(x, fileName, compress = FALSE) }
csvRead <- function(x){ fread(x, sep = ";") }
# Just run until manually stopped
while (TRUE)
{
bench <- NULL
for (colName in colNames(x))
{
cat("\n", colName, ": ", sep = "")
bench <- PackageBench(bench, x[colName], 5, colName, "rds", rdsWrite, readRDS)
bench <- PackageBench(bench, x[colName], 5, colName, "csv", fwrite, csvRead)
bench <- PackageBench(bench, x[colName], 5, colName, "fst", write.fst, read.fst)
bench <- PackageBench(bench, x[colName], 5, colName, "fea", write_feather, read_feather)
}
# Cache results fst file
if (!file.exists("speeds.fst"))
{
write.fst(bench, "speeds.fst")
} else
{
# Add to existing benchmark results
oldSpeeds <- read.fst("speeds.fst")
bench <- rbindlist(list(oldSpeeds, bench))
write.fst(bench, "speeds.fst")
}
}
############################################################################################
# Benchmark figures #
############################################################################################
require(fst)
require(data.table)
require(ggplot2)
speeds <- copy(read.fst("speeds.fst", as.data.table = TRUE))
speeds[, MemSize := Size[which(Package == "rds")[1]] , by = "ID"]
speeds[, Factor := Size / MemSize]
speeds[, Speed := 1e3 * MemSize / Time]
# csv files with logicals excluded, mal-formed csv files if NA's are used
# Reproducible example:
# fwrite(data.frame(A = c(TRUE, NA, FALSE)), "test.csv")
# fread("test.csv")
speeds <- speeds[!(Package == "csv" & ID == "Logicals"), ]
# Violin plot
ggplot(speeds) +
geom_violin(aes(Mode, Speed, colour = Mode)) +
geom_jitter(aes(Mode, Speed, colour = Mode), size = 1.5, width = 0.1) +
facet_grid(ID ~ Package, scales = "free_y") +
theme(legend.justification=c(0, 0), legend.position=c(0.85, 0.05))
avg_speed <- speeds[, .(MemSize = MemSize[1], Time = median(Time)), by = "Package,Mode,ID"]
avg_speed[, Speed := 1e3 * MemSize / Time]
# bar chart with speeds
ggplot(avg_speed) +
geom_bar(aes(Package, weight = Speed, fill = Package), position = "dodge") +
facet_grid(Mode~ID) +
theme(legend.justification=c(0, 0), legend.position=c(0.88, 0.25))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment