Last active
March 24, 2018 03:58
-
-
Save fstpackage/83bdf10ae33018d8ea2c02d08fcedfc8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
############################################################################################ | |
# Benchmark fst against saveRDS, fread/fwrite, and feather # | |
############################################################################################ | |
# get data.table dev version from GitHub (multithreaded): | |
# install.packages("data.table", type = "source", repos = "http://Rdatatable.github.io/data.table") | |
require(fst) # v0.7.2 | |
require(data.table) # dev version v1.10.5 (2017/4/14) | |
require(feather) # v0.3.1 | |
require(ggplot2) | |
require(microbenchmark) | |
# Helper function for creating a random data frame | |
SampleSet <- function(nrOfRows) | |
{ | |
data.frame( | |
Integers = 1:nrOfRows, # integer | |
Logicals = sample(c(TRUE, FALSE, NA), nrOfRows, replace = TRUE), # logical | |
Factor = factor(sample(state.name, nrOfRows, replace = TRUE)), # text | |
Text = sample(state.name, nrOfRows, replace = TRUE), # text | |
Numericals = runif(nrOfRows, 0.0, 100), # numericals | |
stringsAsFactors = FALSE) | |
} | |
# Generate a random data frame with 10 million rows | |
x <- SampleSet(1e7) | |
# Add a single observation to benchmark | |
Observation <- function(bench, package, compression, mode, size, time, id) | |
{ | |
cat(".") | |
rbindlist(list(bench, data.table(Package = package, Compression = compression, Mode= mode, ID = id, | |
Size = size, Time = time))) | |
} | |
############################################################################################ | |
# Benchmark specific package # | |
############################################################################################ | |
# Benchmark a single read/write method combination | |
PackageBench <- function(bench, x, iterations, measureID, extension, writeMethod, readMethod) | |
{ | |
saveRDS("warmup disk", "warmup.rds") | |
for (id in 1:iterations) | |
{ | |
rdsName <- paste0("dataset", id, "_", measureID, ".", extension) | |
# Only a single iteration is used to avoid disk caching effects | |
# Due to caching measured speeds are higher and create a unrealistic benchmark | |
res = microbenchmark( | |
{ | |
writeMethod(x, rdsName) | |
}, times = 1) | |
bench <- Observation(bench, extension, 0, "write", file.info(rdsName)$size, res$time, measureID) | |
} | |
for (id in 1:iterations) | |
{ | |
rdsName <- paste0("dataset", id, "_", measureID, ".", extension) | |
res = microbenchmark( | |
{ | |
readMethod(rdsName) | |
}, times = 1) | |
bench <- Observation(bench, extension, 0, "read", file.info(rdsName)$size, res$time, measureID) | |
} | |
bench | |
} | |
# We need special arguments for these methods | |
rdsWrite <- function(x, fileName){ saveRDS(x, fileName, compress = FALSE) } | |
csvRead <- function(x){ fread(x, sep = ";") } | |
# Just run until manually stopped | |
while (TRUE) | |
{ | |
bench <- NULL | |
for (colName in colNames(x)) | |
{ | |
cat("\n", colName, ": ", sep = "") | |
bench <- PackageBench(bench, x[colName], 5, colName, "rds", rdsWrite, readRDS) | |
bench <- PackageBench(bench, x[colName], 5, colName, "csv", fwrite, csvRead) | |
bench <- PackageBench(bench, x[colName], 5, colName, "fst", write.fst, read.fst) | |
bench <- PackageBench(bench, x[colName], 5, colName, "fea", write_feather, read_feather) | |
} | |
# Cache results fst file | |
if (!file.exists("speeds.fst")) | |
{ | |
write.fst(bench, "speeds.fst") | |
} else | |
{ | |
# Add to existing benchmark results | |
oldSpeeds <- read.fst("speeds.fst") | |
bench <- rbindlist(list(oldSpeeds, bench)) | |
write.fst(bench, "speeds.fst") | |
} | |
} | |
############################################################################################ | |
# Benchmark figures # | |
############################################################################################ | |
require(fst) | |
require(data.table) | |
require(ggplot2) | |
speeds <- copy(read.fst("speeds.fst", as.data.table = TRUE)) | |
speeds[, MemSize := Size[which(Package == "rds")[1]] , by = "ID"] | |
speeds[, Factor := Size / MemSize] | |
speeds[, Speed := 1e3 * MemSize / Time] | |
# csv files with logicals excluded, mal-formed csv files if NA's are used | |
# Reproducible example: | |
# fwrite(data.frame(A = c(TRUE, NA, FALSE)), "test.csv") | |
# fread("test.csv") | |
speeds <- speeds[!(Package == "csv" & ID == "Logicals"), ] | |
# Violin plot | |
ggplot(speeds) + | |
geom_violin(aes(Mode, Speed, colour = Mode)) + | |
geom_jitter(aes(Mode, Speed, colour = Mode), size = 1.5, width = 0.1) + | |
facet_grid(ID ~ Package, scales = "free_y") + | |
theme(legend.justification=c(0, 0), legend.position=c(0.85, 0.05)) | |
avg_speed <- speeds[, .(MemSize = MemSize[1], Time = median(Time)), by = "Package,Mode,ID"] | |
avg_speed[, Speed := 1e3 * MemSize / Time] | |
# bar chart with speeds | |
ggplot(avg_speed) + | |
geom_bar(aes(Package, weight = Speed, fill = Package), position = "dodge") + | |
facet_grid(Mode~ID) + | |
theme(legend.justification=c(0, 0), legend.position=c(0.88, 0.25)) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment