Skip to content

Instantly share code, notes, and snippets.

@oseiskar
Last active September 18, 2019 20:29
Show Gist options
  • Save oseiskar/c8117c5eed1c4522f7e0 to your computer and use it in GitHub Desktop.
Save oseiskar/c8117c5eed1c4522f7e0 to your computer and use it in GitHub Desktop.
library(data.table)
library(microbenchmark)
#' Benchmark data.table::fwrite against write.csv on a generated table.
#'
#' Builds a data.table with `nrows` rows and columns `col1` .. `col<ncols>`,
#' writes it to a temporary file with each writer three times, and compares
#' the mean wall-clock times measured with `get_nanotime()`.
#'
#' @param nrows Number of rows in the generated table.
#' @param ncols Number of generated columns.
#' @param quote Passed as the `quote` argument to both `fwrite` and
#'   `write.csv`.
#' @param col.type One of 'character', 'double', 'integer', 'factor', or
#'   'mixed'. 'mixed' cycles through the four generators column by column.
#'   A length-one factor is accepted and matched by its label.
#' @return A list with `speedup` (mean `write.csv` time divided by mean
#'   `fwrite` time) and `fwrite_time` (mean `fwrite` time in seconds).
performanceTest <- function (nrows=100000, ncols=400, quote=TRUE, col.type='character') {
  col_generators <- list(
    character = function(x) paste0('char', x),
    double = function(x) as.double(x / 0.9),
    integer = function(x) x,
    factor = function(x) as.factor(paste0('factor', x %% 1000))
  )

  # A factor used as a list index selects by integer code, not by label,
  # so normalise to a plain string before looking anything up.
  col.type <- as.character(col.type)
  valid_types <- c(names(col_generators), 'mixed')
  if (length(col.type) != 1L || !(col.type %in% valid_types)) {
    stop(
      'col.type must be one of: ', paste(valid_types, collapse = ', '),
      call. = FALSE
    )
  }

  message(paste0('generating test table, col type ', col.type))
  x <- seq_len(nrows)
  dt <- data.table(col1=x)

  if (col.type != 'mixed') {
    col_generators <- col_generators[col.type]
  }

  # seq_len() avoids the 1:0 pitfall when ncols is zero.
  for (col in seq_len(ncols)) {
    col_generator <- col_generators[[((col - 1) %% length(col_generators)) + 1]]
    dt[, (paste0('col', col)) := col_generator(x)]
  }
  message(paste0('dt of size ', nrow(dt), 'x', ncol(dt), ' quoting ', quote))

  # Time a single write of `dt` to a fresh temp file; returns seconds.
  timeIt <- function(func) {
    f <- tempfile()
    # Clean up even when the writer fails, so no large temp files are left.
    on.exit(unlink(f), add = TRUE)
    t0 <- get_nanotime()
    func(dt, f)
    (get_nanotime() - t0) * 1e-9
  }

  alternatives <- list(
    fwrite = function(dt, f) {
      fwrite(dt, f, quote=quote)
    },
    write.csv = function(dt, f) {
      write.csv(dt, f, quote=quote, row.names=FALSE)
    }
  )

  n_rounds <- 3
  # One row per round, one column per writer.
  results <- rbindlist(
    lapply(
      seq_len(n_rounds),
      function (i) {
        round <- list()
        for (alt in names(alternatives)) {
          message(alt, ' ', i)
          round[[alt]] <- timeIt(alternatives[[alt]])
        }
        round
      }
    )
  )

  means <- results[, lapply(.SD, mean)][, lapply(.SD, as.numeric)]
  # Extract by name: the columns share their names with functions.
  fwrite_time <- means[['fwrite']]
  speedup <- means[['write.csv']] / fwrite_time
  list(speedup=speedup, fwrite_time=fwrite_time)
}
#' Run performanceTest over every column type / quoting combination.
#'
#' @return A data.table with one row per combination of `col.type` and
#'   `quote`, plus the `speedup` and `fwrite_time` values returned by
#'   `performanceTest` for that combination.
performanceTable <- function() {
  # stringsAsFactors = FALSE keeps col.type as character. With the default
  # (factor) the value would index the generator list in performanceTest by
  # integer code rather than by name.
  tab <- data.table(expand.grid(
    list(
      `col.type` = c('character', 'double', 'integer', 'factor', 'mixed'),
      `quote` = c(TRUE, FALSE)
    ),
    stringsAsFactors = FALSE
  ))
  # seq_len() is safe for an empty grid, unlike 1:nrow(tab).
  for (j in seq_len(nrow(tab))) {
    r <- performanceTest(
      col.type = tab[['col.type']][j],
      quote = tab[['quote']][j]
    )
    tab[j, speedup := r$speedup]
    tab[j, fwrite_time := r$fwrite_time]
  }
  tab
}
@oseiskar
Copy link
Author

Updated code to use microbenchmark::get_nanotime() (thanks @MichaelChirico).
Results are similar.

col.type quote speedup fwrite_time (s)
character TRUE 2.00 8.91
double TRUE 3.25 17.85
integer TRUE 4.35 4.92
factor TRUE 2.21 8.5
mixed TRUE 2.91 10.31
character FALSE 2.17 7.57
factor FALSE 2.30 7.75
mixed FALSE 3.03 9.68

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment