Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save arunsrinivasan/7846014 to your computer and use it in GitHub Desktop.
Save arunsrinivasan/7846014 to your computer and use it in GitHub Desktop.
Benchmarking dplyr and data.table 1.8.11 commit 1048 (with fewer groups)
# version 1.8.11 (commit 1048)
require(data.table)
# Loading required package: data.table
# data.table 1.8.11 For help type: help("data.table")
## create a huge data.table:
## -------------------------
# Fix the RNG seed so the generated data is reproducible.
set.seed(1)
N <- 2e7 # size of DT
# generate a character vector of at most 1e3 random strings
# (each built from 5 to 9 lowercase letters); duplicates are dropped below
foo <- function() paste(sample(letters, sample(5:9, 1), TRUE), collapse="")
ch <- replicate(1e3, foo())
ch <- unique(ch)
# > length(ch)
# [1] 1000
# DT now
#   a: numeric, sampled from 1e3 scaled normal draws plus NA, Inf, -Inf and NaN
#   b: numeric, sampled from 1e4 normal draws
#   c: sampled from the integers -1000:1000 (at most 2001 distinct values)
#   d: character, sampled from 'ch'
DT <- data.table(a = as.numeric(sample(c(NA, Inf, -Inf, NaN, rnorm(1e3)*1e6), N, TRUE)),
b = as.numeric(sample(rnorm(1e4), N, TRUE)),
c = sample(c(-1000:1000), N, TRUE),
d = sample(ch, N, TRUE))
# DT.cp is a copy of DT; setkey() is applied to the copy only, keyed on column c.
DT.cp <- copy(DT)
system.time(setkey(DT.cp, c))
# user system elapsed
# 5.931 0.276 6.340
tables() # memory footprint
# NAME NROW MB COLS KEY
# [1,] DT 20,000,000 535 a,b,c,d c
# [2,] DT.cp 20,000,000 535 a,b,c,d c
# Total: 1,070MB
# NOTE(review): the DT row above lists key 'c' although no setkey() call on DT
# itself is visible in this script -- possibly a formatting artifact of the
# recorded output; confirm.
require(dplyr) # as of 6th December
# creating grouped_df from 'dplyr'
# DF is built from the same data as DT (via data.frame); DF.cp is DF grouped by
# column c, mirroring the key that was set on DT.cp.
DF <- tbl_df(data.frame(DT))
system.time(DF.cp <- group_by(DF, c))
# user system elapsed
# 5.166 1.171 6.394
# memory footprint
print(object.size(DF), units='Mb') # 534.1 Mb
print(object.size(DF.cp), units='Mb') # 534.1 Mb
## Borrowing timing function from Hadley:
## --------------------------------------
# Time an expression several times and return the timings as a matrix.
#
# Arguments:
#   code  - an unevaluated R expression; it is captured with substitute() and
#           re-evaluated in the caller's environment on every repetition, so
#           any assignments it performs happen in the caller's frame.
#   times - number of repetitions (default 3, matching the original behaviour).
#
# Returns:
#   A numeric matrix with one row per repetition and the columns produced by
#   system.time() (user.self, sys.self, elapsed, user.child, sys.child).
benchmark <- function(code, times = 3L) {
  expr <- substitute(code)
  # Capture the caller's frame once, up front, rather than inside the lazily
  # evaluated argument of system.time().
  caller_env <- parent.frame()
  stopifnot(
    is.numeric(times),
    length(times) == 1L,
    is.finite(times),
    times >= 1
  )
  timings <- lapply(
    seq_len(times),
    function(i) system.time(eval(expr, caller_env))
  )
  do.call(rbind, timings)
}
## ----------------------------------------------------------------------------------
## 1) Comparing "filter" from dplyr with data.table: (on unkey'd / ungrouped data)
## ----------------------------------------------------------------------------------
# Each benchmark() call runs its expression three times; the commented matrix
# below each call is the recorded output (one row per run).
# NOTE(review): column c is sampled from -1000:1000 when DT is built, so the
# value 169073 used below never occurs in c and those subsets match no rows;
# the constant looks like a leftover from a variant with more groups -- confirm.
# 1a) DF vector-scan subset
benchmark(DF[DF$d == "ewdjgq", ])
# user.self sys.self elapsed user.child sys.child
# [1,] 4.097 0.358 4.531 0 0
# [2,] 3.970 0.008 4.053 0 0
# [3,] 3.959 0.004 3.980 0 0
benchmark(DF[DF$c == 169073, ])
# user.self sys.self elapsed user.child sys.child
# [1,] 3.398 0.110 3.546 0 0
# [2,] 3.369 0.106 3.489 0 0
# [3,] 3.377 0.110 3.557 0 0
# 1b) ordinary DT vector-scan subset
benchmark(DT[d == "ewdjgq"])
# user.self sys.self elapsed user.child sys.child
# [1,] 0.749 0.003 0.775 0 0
# [2,] 0.746 0.002 0.756 0 0
# [3,] 0.748 0.002 0.763 0 0
benchmark(DT[c == 169073])
# user.self sys.self elapsed user.child sys.child
# [1,] 0.169 0.104 0.274 0 0
# [2,] 0.171 0.105 0.277 0 0
# [3,] 0.172 0.108 0.282 0 0
# 1c) dplyr's 'filter'
benchmark(filter(DF, d == "ewdjgq"))
# user.self sys.self elapsed user.child sys.child
# [1,] 0.845 0.002 0.913 0 0
# [2,] 0.840 0.001 0.847 0 0
# [3,] 0.843 0.001 0.847 0 0
benchmark(filter(DF, c == 169073))
# user.self sys.self elapsed user.child sys.child
# [1,] 0.268 0.111 0.379 0 0
# [2,] 0.266 0.111 0.377 0 0
# [3,] 0.268 0.106 0.374 0 0
## ------------------------------------------------------------------------------
## 2) Comparing "filter" from dplyr with data.table: (on key'd / grouped data)
## ------------------------------------------------------------------------------
# Same subsets as in section 1, but on DF.cp (grouped by c) and DT.cp (keyed on c).
# NOTE(review): column c is sampled from -1000:1000 when DT is built, so the
# value 169073 used below never occurs in c; the constant looks like a leftover
# from a variant with more groups -- confirm.
# 2a) DF vector-scan subset
benchmark(DF.cp[DF.cp$d == "ewdjgq", ])
# user.self sys.self elapsed user.child sys.child
# [1,] 4.002 0.015 4.481 0 0
# [2,] 3.941 0.005 3.974 0 0
# [3,] 3.959 0.004 3.993 0 0
benchmark(DF.cp[DF.cp$c == 169073, ])
# user.self sys.self elapsed user.child sys.child
# [1,] 3.420 0.120 3.690 0 0
# [2,] 3.415 0.121 3.696 0 0
# [3,] 3.429 0.119 3.723 0 0
# 2b) ordinary DT vector-scan subset
benchmark(DT.cp[d == "ewdjgq"])
# user.self sys.self elapsed user.child sys.child
# [1,] 0.746 0.003 0.779 0 0
# [2,] 0.744 0.003 0.825 0 0
# [3,] 0.744 0.004 1.011 0 0
benchmark(DT.cp[c == 169073])
# user.self sys.self elapsed user.child sys.child
# [1,] 0.171 0.106 0.279 0 0
# [2,] 0.173 0.107 0.280 0 0
# [3,] 0.169 0.104 0.274 0 0
# 2c) dplyr's 'filter'
benchmark(filter(DF.cp, d == "ewdjgq"))
# user.self sys.self elapsed user.child sys.child
# [1,] 0.943 0.050 1.017 0 0
# [2,] 0.943 0.049 1.002 0 0
# [3,] 0.942 0.046 1.054 0 0
benchmark(filter(DF.cp, c == 169073))
# user.self sys.self elapsed user.child sys.child
# [1,] 0.351 0.157 0.510 0 0
# [2,] 0.347 0.150 0.497 0 0
# [3,] 0.350 0.144 0.504 0 0
# 2d) data.table's binary search
benchmark(DT.cp[J(169073)])
# user.self sys.self elapsed user.child sys.child
# [1,] 0.003 0 0.003 0 0
# [2,] 0.002 0 0.002 0 0
# [3,] 0.002 0 0.002 0 0
# 2e) dplyr's join approach (doesn't use keys though):
benchmark(inner_join(DF.cp, data.frame(c = 169073L), by = c("c")))
# user.self sys.self elapsed user.child sys.child
# [1,] 1.054 0.095 1.200 0 0
# [2,] 1.038 0.091 1.206 0 0
# [3,] 1.032 0.087 1.129 0 0
## -------------------------------------------------------------
## 3) Comparing "arrange" (ordering) from dplyr with data.table:
## -------------------------------------------------------------
# dplyr: order the grouped tbl by columns b and c.
benchmark(arrange(DF.cp, b,c))
# user.self sys.self elapsed user.child sys.child
# [1,] 41.444 0.503 43.481 0 0
# [2,] 40.557 0.458 41.569 0 0
# [3,] 40.066 0.433 40.995 0 0
# data.table: setkey() is applied to a fresh copy(DT) on every run, so the
# copy is part of the timed expression and DT itself is not re-keyed.
benchmark(setkey(copy(DT), b,c))
# user.self sys.self elapsed user.child sys.child
# [1,] 11.047 1.452 12.616 0 0
# [2,] 11.032 0.886 12.154 0 0
# [3,] 11.490 0.953 14.389 0 0
## -------------------------------------------------
## 4) Comparing "mutate" from dplyr with data.table:
## -------------------------------------------------
# The logical equivalent of 'mutate' is ':=' or 'set' to me... 'mutate' seems to create a NAM(2) object,
# whereas ':=' (or 'set') modifies the same object by reference.
# To make the comparison fair, I'll use 'set' on a copy every time.
benchmark(mutate(DF.cp, e=a+b))
# user.self sys.self elapsed user.child sys.child
# [1,] 0.218 0.195 0.416 0 0
# [2,] 0.216 0.193 0.411 0 0
# [3,] 0.222 0.189 0.419 0 0
# run this 3 times
# (the copy + setkey on the first line is outside system.time(), so only the
# set() call is timed; the three rows below are the three manual runs)
setkey(DT.cp <- copy(DT), c)
system.time(set(DT.cp, i=NULL, j="e", value=DT.cp$a+DT.cp$b))
# user system elapsed
# 0.234 0.212 0.448
# 0.235 0.205 0.450
# 0.239 0.212 0.461
## --------------------------------------------------------------------------
## 5) Comparing "join" from dplyr with data.table: (on character column here)
## --------------------------------------------------------------------------
# Regroup the tbl and re-key a fresh copy of DT on the character column d.
DF.cp <- group_by(DF, d)
setkey(DT.cp <- copy(DT), d)
set.seed(1)
# Lookup table: 100 distinct values of d, sampled from 'ch' without replacement.
DF.j <- data.frame(d = sample(ch, 1e2, FALSE), stringsAsFactors=FALSE)
DT.j <- data.table(DF.j) # no key on DT.j
benchmark(left_join(DF.j, DF.cp, by="d"))
# user.self sys.self elapsed user.child sys.child
# [1,] 1.359 0.197 1.571 0 0
# [2,] 1.371 0.164 1.613 0 0
# [3,] 1.389 0.174 1.720 0 0
# data.table join: look up the rows of DT.j in DT.cp (keyed on d).
benchmark(DT.cp[DT.j])
# user.self sys.self elapsed user.child sys.child
# [1,] 0.139 0.013 0.153 0 0
# [2,] 0.139 0.004 0.151 0 0
# [3,] 0.140 0.004 0.147 0 0
## ----------------------------------------------------------------------------------
## 6) Comparing "summarise" from dplyr with data.table: (grouped by character column) - at most 1e3 unique groups
## ----------------------------------------------------------------------------------
# with the groupings on character col. "d"
# (d is sampled from 'ch', whose recorded length is 1000)
# 6a. with C-run function of dplyr
benchmark(summarise(DF.cp, m.b = sum(b)))
# user.self sys.self elapsed user.child sys.child
# [1,] 0.202 0.000 0.203 0 0
# [2,] 0.197 0.000 0.199 0 0
# [3,] 0.200 0.001 0.210 0 0
# 6b. evaluating the function instead
# (sum__ is a plain alias of sum; presumably the different name keeps dplyr
# from substituting its internal implementation -- confirm)
sum__ <- sum
benchmark(summarise(DF.cp, m.b = sum__(b)))
# user.self sys.self elapsed user.child sys.child
# [1,] 0.202 0.122 0.324 0 0
# [2,] 0.207 0.129 0.353 0 0
# [3,] 0.207 0.127 0.340 0 0
# 6c. data.table way
benchmark(DT.cp[, list(m.b=sum(b)), by=d])
# user.self sys.self elapsed user.child sys.child
# [1,] 0.460 0.112 0.575 0 0
# [2,] 0.454 0.117 0.572 0 0
# [3,] 0.451 0.117 0.568 0 0
## ----------------------------------------------------------------------------------
## 7) Comparing "summarise" from dplyr with data.table: (grouped by integer column) - at most 2001 unique groups
## ----------------------------------------------------------------------------------
# with the groupings on integer col. "c" (sampled from -1000:1000)
# 7a. with C-run function of dplyr
DF.cp <- group_by(DF, c)
benchmark(summarise(DF.cp, m.b = sum(b)))
# user.self sys.self elapsed user.child sys.child
# [1,] 0.199 0.000 0.204 0 0
# [2,] 0.197 0.000 0.198 0 0
# [3,] 0.200 0.001 0.200 0 0
# 7b. evaluating the function instead
# (same alias of sum as defined in section 6)
sum__ <- sum
benchmark(summarise(DF.cp, m.b = sum__(b)))
# user.self sys.self elapsed user.child sys.child
# [1,] 0.203 0.095 0.298 0 0
# [2,] 0.205 0.097 0.303 0 0
# [3,] 0.206 0.089 0.297 0 0
# 7c. data.table way
# Re-key DT.cp on c (it was keyed on d for sections 5 and 6); the re-keying
# happens outside benchmark() and is not part of the timings.
setkey(DT.cp, c)
benchmark(DT.cp[, list(m.b=sum(b)), by=c])
# user.self sys.self elapsed user.child sys.child
# [1,] 0.424 0.002 0.443 0 0
# [2,] 0.415 0.000 0.427 0 0
# [3,] 0.413 0.000 0.415 0 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment