Skip to content

Instantly share code, notes, and snippets.

@arunsrinivasan
Created December 17, 2013 00:03
Show Gist options
  • Save arunsrinivasan/7997521 to your computer and use it in GitHub Desktop.
Save arunsrinivasan/7997521 to your computer and use it in GitHub Desktop.
A small comparison between 'dplyr' and 'data.table'
# version 1.8.11
require(data.table)
# Loading required package: data.table
# data.table 1.8.11 For help type: help("data.table")
## create a huge data.table:
## -------------------------
# Seed the RNG first so the randomly generated data is reproducible.
set.seed(seed = 1)
# Number of rows to generate for DT.
N <- 2e7
# generate a character vector of length about 1e5
# Build one random lowercase string: draw a length from 5:9, sample that many
# letters with replacement, and glue them together into a single string.
foo <- function() {
  n_chars <- sample(5:9, 1)
  picked <- sample(letters, n_chars, TRUE)
  paste0(picked, collapse = "")
}
# Call foo() 1e5 times, one after another, and keep only the distinct strings.
ch <- unique(vapply(seq_len(1e5), function(i) foo(), character(1), USE.NAMES = FALSE))
# > length(ch)
# [1] 99982
# Build the N-row benchmark table, one resampled column at a time.
DT <- data.table(
  # a: doubles resampled from NA, Inf, -Inf, NaN and 1e6 scaled normal draws
  a = as.numeric(sample(
    x = c(NA, Inf, -Inf, NaN, rnorm(1e6) * 1e6),
    size = N, replace = TRUE
  )),
  # b: doubles resampled from 1e6 standard normal draws
  b = as.numeric(sample(x = rnorm(1e6), size = N, replace = TRUE)),
  # c: integers resampled from 1e5:1e6 plus NA (the grouping column used below)
  c = sample(x = c(NA_integer_, 1e5:1e6), size = N, replace = TRUE),
  # d: strings resampled from the character pool ch
  d = sample(x = ch, size = N, replace = TRUE)
)
## setkey on data.table v 1.8.11
## ----------------------------
# Take a copy of DT so that setkey() is applied to DT.cp rather than to DT.
DT.cp <- copy(DT)
# Time a single setkey() call on column c of the copy.
system.time(setkey(DT.cp, c))
# user system elapsed
# 6.945 0.196 7.312
## equivalent of setkey in dplyr (group_by) - as of december 6th
## ------------------------------------------------------------
require(dplyr)
# creating grouped_df from 'dplyr'
# Convert DT to a data.frame and wrap it with tbl_df() for use with dplyr.
DF <- tbl_df(data.frame(DT))
# Time a single group_by() on column c; the grouped result is stored in DF.cp.
system.time(DF.cp <- group_by(DF, c))
# user system elapsed
# 21.803 1.780 24.970
## setkey on data.table takes 7.3 seconds, whereas group_by on dplyr takes 24.97 seconds!
## dplyr requires group_by to be able to "summarise" data. Benchmarks for summarise are shown below.
## Borrowing timing function from Hadley:
## --------------------------------------
# Time an expression repeatedly in the caller's environment.
#
# Args:
#   code:  Unquoted expression to time. It is captured with substitute() and
#          evaluated afresh on every repetition, so side effects in `code`
#          happen once per repetition.
#   times: Number of repetitions. Defaults to 3, matching the three runs that
#          were previously hard-coded. Must be a single whole number >= 1.
#
# Returns:
#   A numeric matrix with one row per repetition and one column per
#   system.time() field (user.self, sys.self, elapsed, user.child, sys.child).
benchmark <- function(code, times = 3L) {
  stopifnot(
    is.numeric(times), length(times) == 1L, !is.na(times),
    times >= 1, times %% 1 == 0
  )
  code <- substitute(code)
  # Resolve the caller's frame once, here, so the timed expression is always
  # evaluated where benchmark() was called from, not inside the helper below.
  caller <- parent.frame()
  runs <- lapply(seq_len(times), function(i) system.time(eval(code, caller)))
  # Stack the per-run timings into a matrix, one row per run.
  do.call(rbind, runs)
}
## -------------------------------------------------
## Comparing "summarise" from dplyr with data.table: - ~9e5 unique groups
## -------------------------------------------------
### ----------------------------------------------------------------
### NOTE THAT 'dplyr' CAN NOT RUN THIS WITHOUT 'group_by' FIRST ####
### ----------------------------------------------------------------
# a. runs entirely in C/C++ - hybrid evaluator...
#    (author's note: this is the case where summarise() calls sum() directly)
# Regroup DF by column c, then time the grouped sum of b with benchmark().
DF.cp <- group_by(DF, c)
benchmark(summarise(DF.cp, m.b = sum(b)))
# user.self sys.self elapsed user.child sys.child
# [1,] 0.205 0.000 0.205 0 0 ### fast but 'group_by' is still 3x slower (24.97 vs 7.3 sec)
# [2,] 0.199 0.001 0.199 0 0
# [3,] 0.198 0.000 0.202 0 0
# b. evaluating the function instead (similar to how data.table does it)
# sum__ is just another name bound to sum; per the author's note, calling it
# through the alias makes summarise() evaluate the function rather than use
# the hybrid path from case a.
sum__ <- sum
# Same grouped sum of b as in case a, timed with benchmark().
benchmark(summarise(DF.cp, m.b = sum__(b)))
# user.self sys.self elapsed user.child sys.child
# [1,] 1.601 0.074 1.693 0 0
# [2,] 1.564 0.069 1.660 0 0
# [3,] 3.226 0.078 3.397 0 0
# c. data.table way (with key being set)
# Key DT.cp on column c before timing.
setkey(DT.cp, c)
# Time the grouped sum of b by c on the keyed table with benchmark().
benchmark(DT.cp[, list(m.b=sum(b)), by=c])
# user.self sys.self elapsed user.child sys.child
# [1,] 1.822 0.006 1.894 0 0
# [2,] 1.817 0.005 1.846 0 0
# [3,] 1.837 0.008 1.916 0 0
### THIS IS NOT POSSIBLE USING 'dplyr'
# d. data.table way - 'cold' by - it doesn't require key being set
# Replace DT.cp with a fresh copy of DT; setkey() was only ever called on the
# previous DT.cp, never on DT itself, so no setkey() has been applied here.
DT.cp <- copy(DT)
# Time the same grouped sum of b by c, this time without calling setkey() first.
benchmark(DT.cp[, list(m.b=sum(b)), by=c])
# user.self sys.self elapsed user.child sys.child
# [1,] 9.018 0.519 10.466 0 0 ### group_by alone takes 24.97 seconds in 'dplyr'
# [2,] 8.943 0.454 9.822 0 0
# [3,] 8.062 0.412 8.726 0 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment