Skip to content

Instantly share code, notes, and snippets.

@arunsrinivasan
Last active December 30, 2015 12:59
Show Gist options
  • Save arunsrinivasan/7832436 to your computer and use it in GitHub Desktop.
Save arunsrinivasan/7832436 to your computer and use it in GitHub Desktop.
1.8.10 : Benchmark: comparison between data.table 1.8.10 and 1.8.11 commit 1048
# version 1.8.10
# library() stops immediately if data.table is not installed; require() would
# only return FALSE and let the script fail later with a less obvious error.
library(data.table)
# data.table 1.8.10 For help type: help("data.table")
## create a huge data.table:
## -------------------------
set.seed(1)
N <- 2e7 # size of DT
# generate a character vector of length about 1e5
# foo() builds one random lowercase string of 5 to 9 letters
foo <- function() {
  paste(sample(letters, sample(5:9, 1), replace = TRUE), collapse = "")
}
ch <- replicate(1e5, foo())
ch <- unique(ch)
# > length(ch)
# [1] 99982
# DT now
# columns: a, b numeric (a also contains NA/Inf/-Inf/NaN), c integer with NA,
# d character drawn from ch
DT <- data.table(
  a = as.numeric(sample(c(NA, Inf, -Inf, NaN, rnorm(1e6) * 1e6), N, replace = TRUE)),
  b = as.numeric(sample(rnorm(1e6), N, replace = TRUE)),
  c = sample(c(NA_integer_, 1e5:1e6), N, replace = TRUE),
  d = sample(ch, N, replace = TRUE)
)
# > tables()
# NAME NROW MB COLS KEY
# [1,] DT 20,000,000 539 a,b,c,d
# Total: 539MB
## testing 'setkey' on 1 column
## ----------------------------
# Every run works on a fresh copy of DT, since setkey() changes the table it
# is given.

# numeric key column: a
DT.cp <- copy(DT)
system.time(setkey(DT.cp, a))
# user system elapsed
# 51.491 0.426 53.457

# (paused to let the laptop cool down before running again)
# numeric key column: b
DT.cp <- copy(DT)
system.time(setkey(DT.cp, b))
# user system elapsed
# 51.030 0.353 54.953

# integer key column: c
# R's base radix sort won't help here, so this falls back to regularorder1
DT.cp <- copy(DT)
system.time(setkey(DT.cp, c))
# user system elapsed
# 44.500 0.368 48.012

# character key column: d
# expected to be *very* quick
DT.cp <- copy(DT)
system.time(setkey(DT.cp, d))
# user system elapsed
# 9.451 0.178 9.706
## testing 'setkey' on 2 columns
## -----------------------------
# As before, each timing starts from a fresh copy of DT.

# numeric + numeric: a, b
DT.cp <- copy(DT)
system.time(setkey(DT.cp, a, b))
# user system elapsed
# 98.412 0.736 102.319

# integer + numeric: c, a
DT.cp <- copy(DT)
system.time(setkey(DT.cp, c, a))
# user system elapsed
# 90.407 0.507 91.512

# character + numeric: d, a
DT.cp <- copy(DT)
system.time(setkey(DT.cp, d, a))
# user system elapsed
# 58.047 0.414 59.491

# numeric + integer: a, c
DT.cp <- copy(DT)
system.time(setkey(DT.cp, a, c))
# user system elapsed
# 92.641 0.872 100.080

# numeric + character: a, d
DT.cp <- copy(DT)
system.time(setkey(DT.cp, a, d))
# user system elapsed
# 58.607 0.602 63.171

# integer + character: c, d
DT.cp <- copy(DT)
system.time(setkey(DT.cp, c, d))
# user system elapsed
# 52.147 0.483 55.035

# character + integer: d, c
DT.cp <- copy(DT)
system.time(setkey(DT.cp, d, c))
# user system elapsed
# 53.119 0.659 57.779
## Other tests which are not by reference so that we can run them more than once...
## --------------------------------------------------------------------------------
## Borrowing timing function from Hadley
#' Time an expression several times
#'
#' Captures `code` unevaluated, evaluates it `times` times in the caller's
#' frame, and stacks the `system.time()` results row-wise.
#'
#' @param code An expression to time. It is not evaluated until timing starts
#'   and is evaluated once per repetition, so any side effects it has on the
#'   caller's frame happen `times` times.
#' @param times Number of repetitions (a single number >= 1). Defaults to 3.
#' @return A numeric matrix with one row per repetition and the columns
#'   produced by `system.time()`.
benchmark <- function(code, times = 3L) {
  stopifnot(
    is.numeric(times),
    length(times) == 1L,
    !is.na(times),
    times >= 1
  )
  code <- substitute(code)
  # Resolve the caller's frame once, here, so every repetition is evaluated
  # in the same environment regardless of how deep the timing call is nested.
  caller_env <- parent.frame()
  timings <- lapply(seq_len(times), function(i) {
    system.time(eval(code, caller_env))
  })
  do.call(rbind, timings)
}
# filtering/subsetting : vector-scan approach - without key
benchmark(DT[d == "pvuyrlxw"])
# user.self sys.self elapsed user.child sys.child
# [1,] 6.018 0.329 6.625 0 0
# [2,] 5.939 0.091 6.243 0 0
# [3,] 5.900 0.019 6.169 0 0

# filtering/subsetting : vector-scan approach - with key
# key a copy on column d, then run the same d == "..." scan on the keyed table
DT.cp <- copy(DT)
setkey(DT.cp, d)
benchmark(DT.cp[d == "pvuyrlxw"])
# user.self sys.self elapsed user.child sys.child
# [1,] 2.803 0.009 2.965 0 0
# [2,] 2.790 0.010 2.978 0 0
# [3,] 2.794 0.009 2.897 0 0
# Interesting : setting key helps in vector-scanning... >2x speed-up

# binary search approach
# look the value up through the key on d instead of scanning the column
benchmark(DT.cp["pvuyrlxw"])
# user.self sys.self elapsed user.child sys.child
# [1,] 0.002 0 0.002 0 0
# [2,] 0.002 0 0.002 0 0
# [3,] 0.002 0 0.002 0 0
# summarising : without key - column "c" - for simplicity
########################### 900000 groups ######################
benchmark(DT[, mean(b), by = c])
# user.self sys.self elapsed user.child sys.child
# [1,] 45.167 0.357 47.287 0 0
# [2,] 45.148 0.444 47.796 0 0
# [3,] 45.971 0.435 48.342 0 0

# summarising : with key - column "c" - for simplicity
# same grouped mean, this time on a copy of DT keyed by c
########################### 900000 groups ######################
DT.cp <- copy(DT)
setkey(DT.cp, c)
benchmark(DT.cp[, mean(b), by = c])
# user.self sys.self elapsed user.child sys.child
# [1,] 1.800 0.034 1.887 0 0
# [2,] 2.615 0.035 2.790 0 0
# [3,] 1.804 0.030 1.949 0 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment