# arunsrinivasan/DT_1.8.11_1048_benchmark.R

## DT_1.8.11_1048_benchmark.R
# version 1.8.11 (commit 1048)
# library() stops with an error if data.table is not installed, whereas require()
# only warns and returns FALSE, letting the script fail later at data.table().
library(data.table)
# Console output recorded in the original run (made with require(data.table)):
# Loading required package: data.table
# data.table 1.8.11  For help type: help("data.table")

## create a huge data.table:
## -------------------------
N <- 2e7 # size of DT
set.seed(1) # fixed RNG seed, so the generated data is reproducible

# generate a character vector of length about 1e5
# foo() builds one random lowercase string: the length is drawn from 5..9 first,
# then that many letters are sampled with replacement and pasted together.
foo <- function() {
  n_chars <- sample(5:9, 1)
  picked <- sample(letters, n_chars, TRUE)
  paste(picked, collapse = "")
}
# Draw 1e5 random strings and keep the distinct ones.
ch <- unique(replicate(1e5, foo()))

# > length(ch)
# [1] 99982

# DT now: N rows, every column sampled with replacement.
#   a - numeric, drawn from NA/Inf/-Inf/NaN plus 1e6 scaled normal values
#   b - numeric, drawn from 1e6 normal values
#   c - drawn from NA_integer_ and 1e5:1e6
#   d - character, drawn from `ch`
# The columns are generated in the order a, b, c, d, which fixes how the RNG
# stream is consumed.
DT <- data.table(
  a = as.numeric(sample(c(NA, Inf, -Inf, NaN, rnorm(1e6) * 1e6), size = N, replace = TRUE)),
  b = as.numeric(sample(rnorm(1e6), size = N, replace = TRUE)),
  c = sample(c(NA_integer_, 1e5:1e6), size = N, replace = TRUE),
  d = sample(ch, size = N, replace = TRUE)
)

# > tables()
#      NAME       NROW  MB COLS    KEY
# [1,] DT   20,000,000 539 a,b,c,d
# Total: 539MB

## testing 'setkey' on 1 column
## ----------------------------
# Every timing below keys a fresh copy of DT, so no run starts from data that an
# earlier setkey() call has already sorted. The numbers in the comments are the
# timings recorded in the original run.

# setkey on numeric column - a
DT.cp <- copy(DT)
system.time(setkey(DT.cp, a))
#   user  system elapsed
#  8.097   0.414   8.599

# waiting for my laptop to cool down... and running again, on numeric column - b
DT.cp <- copy(DT)
system.time(setkey(DT.cp, b))
#   user  system elapsed
#  6.778   0.359   7.204

# setkey on integer column - c
DT.cp <- copy(DT)
system.time(setkey(DT.cp, c))
#   user  system elapsed
#  6.756   0.175   6.985

# setkey on character column - d
# should be *very* quick
DT.cp <- copy(DT)
system.time(setkey(DT.cp, d))
#   user  system elapsed
# 10.152   0.225  11.438

## testing 'setkey' on 2 columns
## -----------------------------
# setkey on numeric columns - a,b
DT.cp <- copy(DT)
system.time(setkey(DT.cp, a, b))
#   user  system elapsed
# 15.289   1.131  16.934

# setkey on integer+numeric columns - c,a
DT.cp <- copy(DT)
system.time(setkey(DT.cp, c, a))
#   user  system elapsed
# 13.037   0.891  15.278

# setkey on character+numeric columns - d,a
DT.cp <- copy(DT)
system.time(setkey(DT.cp, d, a))
#   user  system elapsed
# 16.948   0.743  18.824

# setkey on numeric+integer columns - a,c
DT.cp <- copy(DT)
system.time(setkey(DT.cp, a, c))
#   user  system elapsed
# 12.135   0.805  13.848

# setkey on numeric+character columns - a,d
DT.cp <- copy(DT)
system.time(setkey(DT.cp, a, d))
#   user  system elapsed
# 15.078   0.745  16.425

# setkey on integer+character columns - c,d
DT.cp <- copy(DT)
system.time(setkey(DT.cp, c, d))
#   user  system elapsed
# 14.840   0.625  17.685

# setkey on character+integer columns - d,c
DT.cp <- copy(DT)
system.time(setkey(DT.cp, d, c))
#   user  system elapsed
# 16.275   0.505  17.397

## Other tests which are not by reference so that we can run them more than once...
## --------------------------------------------------------------------------------

## Borrowing timing function from Hadley
# Time an expression several times and stack the timings.
#
# Args:
#   code:  an expression; it is captured unevaluated with substitute() and then
#          evaluated in the caller's environment once per repetition.
#   times: number of repetitions. Defaults to 3, the count the original
#          hard-coded by repeating the system.time() call three times.
#
# Returns: a matrix with one row per repetition (in run order) whose columns are
#   the elements of system.time(): user.self, sys.self, elapsed, user.child,
#   sys.child.
benchmark <- function(code, times = 3L) {
  stopifnot(is.numeric(times), length(times) == 1L, is.finite(times), times >= 1)
  code <- substitute(code)
  # Resolve the caller's environment once, eagerly, rather than through a lazily
  # evaluated parent.frame() argument inside every eval() call.
  caller_env <- parent.frame()

  timings <- lapply(
    seq_len(times),
    function(i) system.time(eval(code, caller_env))
  )
  do.call(rbind, timings)
}

# filtering/subsetting : vector-scan approach - without key
benchmark(DT[d == "pvuyrlxw"])
#      user.self sys.self elapsed user.child sys.child
# [1,]     2.295    0.077   2.437          0         0
# [2,]     2.276    0.079   2.402          0         0
# [3,]     2.324    0.014   2.552          0         0

# filtering/subsetting : vector-scan approach - with key
# (the keyed copy is built outside benchmark(), so keying is not part of the timing)
DT.cp <- setkey(copy(DT), d)
benchmark(DT.cp[d == "pvuyrlxw"])
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.734    0.004   0.817          0         0
# [2,]     0.742    0.002   0.780          0         0
# [3,]     0.737    0.003   0.773          0         0

# 1.8.11 is about 3 times faster in vector-scan approach than 1.8.10 (see https://gist.github.com/arunsrinivasan/7832436)
# Interesting : setting key helps in vector-scanning... >2x speed-up

# binary search approach (reuses the copy keyed on "d" above)
benchmark(DT.cp["pvuyrlxw"])
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.002        0   0.002          0         0
# [2,]     0.003        0   0.002          0         0
# [3,]     0.002        0   0.003          0         0

# summarising : without key - column "c" - for simplicity
benchmark(DT[, mean(b), by=c]) ########################### 900000 groups ######################
#      user.self sys.self elapsed user.child sys.child
# [1,]     8.127    0.344   9.224          0         0
# [2,]     8.053    0.457   8.730          0         0
# [3,]     8.047    0.458   8.719          0         0

# grouping is about 5 times faster in 1.8.11 than in 1.8.10! with no key

# summarising : with key - column "c" - for simplicity
DT.cp <- setkey(copy(DT), c) # <~~~ setkey takes 7 seconds
benchmark(DT.cp[, mean(b), by=c]) ########################### 900000 groups ######################
#      user.self sys.self elapsed user.child sys.child
# [1,]     1.786    0.011   1.987          0         0
# [2,]     1.773    0.009   1.831          0         0
# [3,]     1.776    0.014   2.152          0         0

## testing aggregation with `dplyr` (other tests will follow later)
## ----------------------------------------------------------------

# library() stops with an error if dplyr is not installed; require() would only
# warn and return FALSE, and the script would then fail at tbl_df().
library(dplyr)
DF <- tbl_df(data.frame(DT))
# Grouping is timed on its own, once; the summarise() calls below reuse the grouped DF.
system.time(DF <- group_by(DF, c))
#   user  system elapsed
# 21.532   1.550  23.896

benchmark(summarise(DF, mean(b)))
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.648    0.007   0.675          0         0
# [2,]     0.641    0.002   0.680          0         0
# [3,]     0.638    0.002   0.662          0         0

# Note: here "mean(b)" is run entirely in C (Very clever implementation from Romain).
# And it seems to be 3x faster. If we were to replace this function with another
# function that's not implemented in C, but simple enough, then:

# sum__ is plain sum() under a different name.
sum__ <- sum
benchmark(summarise(DF, sum__(b)))
#      user.self sys.self elapsed user.child sys.child
# [1,]     1.540    0.163   1.813          0         0
# [2,]     1.536    0.153   1.746          0         0
# [3,]     1.530    0.159   1.728          0         0

# doing the same on data.table to compare - sum is not optimised in DT - and it's a primitive.
benchmark(DT.cp[, sum(b), by=c])
#      user.self sys.self elapsed user.child sys.child
# [1,]     1.446    0.016   1.561          0         0
# [2,]     1.431    0.007   1.505          0         0
# [3,]     1.436    0.007   1.483          0         0

# Seems like `data.table` marginally edges over the (current) version of `dplyr`.
	# version 1.8.11 (commit 1048)
	# library() stops with an error if data.table is not installed, whereas require()
	# only warns and returns FALSE, letting the script fail later at data.table().
	library(data.table)
	# Console output recorded in the original run (made with require(data.table)):
	# Loading required package: data.table
	# data.table 1.8.11 For help type: help("data.table")

	## create a huge data.table:
	## -------------------------
	N <- 2e7 # size of DT
	set.seed(1) # fixed RNG seed, so the generated data is reproducible

	# generate a character vector of length about 1e5
	# foo() builds one random lowercase string: the length is drawn from 5..9 first,
	# then that many letters are sampled with replacement and pasted together.
	foo <- function() {
	  n_chars <- sample(5:9, 1)
	  picked <- sample(letters, n_chars, TRUE)
	  paste(picked, collapse = "")
	}
	# Draw 1e5 random strings and keep the distinct ones.
	ch <- unique(replicate(1e5, foo()))

	# > length(ch)
	# [1] 99982

	# DT now: N rows, every column sampled with replacement.
	#   a - numeric, drawn from NA/Inf/-Inf/NaN plus 1e6 scaled normal values
	#   b - numeric, drawn from 1e6 normal values
	#   c - drawn from NA_integer_ and 1e5:1e6
	#   d - character, drawn from `ch`
	# The columns are generated in the order a, b, c, d, which fixes how the RNG
	# stream is consumed.
	DT <- data.table(
	  a = as.numeric(sample(c(NA, Inf, -Inf, NaN, rnorm(1e6) * 1e6), size = N, replace = TRUE)),
	  b = as.numeric(sample(rnorm(1e6), size = N, replace = TRUE)),
	  c = sample(c(NA_integer_, 1e5:1e6), size = N, replace = TRUE),
	  d = sample(ch, size = N, replace = TRUE)
	)

	# > tables()
	# NAME NROW MB COLS KEY
	# [1,] DT 20,000,000 539 a,b,c,d
	# Total: 539MB

	## testing 'setkey' on 1 column
	## ----------------------------
	# Every timing below keys a fresh copy of DT, so no run starts from data that an
	# earlier setkey() call has already sorted. The numbers in the comments are the
	# timings recorded in the original run.

	# setkey on numeric column - a
	DT.cp <- copy(DT)
	system.time(setkey(DT.cp, a))
	# user system elapsed
	# 8.097 0.414 8.599

	# waiting for my laptop to cool down... and running again, on numeric column - b
	DT.cp <- copy(DT)
	system.time(setkey(DT.cp, b))
	# user system elapsed
	# 6.778 0.359 7.204

	# setkey on integer column - c
	DT.cp <- copy(DT)
	system.time(setkey(DT.cp, c))
	# user system elapsed
	# 6.756 0.175 6.985

	# setkey on character column - d
	# should be very quick
	DT.cp <- copy(DT)
	system.time(setkey(DT.cp, d))
	# user system elapsed
	# 10.152 0.225 11.438

	## testing 'setkey' on 2 columns
	## -----------------------------
	# setkey on numeric columns - a,b
	DT.cp <- copy(DT)
	system.time(setkey(DT.cp, a, b))
	# user system elapsed
	# 15.289 1.131 16.934

	# setkey on integer+numeric columns - c,a
	DT.cp <- copy(DT)
	system.time(setkey(DT.cp, c, a))
	# user system elapsed
	# 13.037 0.891 15.278

	# setkey on character+numeric columns - d,a
	DT.cp <- copy(DT)
	system.time(setkey(DT.cp, d, a))
	# user system elapsed
	# 16.948 0.743 18.824

	# setkey on numeric+integer columns - a,c
	DT.cp <- copy(DT)
	system.time(setkey(DT.cp, a, c))
	# user system elapsed
	# 12.135 0.805 13.848

	# setkey on numeric+character columns - a,d
	DT.cp <- copy(DT)
	system.time(setkey(DT.cp, a, d))
	# user system elapsed
	# 15.078 0.745 16.425

	# setkey on integer+character columns - c,d
	DT.cp <- copy(DT)
	system.time(setkey(DT.cp, c, d))
	# user system elapsed
	# 14.840 0.625 17.685

	# setkey on character+integer columns - d,c
	DT.cp <- copy(DT)
	system.time(setkey(DT.cp, d, c))
	# user system elapsed
	# 16.275 0.505 17.397

	## Other tests which are not by reference so that we can run them more than once...
	## --------------------------------------------------------------------------------

	## Borrowing timing function from Hadley
	# Time an expression several times and stack the timings.
	#
	# Args:
	#   code:  an expression; it is captured unevaluated with substitute() and then
	#          evaluated in the caller's environment once per repetition.
	#   times: number of repetitions. Defaults to 3, the count the original
	#          hard-coded by repeating the system.time() call three times.
	#
	# Returns: a matrix with one row per repetition (in run order) whose columns are
	#   the elements of system.time(): user.self, sys.self, elapsed, user.child,
	#   sys.child.
	benchmark <- function(code, times = 3L) {
	  stopifnot(is.numeric(times), length(times) == 1L, is.finite(times), times >= 1)
	  code <- substitute(code)
	  # Resolve the caller's environment once, eagerly, rather than through a lazily
	  # evaluated parent.frame() argument inside every eval() call.
	  caller_env <- parent.frame()

	  timings <- lapply(
	    seq_len(times),
	    function(i) system.time(eval(code, caller_env))
	  )
	  do.call(rbind, timings)
	}

	# filtering/subsetting : vector-scan approach - without key
	benchmark(DT[d == "pvuyrlxw"])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 2.295 0.077 2.437 0 0
	# [2,] 2.276 0.079 2.402 0 0
	# [3,] 2.324 0.014 2.552 0 0

	# filtering/subsetting : vector-scan approach - with key
	# (the keyed copy is built outside benchmark(), so keying is not part of the timing)
	DT.cp <- setkey(copy(DT), d)
	benchmark(DT.cp[d == "pvuyrlxw"])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.734 0.004 0.817 0 0
	# [2,] 0.742 0.002 0.780 0 0
	# [3,] 0.737 0.003 0.773 0 0

	# 1.8.11 is about 3 times faster in vector-scan approach than 1.8.10 (see https://gist.github.com/arunsrinivasan/7832436)
	# Interesting : setting key helps in vector-scanning... >2x speed-up

	# binary search approach (reuses the copy keyed on "d" above)
	benchmark(DT.cp["pvuyrlxw"])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.002 0 0.002 0 0
	# [2,] 0.003 0 0.002 0 0
	# [3,] 0.002 0 0.003 0 0

	# summarising : without key - column "c" - for simplicity
	benchmark(DT[, mean(b), by=c]) ########################### 900000 groups ######################
	# user.self sys.self elapsed user.child sys.child
	# [1,] 8.127 0.344 9.224 0 0
	# [2,] 8.053 0.457 8.730 0 0
	# [3,] 8.047 0.458 8.719 0 0

	# grouping is about 5 times faster in 1.8.11 than in 1.8.10! with no key

	# summarising : with key - column "c" - for simplicity
	DT.cp <- setkey(copy(DT), c) # <~~~ setkey takes 7 seconds
	benchmark(DT.cp[, mean(b), by=c]) ########################### 900000 groups ######################
	# user.self sys.self elapsed user.child sys.child
	# [1,] 1.786 0.011 1.987 0 0
	# [2,] 1.773 0.009 1.831 0 0
	# [3,] 1.776 0.014 2.152 0 0

	## testing aggregation with `dplyr` (other tests will follow later)
	## ----------------------------------------------------------------

	# library() stops with an error if dplyr is not installed; require() would only
	# warn and return FALSE, and the script would then fail at tbl_df().
	library(dplyr)
	DF <- tbl_df(data.frame(DT))
	# Grouping is timed on its own, once; the summarise() calls below reuse the grouped DF.
	system.time(DF <- group_by(DF, c))
	# user system elapsed
	# 21.532 1.550 23.896

	benchmark(summarise(DF, mean(b)))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.648 0.007 0.675 0 0
	# [2,] 0.641 0.002 0.680 0 0
	# [3,] 0.638 0.002 0.662 0 0

	# Note: here "mean(b)" is run entirely in C (Very clever implementation from Romain).
	# And it seems to be 3x faster. If we were to replace this function with another
	# function that's not implemented in C, but simple enough, then:

	# sum__ is plain sum() under a different name.
	sum__ <- sum
	benchmark(summarise(DF, sum__(b)))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 1.540 0.163 1.813 0 0
	# [2,] 1.536 0.153 1.746 0 0
	# [3,] 1.530 0.159 1.728 0 0

	# doing the same on data.table to compare - sum is not optimised in DT - and it's a primitive.
	benchmark(DT.cp[, sum(b), by=c])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 1.446 0.016 1.561 0 0
	# [2,] 1.431 0.007 1.505 0 0
	# [3,] 1.436 0.007 1.483 0 0

	# Seems like `data.table` marginally edges over the (current) version of `dplyr`.