public
Created

Comparing 1.8.11 to 1.8.10

  • Download Gist
DT_1.8.10vs1.8.11.R
R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
# version 1.8.11 (commit 1048)
require(data.table)
# Loading required package: data.table
# data.table 1.8.11 For help type: help("data.table")
 
## create a huge data.table:
## -------------------------
set.seed(1)
N <- 2e7 # number of rows in DT

# Pool of random lowercase strings, each 5 to 9 letters long. 1e5 strings are
# drawn and duplicates are dropped, so the pool can end up shorter than 1e5.
foo <- function() {
  n_chars <- sample(5:9, 1)
  paste(sample(letters, n_chars, TRUE), collapse = "")
}
ch <- unique(vapply(seq_len(1e5), function(i) foo(), character(1)))
 
# > length(ch)
# [1] 99982
 
# DT now
# Build the benchmark table one column at a time, in the order a, b, c, d, so
# the random number stream is consumed exactly as in a single data.table() call.
# local() keeps the intermediate vectors out of the global environment.
DT <- local({
  # a: doubles scaled by 1e6, mixed with NA, Inf, -Inf and NaN
  pool_a <- c(NA, Inf, -Inf, NaN, rnorm(1e6)*1e6)
  col_a <- as.numeric(sample(pool_a, N, TRUE))
  # b: plain doubles
  col_b <- as.numeric(sample(rnorm(1e6), N, TRUE))
  # c: integers from 1e5 to 1e6, plus NA
  col_c <- sample(c(NA_integer_, 1e5:1e6), N, TRUE)
  # d: strings drawn from the pool in ch
  col_d <- sample(ch, N, TRUE)
  data.table(a = col_a, b = col_b, c = col_c, d = col_d)
})
 
# > tables()
# NAME NROW MB COLS KEY
# [1,] DT 20,000,000 539 a,b,c,d
# Total: 539MB
 
## testing 'setkey' on column 2 (numeric)
## --------------------------------------
# waiting for my laptop to cool down... and running again
# Key a fresh copy: setkey() reorders its argument by reference, and DT has to
# stay unsorted for the benchmarks that follow.
DT.cp <- copy(DT)
system.time(setkey(DT.cp, b)) # takes 55 seconds on 1.8.10
# user system elapsed
# 6.778 0.359 7.204
 
# setkey on integer column - c
# Start again from an unsorted copy so the previous key does not affect timing.
DT.cp <- copy(DT)
system.time(setkey(DT.cp, c)) # takes 48 seconds on 1.8.10
# user system elapsed
# 6.756 0.175 6.985
 
## testing 'setkey' on 2 columns
## -----------------------------
# setkey on numeric columns - a,b
# Fresh unsorted copy again; this time the key spans two numeric columns.
DT.cp <- copy(DT)
system.time(setkey(DT.cp, a, b)) # takes 102 seconds on 1.8.10
# user system elapsed
# 15.289 1.131 16.934
 
 
## testing 'cold' by aggregation
## -----------------------------
# Group-by on an unkeyed table: take a fresh, unsorted copy and time the
# aggregation on it. The call is wrapped in system.time() so the script
# reports the timing recorded below rather than printing the grouped result.
DT.cp <- copy(DT)
system.time(DT.cp[, mean(b), by = c]) # takes 47 seconds on 1.8.10
# user system elapsed
# 7.948 0.441 8.708

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.