Skip to content

Instantly share code, notes, and snippets.

@zjf
Created March 28, 2014 02:53
Show Gist options
  • Save zjf/9824238 to your computer and use it in GitHub Desktop.
Save zjf/9824238 to your computer and use it in GitHub Desktop.
data.table - boosting performance on selecting rows based on specific column values.
library(data.table)
library(rbenchmark)
library(doMC)
# Register the multicore backend with 4 workers; it is used by the %dopar%
# loop in f_dt_binary_par().
registerDoMC(4)
# Fixed seed so the generated table and the sampled search keys are
# reproducible between runs.
set.seed(42)
nrows <- 1e5    # number of rows in the lookup table
nsearch <- 1e3  # number of (a, b) key pairs to look up
# Lookup table: two key columns (a, b) and one value column (c).
# `replace = TRUE` is spelled out in full: `rep = T` relied on partial
# argument matching and on `T`, which can be reassigned.
d <- data.frame(
  a = sample(letters, nrows, replace = TRUE),
  b = sample(seq_len(nrows), replace = TRUE),
  c = runif(nrows)
)
# The same data as a data.table keyed on (a, b), for the keyed-join lookups.
d.dt <- data.table(d, key = c("a", "b"))
# Search keys are drawn from existing rows of `d`, so every lookup matches at
# least one row.
d.search <- d[sample(seq_len(nrow(d)), nsearch), 1:2]
# Baseline lookup on a plain data.frame.
#
# For each row of `d.search`, scans all of `d` for rows whose first two
# columns equal that row's key pair and sums the third column of the matches.
# The full scan per key is kept on purpose: this is the slow reference the
# other variants are benchmarked against.
#
# d        : data.frame; columns 1 and 2 are the keys, column 3 the values.
# d.search : data.frame whose columns 1 and 2 hold the key pairs to look up.
#
# Returns a numeric vector with one sum per row of `d.search`.
f_naive <- function(d, d.search) {
  n_search <- nrow(d.search)
  res <- numeric(n_search)
  # seq_len() rather than 1:n: with zero search rows, 1:0 would iterate over
  # c(1, 0) and write a bogus NA into `res` instead of returning numeric(0).
  for (i in seq_len(n_search)) {
    hit <- d[, 1] == d.search[i, 1] & d[, 2] == d.search[i, 2]
    res[i] <- sum(d[hit, 3])
  }
  res
}
# data.table lookup using a logical filter on the key columns.
#
# For each row of `d.search`, filters `d.dt` with `a == ... & b == ...` and
# sums column `c` of the matching rows.
#
# d.dt     : data.table with columns `a`, `b` and `c`.
# d.search : data.frame whose columns 1 and 2 hold the `a` and `b` values to
#            look up.
#
# Returns a numeric vector with one sum per row of `d.search`.
f_dt_vector <- function(d.dt, d.search) {
  n_search <- nrow(d.search)
  res <- numeric(n_search)
  # seq_len() rather than 1:n so that an empty `d.search` performs no
  # iterations and numeric(0) is returned.
  for (i in seq_len(n_search)) {
    # `d.search` is not a column of d.dt, so it resolves to the function
    # argument inside the data.table expression.
    res[i] <- d.dt[a == d.search[i, 1] & b == d.search[i, 2], sum(c)]
  }
  res
}
# data.table lookup using a keyed join.
#
# For each row of `d.search`, joins `d.dt` against the single key pair built
# with J() and sums column `c` of the joined rows.
#
# d.dt     : data.table with columns `a`, `b` and `c`; the J() join matches
#            on its key, which the caller is expected to have set to (a, b).
# d.search : data.frame whose columns 1 and 2 hold the `a` and `b` values to
#            look up.
#
# Returns a numeric vector with one sum per row of `d.search`.
f_dt_binary <- function(d.dt, d.search) {
  n_search <- nrow(d.search)
  res <- numeric(n_search)
  # seq_len() rather than 1:n so that an empty `d.search` performs no
  # iterations and numeric(0) is returned.
  for (i in seq_len(n_search)) {
    # Join first, then aggregate in a second step, exactly as the original
    # chained call did.
    matched <- d.dt[J(d.search[i, 1], d.search[i, 2]), ]
    res[i] <- matched[, sum(c)]
  }
  res
}
# Parallel version of the keyed-join lookup.
#
# Runs one keyed join per row of `d.search` inside a foreach/%dopar% loop and
# concatenates the per-key sums of column `c` with c().
#
# d.dt     : data.table with columns `a`, `b` and `c`; the J() join matches
#            on its key.
# d.search : data.frame whose columns 1 and 2 hold the key pairs to look up.
#
# Returns the combined foreach result: one sum per row of `d.search`.
f_dt_binary_par <- function(d.dt, d.search) {
  n_search <- nrow(d.search)
  foreach(i = icount(n_search), .combine = c) %dopar% {
    matched <- d.dt[J(d.search[i, 1], d.search[i, 2]), ]
    matched[, sum(c)]
  }
}
# Time the four lookup strategies on the same data, 10 replications each,
# and report them sorted by elapsed time.
benchmark(
  r1 = f_naive(d, d.search),
  r2 = f_dt_vector(d.dt, d.search),
  r3 = f_dt_binary(d.dt, d.search),
  r4 = f_dt_binary_par(d.dt, d.search),
  replications = 10,
  columns = c("test", "elapsed", "relative", "replications"),
  order = "elapsed"
)
## test elapsed relative replications
## 4 r4 18.070 1.000 10
## 3 r3 22.339 1.236 10
## 2 r2 126.049 6.976 10
## 1 r1 212.796 11.776 10

# benchmark() uses r1..r4 only as labels for the timed expressions; it does
# not create variables with those names. Compute the results explicitly so
# the consistency check below has something to compare.
r1 <- f_naive(d, d.search)
r2 <- f_dt_vector(d.dt, d.search)
r3 <- f_dt_binary(d.dt, d.search)
r4 <- f_dt_binary_par(d.dt, d.search)
# The results are floating-point sums produced by different code paths, so
# compare with a numeric tolerance rather than bit-for-bit.
all(
  isTRUE(all.equal(r1, r2)),
  isTRUE(all.equal(r1, r3)),
  isTRUE(all.equal(r1, r4))
)
## [1] TRUE
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment