Here's the code for comparing base R (v3.0.3 and v3.1.0) with data.table's sub-assignment by reference feature:
# Benchmark: time the assignment of a single row in a data.frame with base R
# `[<-` against data.table's set() (sub-assignment by reference), for
# increasing numbers of rows. Each case is run 10 times and summarised.
library(data.table)

set.seed(20140430)
N <- as.integer(10^(3:7) * 2L)  # 2e3, 2e4, ..., 2e7 rows
ans <- vector("list", length(N))

for (i in seq_along(N)) {
  print(i)  # progress indicator
  nreg <- N[i]
  val <- c(1:5, "a", "b")  # character vector, one value per column

  df <- as.data.frame(matrix(rep(NA, nreg * 7L), nrow = nreg, ncol = 7))
  # Writing a character row converts every column to character before the
  # timed runs (presumably so they do not also measure a type conversion).
  df[1e3, ] <- val

  # The row index is drawn outside system.time() so that only the assignment
  # is timed, not sample() or the construction of the 1:nreg index vector.
  # sample.int(nreg, 1L) consumes the RNG exactly like sample(1:nreg, 1).
  t1 <- summary(replicate(10L, {
    idx <- sample.int(nreg, 1L)
    system.time(df[idx, ] <- val)[["elapsed"]]
  }))

  dt <- as.data.table(df)
  val <- as.list(val)  # set() is given one list element per column
  t2 <- summary(replicate(10L, {
    idx <- sample.int(nreg, 1L)
    system.time(
      set(dt, i = idx, j = seq_along(dt), value = val)
    )[["elapsed"]]
  }))

  gc()
  # NOTE: the base R column is called `base_303` whichever R version runs
  # this script; the plotting code reads it under that name.
  ans[[i]] <- data.table(
    type = names(t1), N = nreg,
    base_303 = unname(t1), dt = unname(t2)
  )
}
ans <- rbindlist(ans)
And here are the timing results that were used to generate the plots here on SO.
## R v3.0.3 vs data.table
> ans
type N base_303 dt
1: Min. 2000 0.00000 0.00000
2: 1st Qu. 2000 0.00100 0.00000
3: Median 2000 0.00100 0.00000
4: Mean 2000 0.00080 0.00020
5: 3rd Qu. 2000 0.00100 0.00000
6: Max. 2000 0.00100 0.00100
7: Min. 20000 0.00300 0.00000
8: 1st Qu. 20000 0.00300 0.00000
9: Median 20000 0.00300 0.00000
10: Mean 20000 0.00360 0.00000
11: 3rd Qu. 20000 0.00375 0.00000
12: Max. 20000 0.00600 0.00000
13: Min. 200000 0.03700 0.00000
14: 1st Qu. 200000 0.03825 0.00000
15: Median 200000 0.03900 0.00100
16: Mean 200000 0.07800 0.00060
17: 3rd Qu. 200000 0.04150 0.00100
18: Max. 200000 0.42100 0.00100
19: Min. 2000000 1.00500 0.00400
20: 1st Qu. 2000000 1.04400 0.00400
21: Median 2000000 1.07000 0.00450
22: Mean 2000000 1.18100 0.00450
23: 3rd Qu. 2000000 1.09400 0.00500
24: Max. 2000000 2.13400 0.00500
25: Min. 20000000 10.05000 0.04200
26: 1st Qu. 20000000 10.27000 0.04225
27: Median 20000000 10.73000 0.04350
28: Mean 20000000 11.52000 0.04400
29: 3rd Qu. 20000000 11.14000 0.04400
30: Max. 20000000 18.51000 0.04900
type N base_303 dt
## R v3.1.0 vs data.table
> ans
type N base_303 dt
1: Min. 2000 0.0010 0.00000
2: 1st Qu. 2000 0.0010 0.00000
3: Median 2000 0.0010 0.00000
4: Mean 2000 0.0011 0.00040
5: 3rd Qu. 2000 0.0010 0.00100
6: Max. 2000 0.0020 0.00100
7: Min. 20000 0.0010 0.00000
8: 1st Qu. 20000 0.0010 0.00000
9: Median 20000 0.0020 0.00000
10: Mean 20000 0.0016 0.00010
11: 3rd Qu. 20000 0.0020 0.00000
12: Max. 20000 0.0020 0.00100
13: Min. 200000 0.0130 0.00000
14: 1st Qu. 200000 0.0140 0.00000
15: Median 200000 0.0140 0.00000
16: Mean 200000 0.0236 0.00020
17: 3rd Qu. 200000 0.0140 0.00000
18: Max. 200000 0.1120 0.00100
19: Min. 2000000 0.1530 0.00400
20: 1st Qu. 2000000 0.1652 0.00425
21: Median 2000000 0.1715 0.00500
22: Mean 2000000 0.2196 0.00470
23: 3rd Qu. 2000000 0.1773 0.00500
24: Max. 2000000 0.6620 0.00500
25: Min. 20000000 1.6720 0.04200
26: 1st Qu. 20000000 1.8780 0.04300
27: Median 20000000 1.9650 0.04300
28: Mean 20000000 2.3750 0.04520
29: 3rd Qu. 20000000 2.3310 0.04800
30: Max. 20000000 5.2010 0.05100
type N base_303 dt
# Plot the median timing of base R and data.table against the number of rows,
# using the `ans` table produced by the benchmark.
library(ggplot2)
library(data.table)

# Cast to one row per size and one column per summary statistic, separately
# for the base R timings and the data.table timings.
ans.c1 <- dcast.data.table(ans, N ~ type, value.var = "base_303")
ans.c2 <- dcast.data.table(ans, N ~ type, value.var = "dt")
ans.f <- rbindlist(list(ans.c1, ans.c2))[, Mean := NULL]

# Rename by old name instead of by position, so the mapping does not depend
# on the column order produced by the cast.
setnames(
  ans.f,
  old = c("N", "1st Qu.", "3rd Qu.", "Max.", "Median", "Min."),
  new = c("size", "Q25", "Q75", "Max", "Med", "Min")
)

# Rows from ans.c1 (base R) come first, followed by rows from ans.c2 (dt).
ans.f[, type := factor(rep(c("base_310", "dt"), each = nrow(ans.c1)))]

# NOTE: the legend label ("base_310") and the plot title are hard-coded for
# R 3.1.0. When plotting the R 3.0.3 results, change both and name the saved
# plot accordingly.
ggplot(ans.f, aes(x = size, y = Med)) +
  geom_line(aes(colour = type)) +
  geom_point(aes(colour = type), size = 4) +
  xlab("size") +
  ylab("Median time (in seconds)") +
  scale_y_continuous(breaks = seq(0, 10, by = 1L)) +
  scale_x_continuous(breaks = ans.f$size) +
  ggtitle("R 3.1.0 vs data.table") +
  coord_trans(x = "log") +
  theme(text = element_text(size = 26))