public
Last active

data.table version of rbind.fill benchmarking with plyr version of rbind.fill

  • Download Gist
rbind_fill_benchmarking
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
# The post with the benchmarking results is at the link below:
# http://stackoverflow.com/questions/18003717/is-there-any-efficient-way-than-rbind-filllist/18004698#18004698
 
# This is the script with which the benchmarking and plots were generated in case anyone else wants to replicate it.
# Note: it takes about 2-3 hours for the benchmarking to finish.
 
require(plyr)
require(data.table)
require(ggplot2)
require(microbenchmark)
 
# data.table version of rbind.fill (first/rough version, improvements should be possible)
#
# Row-binds a list of data.frames / data.tables whose column sets differ.
# Columns absent from an element are added as NA_character_, and every element
# is reordered to the union of all column names (in order of first appearance)
# before being stacked with rbindlist().
#
# ll: list of data.frame-like objects with named columns.
# Returns a single data.table containing all rows of all elements.
#
# NOTE(review): current data.table releases expose this directly as
# rbindlist(ll, use.names = TRUE, fill = TRUE); the hand-rolled version is kept
# here because it is the implementation being benchmarked.
rbind.fill.DT <- function(ll) {
  all.names <- lapply(ll, names) # lapply (not sapply) so the result is always a list
  unq.names <- unique(unlist(all.names))
  rbindlist(lapply(seq_along(ll), function(x) {
    # Work on a data.table copy. The previous version switched the class
    # attribute of ll[[x]] by reference (leaking the change back to the caller's
    # list) and depended on the unexported data.table:::settruelength.
    tt <- as.data.table(ll[[x]])
    absent <- unq.names[!unq.names %chin% all.names[[x]]]
    # Only call `:=` when there is something to add; an empty LHS is not valid.
    if (length(absent) > 0L) {
      tt[, (absent) := NA_character_]
    }
    # setcolorder() reorders by reference and returns the table, which becomes
    # this element of the list handed to rbindlist().
    setcolorder(tt, unq.names)
  }))
}
 
# plyr rbind.fill
#
# Thin wrapper that forwards the list straight to rbind.fill(), so both
# contenders in the benchmark are called with the same one-argument signature.
#
# ll: list of data.frames to combine.
# Returns whatever rbind.fill() returns for that list.
rbind.fill.PLYR <- function(ll) {
  combined <- rbind.fill(ll)
  combined
}
 
# Function to generate one sample data.frame with a varying set of columns
set.seed(45)
# Builds a 10-row data.frame whose columns are a random subset of LETTERS
# (between 5 and 15 of them, no repeats) filled with random lowercase letters.
# Takes no arguments; the result depends on the current RNG state.
sample.fun <- function() {
  # Draw a single column count. The earlier form, sample(5:15), produced a full
  # 11-element permutation and passed all of it as `size`, relying on only the
  # first element being used.
  n.cols <- sample(5:15, 1L)
  nam <- sample(LETTERS, n.cols)
  val <- data.frame(matrix(sample(letters, n.cols * 10, replace = TRUE), nrow = 10))
  setNames(val, nam)
}
 
# List lengths to benchmark: 1000, 2000, ..., 10000 data.frames per run.
vals <- seq(1000, 10000, by = 1000)
# For every list length, build a fresh list of sample data.frames, time both
# implementations 10 times each, and keep the median / max / min per expression.
# The per-length summaries are stacked into one data.table.
timings <- rbindlist(lapply(vals, function(x) {
  print(x) # progress indicator: the list length currently being benchmarked
  # simplify = FALSE guarantees a plain list of data.frames. With the default
  # simplification, draws that all happen to have the same number of columns
  # would be collapsed into a matrix instead.
  ll <- replicate(x, sample.fun(), simplify = FALSE)
  ww <- microbenchmark(t1 <- rbind.fill.DT(ll),
                       t2 <- rbind.fill.PLYR(ll), times = 10)
  # microbenchmark records times in nanoseconds; dividing by 1e9 gives seconds.
  data.table(Time = ww$time / 1e9,
             Type = ww$expr)[, list(Median = median(Time),
                                    Max = max(Time),
                                    Min = min(Time)), by = Type]
}))
 
# This is the data I obtained after the time-consuming run
# > timings
# Type Median Max Min
# 1: t1 <- rbind.fill.DT(ll) 0.9954604 1.048112 0.9857269
# 2: t2 <- rbind.fill.PLYR(ll) 1.2641116 1.338004 1.2212289
# 3: t2 <- rbind.fill.PLYR(ll) 8.4721998 8.735089 5.9465172
# 4: t1 <- rbind.fill.DT(ll) 2.0714826 2.197611 2.0515592
# 5: t2 <- rbind.fill.PLYR(ll) 17.6816796 36.607170 15.4232463
# 6: t1 <- rbind.fill.DT(ll) 3.3759282 3.420131 3.2041830
# 7: t2 <- rbind.fill.PLYR(ll) 34.2991906 94.734541 23.3022808
# 8: t1 <- rbind.fill.DT(ll) 4.6182805 5.113477 4.5497483
# 9: t2 <- rbind.fill.PLYR(ll) 40.7826024 123.284631 32.9824400
# 10: t1 <- rbind.fill.DT(ll) 5.4668071 6.506895 5.1054583
# 11: t2 <- rbind.fill.PLYR(ll) 54.8779719 158.539573 41.0948270
# 12: t1 <- rbind.fill.DT(ll) 6.9966963 7.298266 6.7938445
# 13: t1 <- rbind.fill.DT(ll) 7.8084107 8.513016 7.2517920
# 14: t2 <- rbind.fill.PLYR(ll) 70.8803392 154.592278 62.8290179
# 15: t2 <- rbind.fill.PLYR(ll) 113.2118155 145.739713 91.2401254
# 16: t1 <- rbind.fill.DT(ll) 8.6552054 10.291779 8.2413973
# 17: t2 <- rbind.fill.PLYR(ll) 136.0497395 283.447510 102.8063142
# 18: t1 <- rbind.fill.DT(ll) 10.4087178 11.862445 10.0815655
# 19: t1 <- rbind.fill.DT(ll) 11.8507923 12.687897 11.4170676
# 20: t2 <- rbind.fill.PLYR(ll) 202.2200270 328.348136 171.4381696
 
# plot it
# Collapse the benchmark expression labels to the short method names.
timings[, Type := ifelse(grepl("DT", Type), "DT", "PLYR")]
# Tag the rows with list lengths: each length is repeated for two consecutive rows.
timings[, List_Length := rep(seq(1000, 1e4, by = 1000), each = 2)]
# Median run time against list length, one coloured line per method.
pp <- ggplot(data = timings, aes(x = List_Length, y = Median, colour = Type)) +
  geom_line() +
  geom_point()
pp

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.