Last active

Embed URL

HTTPS clone URL

SSH clone URL

You can clone with HTTPS or SSH.

Download Gist

data.table version of rbind.fill benchmarking with plyr version of rbind.fill

View rbind_fill_benchmarking
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
# The post with benchmarking results is the link given below:
# http://stackoverflow.com/questions/18003717/is-there-any-efficient-way-than-rbind-filllist/18004698#18004698
 
# This is the script with which the benchmarking and plots were generated in case anyone else wants to replicate it.
# Note: it takes about 2-3 hours for the benchmarking to finish.
 
require(plyr)
require(data.table)
require(ggplot2)
require(microbenchmark)
 
# data.table version of rbind.fill (first/rough version, improvements should be possible)
rbind.fill.DT <- function(ll) {
all.names <- lapply(ll, names) # changed sapply to lapply to return a list always
unq.names <- unique(unlist(all.names))
ll.m <- rbindlist(lapply(seq_along(ll), function(x) {
tt <- ll[[x]]
setattr(tt, 'class', c('data.table', 'data.frame'))
data.table:::settruelength(tt, 0L)
invisible(alloc.col(tt))
tt[, c(unq.names[!unq.names %chin% all.names[[x]]]) := NA_character_]
setcolorder(tt, unq.names)
}))
}
 
# plyr rbind.fill
rbind.fill.PLYR <- function(ll) {
rbind.fill(ll)
}
 
# Function to generate sample data of varying list length
set.seed(45)
sample.fun <- function() {
nam <- sample(LETTERS, sample(5:15))
val <- data.frame(matrix(sample(letters, length(nam)*10,replace=TRUE),nrow=10))
setNames(val, nam)
}
 
vals <- seq(1000, 10000, by=1000)
timings <- rbindlist(lapply(vals, function(x) {
print(x)
ll <- replicate(x, sample.fun())
ww <- microbenchmark(t1 <- rbind.fill.DT(ll),
t2 <- rbind.fill.PLYR(ll), times=10)
data.table(Time = ww$time/1e9,
Type = ww$expr)[, list(Median = median(Time),
Max = max(Time), Min = min(Time)), by=Type]
}))
 
# This is the data I obtained after the time-consuming run
> timings
# Type Median Max Min
# 1: t1 <- rbind.fill.DT(ll) 0.9954604 1.048112 0.9857269
# 2: t2 <- rbind.fill.PLYR(ll) 1.2641116 1.338004 1.2212289
# 3: t2 <- rbind.fill.PLYR(ll) 8.4721998 8.735089 5.9465172
# 4: t1 <- rbind.fill.DT(ll) 2.0714826 2.197611 2.0515592
# 5: t2 <- rbind.fill.PLYR(ll) 17.6816796 36.607170 15.4232463
# 6: t1 <- rbind.fill.DT(ll) 3.3759282 3.420131 3.2041830
# 7: t2 <- rbind.fill.PLYR(ll) 34.2991906 94.734541 23.3022808
# 8: t1 <- rbind.fill.DT(ll) 4.6182805 5.113477 4.5497483
# 9: t2 <- rbind.fill.PLYR(ll) 40.7826024 123.284631 32.9824400
# 10: t1 <- rbind.fill.DT(ll) 5.4668071 6.506895 5.1054583
# 11: t2 <- rbind.fill.PLYR(ll) 54.8779719 158.539573 41.0948270
# 12: t1 <- rbind.fill.DT(ll) 6.9966963 7.298266 6.7938445
# 13: t1 <- rbind.fill.DT(ll) 7.8084107 8.513016 7.2517920
# 14: t2 <- rbind.fill.PLYR(ll) 70.8803392 154.592278 62.8290179
# 15: t2 <- rbind.fill.PLYR(ll) 113.2118155 145.739713 91.2401254
# 16: t1 <- rbind.fill.DT(ll) 8.6552054 10.291779 8.2413973
# 17: t2 <- rbind.fill.PLYR(ll) 136.0497395 283.447510 102.8063142
# 18: t1 <- rbind.fill.DT(ll) 10.4087178 11.862445 10.0815655
# 19: t1 <- rbind.fill.DT(ll) 11.8507923 12.687897 11.4170676
# 20: t2 <- rbind.fill.PLYR(ll) 202.2200270 328.348136 171.4381696
 
# plot it
timings[, Type := ifelse(grepl("DT", Type), "DT", "PLYR")][, List_Length := rep(seq(1000, 1e4, by=1000), each=2)]
pp <- ggplot(data = timings, aes(x = List_Length, y = Median, colour = Type)) + geom_line() + geom_point()
pp
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.