# arunsrinivasan/rbind_fill_benchmarking

## rbind_fill_benchmarking
# The post with benchmarking results is the link given below:
# http://stackoverflow.com/questions/18003717/is-there-any-efficient-way-than-rbind-filllist/18004698#18004698

# This is the script with which the benchmarking and plots were generated in case anyone else wants to replicate it.
# Note: it takes about 2-3 hours for the benchmarking to finish.

require(plyr)
require(data.table)
require(ggplot2)
require(microbenchmark)

# data.table version of rbind.fill (first/rough version, improvements should be possible)
rbind.fill.DT <- function(ll) {
    # Row-bind a list of data.frames whose column sets may differ.
    #
    # ll: list of data.frames (or data.tables) with character column names.
    # Returns one data.table holding every input row. Its columns are the
    # union of all input column names, in order of first appearance; a
    # column absent from a given input is filled with NA_character_.
    all.names <- lapply(ll, names) # lapply (not sapply) so the result is always a list
    unq.names <- unique(unlist(all.names))
    padded <- lapply(seq_along(ll), function(x) {
        # as.data.table() builds a new data.table object, so the caller's
        # data.frames are not re-classed by reference and no unexported
        # data.table internals are needed to make `:=` usable.
        tt <- as.data.table(ll[[x]])
        absent <- unq.names[!unq.names %chin% all.names[[x]]]
        # Only touch the table when there is something to add.
        if (length(absent) > 0L) {
            tt[, (absent) := NA_character_]
        }
        # rbindlist() below binds by position, so every piece must share
        # the same column order.
        setcolorder(tt, unq.names)
    })
    rbindlist(padded)
}

# plyr rbind.fill
rbind.fill.PLYR <- function(ll) {
    # Thin wrapper: hands the list `ll` straight to rbind.fill() and
    # returns whatever it produces.
    combined <- rbind.fill(ll)
    combined
}

# Function to generate one sample data.frame with a random number of columns
set.seed(45)
sample.fun <- function() {
   # Build one random sample data.frame: 10 rows of lowercase letters in
   # between 5 and 15 columns, named with distinct uppercase letters.
   #
   # Draw a single column count. sample(5:15) on its own returns all 11
   # values shuffled, which is not a valid scalar `size`.
   n.cols <- sample(5:15, 1L)
   nam <- sample(LETTERS, n.cols)
   val <- data.frame(matrix(sample(letters, n.cols * 10, replace = TRUE), nrow = 10))
   setNames(val, nam)
}

# List lengths to benchmark.
vals <- seq(1000, 10000, by=1000)
# For each list length, time both implementations 10 times and keep the
# median / max / min per implementation, in seconds.
timings <- rbindlist(lapply(vals, function(x) {
   print(x) # progress indicator
   # simplify = FALSE guarantees a plain list of data.frames; the default
   # would collapse the result to a matrix if every sample happened to
   # have the same number of columns.
   ll <- replicate(x, sample.fun(), simplify = FALSE)
   ww <- microbenchmark(t1 <- rbind.fill.DT(ll),
            t2 <- rbind.fill.PLYR(ll), times=10)
   # ww$time is divided by 1e9 to report seconds
   # (presumably microbenchmark records nanoseconds -- confirm).
   data.table(Time = ww$time/1e9,
      Type = ww$expr)[, list(Median = median(Time),
      Max = max(Time), Min = min(Time)), by=Type]
}))

# This is the data I obtained after the time-consuming run
# > timings
#                         Type      Median        Max         Min
#  1:   t1 <- rbind.fill.DT(ll)   0.9954604   1.048112   0.9857269
#  2: t2 <- rbind.fill.PLYR(ll)   1.2641116   1.338004   1.2212289
#  3: t2 <- rbind.fill.PLYR(ll)   8.4721998   8.735089   5.9465172
#  4:   t1 <- rbind.fill.DT(ll)   2.0714826   2.197611   2.0515592
#  5: t2 <- rbind.fill.PLYR(ll)  17.6816796  36.607170  15.4232463
#  6:   t1 <- rbind.fill.DT(ll)   3.3759282   3.420131   3.2041830
#  7: t2 <- rbind.fill.PLYR(ll)  34.2991906  94.734541  23.3022808
#  8:   t1 <- rbind.fill.DT(ll)   4.6182805   5.113477   4.5497483
#  9: t2 <- rbind.fill.PLYR(ll)  40.7826024 123.284631  32.9824400
# 10:   t1 <- rbind.fill.DT(ll)   5.4668071   6.506895   5.1054583
# 11: t2 <- rbind.fill.PLYR(ll)  54.8779719 158.539573  41.0948270
# 12:   t1 <- rbind.fill.DT(ll)   6.9966963   7.298266   6.7938445
# 13:   t1 <- rbind.fill.DT(ll)   7.8084107   8.513016   7.2517920
# 14: t2 <- rbind.fill.PLYR(ll)  70.8803392 154.592278  62.8290179
# 15: t2 <- rbind.fill.PLYR(ll) 113.2118155 145.739713  91.2401254
# 16:   t1 <- rbind.fill.DT(ll)   8.6552054  10.291779   8.2413973
# 17: t2 <- rbind.fill.PLYR(ll) 136.0497395 283.447510 102.8063142
# 18:   t1 <- rbind.fill.DT(ll)  10.4087178  11.862445  10.0815655
# 19:   t1 <- rbind.fill.DT(ll)  11.8507923  12.687897  11.4170676
# 20: t2 <- rbind.fill.PLYR(ll) 202.2200270 328.348136 171.4381696

# plot it
# Shorten the benchmark expression labels to "DT" / "PLYR".
timings[, Type := ifelse(grepl("DT", Type), "DT", "PLYR")]
# Tag rows with their list length; this relies on timings holding two rows
# per list length, in run order.
timings[, List_Length := rep(seq(1000, 1e4, by=1000), each=2)]
# Median run time against list length, one line per implementation.
pp <- ggplot(data = timings, aes(x = List_Length, y = Median, colour = Type)) +
    geom_line() +
    geom_point()
pp
	# The post with benchmarking results is the link given below:
	# http://stackoverflow.com/questions/18003717/is-there-any-efficient-way-than-rbind-filllist/18004698#18004698

	# This is the script with which the benchmarking and plots were generated in case anyone else wants to replicate it.
	# Note: it takes about 2-3 hours for the benchmarking to finish.

	require(plyr)
	require(data.table)
	require(ggplot2)
	require(microbenchmark)

	# data.table version of rbind.fill (first/rough version, improvements should be possible)
	rbind.fill.DT <- function(ll) {
	    # Row-bind a list of data.frames whose column sets may differ.
	    #
	    # ll: list of data.frames (or data.tables) with character column names.
	    # Returns one data.table holding every input row. Its columns are the
	    # union of all input column names, in order of first appearance; a
	    # column absent from a given input is filled with NA_character_.
	    all.names <- lapply(ll, names) # lapply (not sapply) so the result is always a list
	    unq.names <- unique(unlist(all.names))
	    padded <- lapply(seq_along(ll), function(x) {
	        # as.data.table() builds a new data.table object, so the caller's
	        # data.frames are not re-classed by reference and no unexported
	        # data.table internals are needed to make `:=` usable.
	        tt <- as.data.table(ll[[x]])
	        absent <- unq.names[!unq.names %chin% all.names[[x]]]
	        # Only touch the table when there is something to add.
	        if (length(absent) > 0L) {
	            tt[, (absent) := NA_character_]
	        }
	        # rbindlist() below binds by position, so every piece must share
	        # the same column order.
	        setcolorder(tt, unq.names)
	    })
	    rbindlist(padded)
	}

	# plyr rbind.fill
	rbind.fill.PLYR <- function(ll) {
	    # Thin wrapper: hands the list `ll` straight to rbind.fill() and
	    # returns whatever it produces.
	    combined <- rbind.fill(ll)
	    combined
	}

	# Function to generate one sample data.frame with a random number of columns
	set.seed(45)
	sample.fun <- function() {
	    # Build one random sample data.frame: 10 rows of lowercase letters in
	    # between 5 and 15 columns, named with distinct uppercase letters.
	    #
	    # Draw a single column count. sample(5:15) on its own returns all 11
	    # values shuffled, which is not a valid scalar `size`.
	    n.cols <- sample(5:15, 1L)
	    nam <- sample(LETTERS, n.cols)
	    val <- data.frame(matrix(sample(letters, n.cols * 10, replace = TRUE), nrow = 10))
	    setNames(val, nam)
	}

	# List lengths to benchmark.
	vals <- seq(1000, 10000, by=1000)
	# For each list length, time both implementations 10 times and keep the
	# median / max / min per implementation, in seconds.
	timings <- rbindlist(lapply(vals, function(x) {
	    print(x) # progress indicator
	    # simplify = FALSE guarantees a plain list of data.frames; the default
	    # would collapse the result to a matrix if every sample happened to
	    # have the same number of columns.
	    ll <- replicate(x, sample.fun(), simplify = FALSE)
	    ww <- microbenchmark(t1 <- rbind.fill.DT(ll),
	        t2 <- rbind.fill.PLYR(ll), times=10)
	    # ww$time is divided by 1e9 to report seconds
	    # (presumably microbenchmark records nanoseconds -- confirm).
	    data.table(Time = ww$time/1e9,
	        Type = ww$expr)[, list(Median = median(Time),
	        Max = max(Time), Min = min(Time)), by=Type]
	}))

	# This is the data I obtained after the time-consuming run
	# > timings
	# Type Median Max Min
	# 1: t1 <- rbind.fill.DT(ll) 0.9954604 1.048112 0.9857269
	# 2: t2 <- rbind.fill.PLYR(ll) 1.2641116 1.338004 1.2212289
	# 3: t2 <- rbind.fill.PLYR(ll) 8.4721998 8.735089 5.9465172
	# 4: t1 <- rbind.fill.DT(ll) 2.0714826 2.197611 2.0515592
	# 5: t2 <- rbind.fill.PLYR(ll) 17.6816796 36.607170 15.4232463
	# 6: t1 <- rbind.fill.DT(ll) 3.3759282 3.420131 3.2041830
	# 7: t2 <- rbind.fill.PLYR(ll) 34.2991906 94.734541 23.3022808
	# 8: t1 <- rbind.fill.DT(ll) 4.6182805 5.113477 4.5497483
	# 9: t2 <- rbind.fill.PLYR(ll) 40.7826024 123.284631 32.9824400
	# 10: t1 <- rbind.fill.DT(ll) 5.4668071 6.506895 5.1054583
	# 11: t2 <- rbind.fill.PLYR(ll) 54.8779719 158.539573 41.0948270
	# 12: t1 <- rbind.fill.DT(ll) 6.9966963 7.298266 6.7938445
	# 13: t1 <- rbind.fill.DT(ll) 7.8084107 8.513016 7.2517920
	# 14: t2 <- rbind.fill.PLYR(ll) 70.8803392 154.592278 62.8290179
	# 15: t2 <- rbind.fill.PLYR(ll) 113.2118155 145.739713 91.2401254
	# 16: t1 <- rbind.fill.DT(ll) 8.6552054 10.291779 8.2413973
	# 17: t2 <- rbind.fill.PLYR(ll) 136.0497395 283.447510 102.8063142
	# 18: t1 <- rbind.fill.DT(ll) 10.4087178 11.862445 10.0815655
	# 19: t1 <- rbind.fill.DT(ll) 11.8507923 12.687897 11.4170676
	# 20: t2 <- rbind.fill.PLYR(ll) 202.2200270 328.348136 171.4381696

	# plot it
	# Shorten the benchmark expression labels to "DT" / "PLYR".
	timings[, Type := ifelse(grepl("DT", Type), "DT", "PLYR")]
	# Tag rows with their list length; this relies on timings holding two rows
	# per list length, in run order.
	timings[, List_Length := rep(seq(1000, 1e4, by=1000), each=2)]
	# Median run time against list length, one line per implementation.
	pp <- ggplot(data = timings, aes(x = List_Length, y = Median, colour = Type)) +
	    geom_line() +
	    geom_point()
	pp