mrdwab/SO26049028.R

## SO26049028.R
## Required packages
library(data.table)
library(dplyr)
library(reshape2)
library(microbenchmark)

## Small sample `data.frame`
dat <- structure(list(Pos = c("Position1", "Position2", "Position3",
"Position1", "Position3", "Position2"), M = c(34L, 45L, 89L,
56L, 54L, 56L), P = c(56L, 23L, 78L, 45L, 35L, 89L)), .Names = c("Pos",
"M", "P"), class = "data.frame", row.names = c(NA, -6L))

## Small sample `matrix`
m1 <- structure(c(34, 45, 89, 56, 54, 56, 56, 23, 78, 45, 35, 89), .Dim = c(6L,
2L), .Dimnames = list(c("Position1", "Position2", "Position3",
"Position1", "Position3", "Position2"), c("M", "P")))

## Built up to ~ 1M rows
dat <- do.call(rbind, replicate(16667, dat, FALSE))
m1 <- do.call(rbind, replicate(16667, m1, FALSE))


## rnso's double aggregate + merge
funRNSO <- function() {
  a1 = aggregate(M ~ Pos, dat, sum)
  a2 = aggregate(P ~ Pos, dat, sum)
  merge(a1, a2)
}

## my suggestion in comments
funRNSOb <- function() aggregate(. ~ Pos, dat, sum)

## Richard Scriven
funRS <- function() t(sapply(split(dat[-1], dat$Pos), colSums))

## Mine
funAM <- function() xtabs(Freq ~ Var1 + Var2, data.frame(as.table(m1)))

## Akrun - dplyr
funAdplyr <- function() {
  dat %>%
    group_by(Pos) %>%
    summarise_each(funs(sum=sum(., na.rm=TRUE)))
}

## Akrun - data.table
funAdt <- function() {
  as.data.table(dat)[, lapply(.SD, sum, na.rm=TRUE), by=Pos]
}

## Akrun - by
funAby <- function() {
  do.call(rbind, by(m1, list(rownames(m1)), colSums, na.rm=TRUE))
}

## Akrun - acast
funAacast <- function() {
  acast(melt(m1), Var1~Var2, value.var="value", sum, na.rm=TRUE)
}

system.time(funRNSO())
#    user  system elapsed
#    3.20    0.00    3.22
system.time(funRNSOb())
#    user  system elapsed
#    1.72    0.00    1.72

## ^^ Imagine if we had to aggregate more than 2 columns!

microbenchmark(tapply = funRS(),
               xtabs = funAM(),
               dplyr = funAdplyr(),
               datatable = funAdt(),
               by = funAby(),
               acast = funAacast(),
               times = 20)
# Unit: milliseconds
#       expr        min         lq    median        uq       max neval
#     tapply 155.003662 164.664556 175.49503 209.98393 275.68028    20
#      xtabs 463.060971 520.557225 580.92654 611.13589 720.41927    20
#      dplyr  17.555662  19.042972  21.12384  24.53657  29.02181    20
#  datatable   7.277477   8.253766  11.49202  18.26767  26.67410    20
#         by 232.103088 255.488470 288.51957 318.80311 378.95220    20
#      acast 159.206876 172.837925 213.22048 253.78435 331.10764    20
	## Required packages
	library(data.table)
	library(dplyr)
	library(reshape2)
	library(microbenchmark)

	## Small sample `data.frame`
	dat <- structure(list(Pos = c("Position1", "Position2", "Position3",
	"Position1", "Position3", "Position2"), M = c(34L, 45L, 89L,
	56L, 54L, 56L), P = c(56L, 23L, 78L, 45L, 35L, 89L)), .Names = c("Pos",
	"M", "P"), class = "data.frame", row.names = c(NA, -6L))

	## Small sample `matrix`
	m1 <- structure(c(34, 45, 89, 56, 54, 56, 56, 23, 78, 45, 35, 89), .Dim = c(6L,
	2L), .Dimnames = list(c("Position1", "Position2", "Position3",
	"Position1", "Position3", "Position2"), c("M", "P")))

	## Built up to ~ 1M rows
	dat <- do.call(rbind, replicate(16667, dat, FALSE))
	m1 <- do.call(rbind, replicate(16667, m1, FALSE))


	## rnso's double aggregate + merge
	funRNSO <- function() {
	a1 = aggregate(M ~ Pos, dat, sum)
	a2 = aggregate(P ~ Pos, dat, sum)
	merge(a1, a2)
	}

	## my suggestion in comments
	funRNSOb <- function() aggregate(. ~ Pos, dat, sum)

	## Richard Scriven
	funRS <- function() t(sapply(split(dat[-1], dat$Pos), colSums))

	## Mine
	funAM <- function() xtabs(Freq ~ Var1 + Var2, data.frame(as.table(m1)))

	## Akrun - dplyr
	funAdplyr <- function() {
	dat %>%
	group_by(Pos) %>%
	summarise_each(funs(sum=sum(., na.rm=TRUE)))
	}

	## Akrun - data.table
	funAdt <- function() {
	as.data.table(dat)[, lapply(.SD, sum, na.rm=TRUE), by=Pos]
	}

	## Akrun - by
	funAby <- function() {
	do.call(rbind, by(m1, list(rownames(m1)), colSums, na.rm=TRUE))
	}

	## Akrun - acast
	funAacast <- function() {
	acast(melt(m1), Var1~Var2, value.var="value", sum, na.rm=TRUE)
	}

	system.time(funRNSO())
	# user system elapsed
	# 3.20 0.00 3.22
	system.time(funRNSOb())
	# user system elapsed
	# 1.72 0.00 1.72

	## ^^ Imagine if we had to aggregate more than 2 columns!

	microbenchmark(tapply = funRS(),
	xtabs = funAM(),
	dplyr = funAdplyr(),
	datatable = funAdt(),
	by = funAby(),
	acast = funAacast(),
	times = 20)
	# Unit: milliseconds
	# expr min lq median uq max neval
	# tapply 155.003662 164.664556 175.49503 209.98393 275.68028 20
	# xtabs 463.060971 520.557225 580.92654 611.13589 720.41927 20
	# dplyr 17.555662 19.042972 21.12384 24.53657 29.02181 20
	# datatable 7.277477 8.253766 11.49202 18.26767 26.67410 20
	# by 232.103088 255.488470 288.51957 318.80311 378.95220 20
	# acast 159.206876 172.837925 213.22048 253.78435 331.10764 20