Skip to content

Instantly share code, notes, and snippets.

@mrdwab
Created September 28, 2014 05:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mrdwab/6282a022b2eadb0fe1c6 to your computer and use it in GitHub Desktop.
Save mrdwab/6282a022b2eadb0fe1c6 to your computer and use it in GitHub Desktop.
## Required packages
library(data.table)
library(dplyr)
library(reshape2)
library(microbenchmark)
## Small sample `data.frame`
dat <- structure(list(Pos = c("Position1", "Position2", "Position3",
"Position1", "Position3", "Position2"), M = c(34L, 45L, 89L,
56L, 54L, 56L), P = c(56L, 23L, 78L, 45L, 35L, 89L)), .Names = c("Pos",
"M", "P"), class = "data.frame", row.names = c(NA, -6L))
## Small sample `matrix`
m1 <- structure(c(34, 45, 89, 56, 54, 56, 56, 23, 78, 45, 35, 89), .Dim = c(6L,
2L), .Dimnames = list(c("Position1", "Position2", "Position3",
"Position1", "Position3", "Position2"), c("M", "P")))
## Built up to ~ 1M rows
dat <- do.call(rbind, replicate(16667, dat, FALSE))
m1 <- do.call(rbind, replicate(16667, m1, FALSE))
## rnso's double aggregate + merge
funRNSO <- function() {
a1 = aggregate(M ~ Pos, dat, sum)
a2 = aggregate(P ~ Pos, dat, sum)
merge(a1, a2)
}
## my suggestion in comments
funRNSOb <- function() aggregate(. ~ Pos, dat, sum)
## Richard Scriven
funRS <- function() t(sapply(split(dat[-1], dat$Pos), colSums))
## Mine
funAM <- function() xtabs(Freq ~ Var1 + Var2, data.frame(as.table(m1)))
## Akrun - dplyr
funAdplyr <- function() {
dat %>%
group_by(Pos) %>%
summarise_each(funs(sum=sum(., na.rm=TRUE)))
}
## Akrun - data.table
funAdt <- function() {
as.data.table(dat)[, lapply(.SD, sum, na.rm=TRUE), by=Pos]
}
## Akrun - by
funAby <- function() {
do.call(rbind, by(m1, list(rownames(m1)), colSums, na.rm=TRUE))
}
## Akrun - acast
funAacast <- function() {
acast(melt(m1), Var1~Var2, value.var="value", sum, na.rm=TRUE)
}
system.time(funRNSO())
# user system elapsed
# 3.20 0.00 3.22
system.time(funRNSOb())
# user system elapsed
# 1.72 0.00 1.72
## ^^ Imagine if we had to aggregate more than 2 columns!
microbenchmark(tapply = funRS(),
xtabs = funAM(),
dplyr = funAdplyr(),
datatable = funAdt(),
by = funAby(),
acast = funAacast(),
times = 20)
# Unit: milliseconds
# expr min lq median uq max neval
# tapply 155.003662 164.664556 175.49503 209.98393 275.68028 20
# xtabs 463.060971 520.557225 580.92654 611.13589 720.41927 20
# dplyr 17.555662 19.042972 21.12384 24.53657 29.02181 20
# datatable 7.277477 8.253766 11.49202 18.26767 26.67410 20
# by 232.103088 255.488470 288.51957 318.80311 378.95220 20
# acast 159.206876 172.837925 213.22048 253.78435 331.10764 20
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment