Skip to content

Instantly share code, notes, and snippets.

@akhilsbehl
Created February 13, 2014 12:10
Show Gist options
  • Save akhilsbehl/8973994 to your computer and use it in GitHub Desktop.
Save akhilsbehl/8973994 to your computer and use it in GitHub Desktop.
library(dplyr)
library(data.table)
library(microbenchmark)
###########################################################################
# FUNCTIONALITY #
###########################################################################
# Build the seeded 1e7-row sample used by the three implementations below.
# The random draws are made in the same order as the columns are listed
# (x, then the mean and sd used for y, then y, then z, then w), so the
# seeded stream is consumed in a fixed order.
set.seed(pi)
samp <- local({
  n_rows <- 1e7
  x_draws <- runif(n_rows, min = 2, max = 4)
  y_centre <- runif(1)
  y_spread <- runif(1, min = 1, max = 2)
  y_draws <- rnorm(n_rows, mean = y_centre, sd = y_spread)
  z_labels <- letters[sample.int(26, n_rows, replace = TRUE)]
  w_labels <- LETTERS[sample.int(26, n_rows, replace = TRUE)]
  # Keep z and w as character columns rather than factors.
  data.frame(
    x = x_draws,
    y = y_draws,
    z = z_labels,
    w = w_labels,
    stringsAsFactors = FALSE
  )
})
# The process:
# 1. Filter samp to the rows where z is one of the first 20 `letters' and
# w is one of the last 20 `LETTERS'.
# 2. Select only columns x, y, and z out of the data.frame.
# 3. Create two new columns:
# 3.1 xProp = x / sum(x)
# 3.2 yScale = (y - mean(y)) / sd(y)
# 4. Calculate mean(xProp) and mean(yScale) by z.
# 5. Arrange / order the output by z.
############
# Base R #
############
# Base R version of the pipeline. The result is a numeric matrix with one
# row per retained value of z and the columns meanXProp and meanYScale.
baseR <- local({
  # Rows whose z is in letters[1:20] and whose w is in LETTERS[7:26].
  keep <- samp[["z"]] %in% letters[1:20] & samp[["w"]] %in% LETTERS[7:26]
  kept <- samp[keep, c("x", "y", "z")]

  # Both derived columns are computed over all retained rows, before the
  # per-letter split.
  kept[["xProp"]] <- kept[["x"]] / sum(kept[["x"]])
  kept[["yScale"]] <- (kept[["y"]] - mean(kept[["y"]])) / sd(kept[["y"]])

  # Per-group summary: mean of each derived column.
  summarise_group <- function(grp) {
    c(
      "meanXProp" = mean(grp[["xProp"]]),
      "meanYScale" = mean(grp[["yScale"]])
    )
  }

  # by() yields one named vector per value of z; stack them into a matrix.
  do.call("rbind", by(kept, kept[["z"]], summarise_group))
})
###########
# dplyr #
###########
# dplyr version of the pipeline: filter the rows, keep x / y / z, add the
# two derived columns (computed over all retained rows), then average them
# per value of z and sort by z.
#
# The steps are chained with `%>%`, which dplyr re-exports. The original
# `%.%` operator was deprecated and later removed from dplyr, so it no
# longer works on current releases.
dply <-
  samp %>%
  filter(z %in% letters[1:20], w %in% LETTERS[7:26]) %>%
  select(x, y, z) %>%
  mutate(
    xProp = x / sum(x),
    yScale = (y - mean(y)) / sd(y)
  ) %>%
  group_by(z) %>%
  summarise(meanXProp = mean(xProp), meanYScale = mean(yScale)) %>%
  arrange(z)
################
# data.table #
################
# data.table version of the pipeline, written as a single chain:
#   1. filter the rows and keep x, y, z;
#   2. add xProp and yScale, computed over all retained rows;
#   3. average both derived columns per value of z;
#   4. order the result by z.
dt <- data.table(samp)[
  z %in% letters[1:20] & w %in% LETTERS[7:26],
  .(x, y, z)
][
  , .(xProp = x / sum(x), yScale = (y - mean(y)) / sd(y), z)
][
  , .(meanXProp = mean(xProp), meanYScale = mean(yScale)), by = z
][
  order(z)
]
###########################################################################
# BENCHMARKING #
###########################################################################
# Rebuild the seeded 1e7-row sample for the benchmark. The random draws
# are made in the same order as the columns are listed (x, then the mean
# and sd used for y, then y, then z, then w).
set.seed(pi)
samp <- local({
  n_rows <- 1e7
  x_draws <- runif(n_rows, min = 2, max = 4)
  y_centre <- runif(1)
  y_spread <- runif(1, min = 1, max = 2)
  y_draws <- rnorm(n_rows, mean = y_centre, sd = y_spread)
  z_labels <- letters[sample.int(26, n_rows, replace = TRUE)]
  w_labels <- LETTERS[sample.int(26, n_rows, replace = TRUE)]
  # Keep z and w as character columns rather than factors.
  data.frame(
    x = x_draws,
    y = y_draws,
    z = z_labels,
    w = w_labels,
    stringsAsFactors = FALSE
  )
})
# Convert once, outside the timed code, so the data.table expression is
# not charged for the data.frame -> data.table conversion.
dtSamp <- data.table(samp)
# Time the three implementations of the pipeline, 10 evaluations each.
#
# Changes from the original:
#   * the dplyr expression chains with `%>%` (re-exported by dplyr); the
#     original `%.%` operator was deprecated and later removed from dplyr,
#     so the benchmark would fail on current releases;
#   * each expression is named, so the printed table shows a short label
#     instead of the deparsed code of the whole expression.
# The operations being timed are otherwise unchanged.
mbc <- microbenchmark(
  baseR = {
    baseR <- samp[samp[["z"]] %in% letters[1:20] &
                    samp[["w"]] %in% LETTERS[7:26],
                  c("x", "y", "z")]
    baseR[["xProp"]] <- with(baseR, x / sum(x))
    baseR[["yScale"]] <- with(baseR, (y - mean(y)) / sd(y))
    baseR <- by(baseR, baseR[["z"]],
                function(x)
                  c("meanXProp" = mean(x[["xProp"]]),
                    "meanYScale" = mean(x[["yScale"]])))
    baseR <- do.call("rbind", baseR)
  },
  dplyr = {
    dply <-
      samp %>%
      filter(z %in% letters[1:20], w %in% LETTERS[7:26]) %>%
      select(x, y, z) %>%
      mutate(xProp = x / sum(x),
             yScale = (y - mean(y)) / sd(y)) %>%
      group_by(z) %>%
      summarise(meanXProp = mean(xProp), meanYScale = mean(yScale)) %>%
      arrange(z)
  },
  data.table = {
    # Starts from the pre-built dtSamp, so the conversion is not timed.
    dt <- dtSamp[z %in% letters[1:20] & w %in% LETTERS[7:26], list(x, y, z)]
    dt <- dt[, list(xProp = (x / sum(x)), yScale = (y - mean(y)) / sd(y), z)]
    dt <- dt[, list(meanXProp = mean(xProp), meanYScale = mean(yScale)), by = z]
    dt <- dt[order(dt[["z"]]), ]
  },
  times = 10L
)
print(mbc)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment