## Source: akhilsbehl/dplyr-data.table.R

## dplyr-data.table.R
library(dplyr)
library(data.table)
library(microbenchmark)

###########################################################################
#                              FUNCTIONALITY                              #
###########################################################################

# Build the reproducible 10-million-row sample used by every approach below:
#   x - uniform draws between 2 and 4
#   y - normal draws whose mean and sd are themselves drawn at random
#   z - random lower-case letters (character, not factor)
#   w - random upper-case letters (character, not factor)
# The draws are made in the order x, mean, sd, y, z, w; changing that order
# would change the generated values for a given seed.
set.seed(pi)
samp <- local({
  nObs <- 1e7
  xVals <- runif(nObs, 2, 4)
  yMean <- runif(1)
  ySd <- runif(1, 1, 2)
  yVals <- rnorm(nObs, mean = yMean, sd = ySd)
  zVals <- letters[sample.int(26, nObs, TRUE)]
  wVals <- LETTERS[sample.int(26, nObs, TRUE)]
  data.frame(x = xVals, y = yVals, z = zVals, w = wVals,
             stringsAsFactors = FALSE)
})

# The process:
# 1. Filter samp to the rows where z is one of the first 20 `letters' and w
#    is one of the last 20 LETTERS.
# 2. Select only columns x, y, and z out of the data.frame.
# 3. Create two new columns:
#    3.1 xProp = x / sum(x)
#    3.2 yScale = (y - mean(y)) / sd(y)
# 4. Calculate mean(xProp) and mean(yScale) by z.
# 5. Arrange / Order output by letters.

############
#  Base R  #
############

# Steps 1-2: keep the rows matching both letter filters and only the columns
# x, y and z.
baseR <- samp[
  samp[["z"]] %in% letters[1:20] & samp[["w"]] %in% LETTERS[7:26],
  c("x", "y", "z")
]

# Step 3: derived columns computed over the whole filtered sample.
baseR[["xProp"]] <- baseR[["x"]] / sum(baseR[["x"]])
baseR[["yScale"]] <- (baseR[["y"]] - mean(baseR[["y"]])) / sd(baseR[["y"]])

# Steps 4-5: one pair of means per value of z, then bind the per-group
# vectors into a matrix with one row per group.
baseR <- do.call(
  "rbind",
  by(baseR, baseR[["z"]], function(grp) {
    c(meanXProp = mean(grp[["xProp"]]), meanYScale = mean(grp[["yScale"]]))
  })
)

###########
#  dplyr  #
###########

# Same five steps expressed as a dplyr chain: filter rows, keep x/y/z, add the
# two derived columns over the whole filtered sample, then average them per z
# and sort by z.
# The chain uses `%>%` (exported by dplyr) because the original `%.%`
# operator was deprecated and later removed from dplyr.
dply <-
  samp %>%
  filter(z %in% letters[1:20], w %in% LETTERS[7:26]) %>%
  select(x, y, z) %>%
  mutate(xProp = x / sum(x),
         yScale = (y - mean(y)) / sd(y)) %>%
  group_by(z) %>%
  summarise(meanXProp = mean(xProp), meanYScale = mean(yScale)) %>%
  arrange(z)

################
#  data.table  #
################

# Same five steps with data.table, written as one chain of `[` calls:
# filter and keep x/y/z, add the two derived columns over the whole filtered
# sample (carrying z along), then average them per z.
dt <- data.table(samp)
dt <- dt[z %in% letters[1:20] & w %in% LETTERS[7:26], .(x, y, z)][
  , .(xProp = x / sum(x), yScale = (y - mean(y)) / sd(y), z)][
  , .(meanXProp = mean(xProp), meanYScale = mean(yScale)), by = z]

# Finally sort the per-group result by z.
dt <- dt[order(dt[["z"]]), ]

###########################################################################
#                              BENCHMARKING                               #
###########################################################################

# Rebuild the same 10-million-row sample for the benchmark, starting from the
# same seed. Columns: x uniform between 2 and 4, y normal with a randomly
# drawn mean and sd, z random lower-case letters, w random upper-case letters.
# The draws are made in the order x, mean, sd, y, z, w.
set.seed(pi)
samp <- local({
  nObs <- 1e7
  xVals <- runif(nObs, 2, 4)
  yMean <- runif(1)
  ySd <- runif(1, 1, 2)
  yVals <- rnorm(nObs, mean = yMean, sd = ySd)
  zVals <- letters[sample.int(26, nObs, TRUE)]
  wVals <- LETTERS[sample.int(26, nObs, TRUE)]
  data.frame(x = xVals, y = yVals, z = zVals, w = wVals,
             stringsAsFactors = FALSE)
})
# Convert the sample to a data.table once, outside the timed expressions, so
# the data.table timing below does not include the conversion.
dtSamp <- data.table(samp)

# Time the three equivalent pipelines, 10 evaluations each. The expressions
# are named so the printed summary shows short labels instead of deparsed
# code. The dplyr chain uses `%>%` because the original `%.%` operator was
# deprecated and later removed from dplyr.
mbc <- microbenchmark(
  baseR = {
    baseR <- samp[samp[["z"]] %in% letters[1:20] &
                    samp[["w"]] %in% LETTERS[7:26],
                  c("x", "y", "z")]
    baseR[["xProp"]] <- with(baseR, x / sum(x))
    baseR[["yScale"]] <- with(baseR, (y - mean(y)) / sd(y))
    baseR <- by(baseR, baseR[["z"]],
                function(x) {
                  c("meanXProp" = mean(x[["xProp"]]),
                    "meanYScale" = mean(x[["yScale"]]))
                })
    baseR <- do.call("rbind", baseR)
  },
  dplyr = {
    dply <-
      samp %>%
      filter(z %in% letters[1:20], w %in% LETTERS[7:26]) %>%
      select(x, y, z) %>%
      mutate(xProp = x / sum(x),
             yScale = (y - mean(y)) / sd(y)) %>%
      group_by(z) %>%
      summarise(meanXProp = mean(xProp), meanYScale = mean(yScale)) %>%
      arrange(z)
  },
  data.table = {
    dt <- dtSamp[z %in% letters[1:20] & w %in% LETTERS[7:26], list(x, y, z)]
    dt <- dt[, list(xProp = (x / sum(x)), yScale = (y - mean(y)) / sd(y), z)]
    dt <- dt[, list(meanXProp = mean(xProp), meanYScale = mean(yScale)), by = z]
    dt <- dt[order(dt[["z"]]), ]
  },
  times = 10L
)

# Show the timing summary.
print(mbc)
	library(dplyr)
	library(data.table)
	library(microbenchmark)

	###########################################################################
	# FUNCTIONALITY #
	###########################################################################

	# NOTE(review): this tab-indented section repeats the script that appears
	# earlier in the file; confirm whether the duplicate is intentional.
	# Build the reproducible 10-million-row sample: x uniform between 2 and 4,
	# y normal with a randomly drawn mean and sd, z random lower-case letters,
	# w random upper-case letters (both kept as character).
	# The draws are made in the order x, mean, sd, y, z, w.
	set.seed(pi)
	samp <- local({
	  nObs <- 1e7
	  xVals <- runif(nObs, 2, 4)
	  yMean <- runif(1)
	  ySd <- runif(1, 1, 2)
	  yVals <- rnorm(nObs, mean = yMean, sd = ySd)
	  zVals <- letters[sample.int(26, nObs, TRUE)]
	  wVals <- LETTERS[sample.int(26, nObs, TRUE)]
	  data.frame(x = xVals, y = yVals, z = zVals, w = wVals,
	             stringsAsFactors = FALSE)
	})

	# The process:
	# 1. Filter samp to the rows where z is one of the first 20 `letters' and w
	# is one of the last 20 LETTERS.
	# 2. Select only columns x, y, and z out of the data.frame.
	# 3. Create two new columns:
	# 3.1 xProp = x / sum(x)
	# 3.2 yScale = (y - mean(y)) / sd(y)
	# 4. Calculate mean(xProp) and mean(yScale) by z.
	# 5. Arrange / Order output by letters.

	############
	# Base R #
	############

	# Steps 1-2: keep the rows matching both letter filters and only the
	# columns x, y and z.
	baseR <- samp[
	  samp[["z"]] %in% letters[1:20] & samp[["w"]] %in% LETTERS[7:26],
	  c("x", "y", "z")
	]

	# Step 3: derived columns computed over the whole filtered sample.
	baseR[["xProp"]] <- baseR[["x"]] / sum(baseR[["x"]])
	baseR[["yScale"]] <- (baseR[["y"]] - mean(baseR[["y"]])) / sd(baseR[["y"]])

	# Steps 4-5: one pair of means per value of z, then bind the per-group
	# vectors into a matrix with one row per group.
	baseR <- do.call(
	  "rbind",
	  by(baseR, baseR[["z"]], function(grp) {
	    c(meanXProp = mean(grp[["xProp"]]), meanYScale = mean(grp[["yScale"]]))
	  })
	)

	###########
	# dplyr #
	###########

	# Same five steps expressed as a dplyr chain: filter rows, keep x/y/z, add
	# the two derived columns over the whole filtered sample, then average them
	# per z and sort by z.
	# The chain uses `%>%` (exported by dplyr) because the original `%.%`
	# operator was deprecated and later removed from dplyr.
	dply <-
	  samp %>%
	  filter(z %in% letters[1:20], w %in% LETTERS[7:26]) %>%
	  select(x, y, z) %>%
	  mutate(xProp = x / sum(x),
	         yScale = (y - mean(y)) / sd(y)) %>%
	  group_by(z) %>%
	  summarise(meanXProp = mean(xProp), meanYScale = mean(yScale)) %>%
	  arrange(z)

	################
	# data.table #
	################

	# Same five steps with data.table, written as one chain of `[` calls:
	# filter and keep x/y/z, add the two derived columns over the whole
	# filtered sample (carrying z along), then average them per z.
	dt <- data.table(samp)
	dt <- dt[z %in% letters[1:20] & w %in% LETTERS[7:26], .(x, y, z)][
	  , .(xProp = x / sum(x), yScale = (y - mean(y)) / sd(y), z)][
	  , .(meanXProp = mean(xProp), meanYScale = mean(yScale)), by = z]

	# Finally sort the per-group result by z.
	dt <- dt[order(dt[["z"]]), ]

	###########################################################################
	# BENCHMARKING #
	###########################################################################

	# Rebuild the same 10-million-row sample for the benchmark, starting from
	# the same seed. Columns: x uniform between 2 and 4, y normal with a
	# randomly drawn mean and sd, z random lower-case letters, w random
	# upper-case letters. The draws are made in the order x, mean, sd, y, z, w.
	set.seed(pi)
	samp <- local({
	  nObs <- 1e7
	  xVals <- runif(nObs, 2, 4)
	  yMean <- runif(1)
	  ySd <- runif(1, 1, 2)
	  yVals <- rnorm(nObs, mean = yMean, sd = ySd)
	  zVals <- letters[sample.int(26, nObs, TRUE)]
	  wVals <- LETTERS[sample.int(26, nObs, TRUE)]
	  data.frame(x = xVals, y = yVals, z = zVals, w = wVals,
	             stringsAsFactors = FALSE)
	})
	# Convert the sample to a data.table once, outside the timed expressions,
	# so the data.table timing below does not include the conversion.
	dtSamp <- data.table(samp)

	# Time the three equivalent pipelines, 10 evaluations each. The expressions
	# are named so the printed summary shows short labels instead of deparsed
	# code. The dplyr chain uses `%>%` because the original `%.%` operator was
	# deprecated and later removed from dplyr.
	mbc <- microbenchmark(
	  baseR = {
	    baseR <- samp[samp[["z"]] %in% letters[1:20] &
	                    samp[["w"]] %in% LETTERS[7:26],
	                  c("x", "y", "z")]
	    baseR[["xProp"]] <- with(baseR, x / sum(x))
	    baseR[["yScale"]] <- with(baseR, (y - mean(y)) / sd(y))
	    baseR <- by(baseR, baseR[["z"]],
	                function(x) {
	                  c("meanXProp" = mean(x[["xProp"]]),
	                    "meanYScale" = mean(x[["yScale"]]))
	                })
	    baseR <- do.call("rbind", baseR)
	  },
	  dplyr = {
	    dply <-
	      samp %>%
	      filter(z %in% letters[1:20], w %in% LETTERS[7:26]) %>%
	      select(x, y, z) %>%
	      mutate(xProp = x / sum(x),
	             yScale = (y - mean(y)) / sd(y)) %>%
	      group_by(z) %>%
	      summarise(meanXProp = mean(xProp), meanYScale = mean(yScale)) %>%
	      arrange(z)
	  },
	  data.table = {
	    dt <- dtSamp[z %in% letters[1:20] & w %in% LETTERS[7:26], list(x, y, z)]
	    dt <- dt[, list(xProp = (x / sum(x)), yScale = (y - mean(y)) / sd(y), z)]
	    dt <- dt[, list(meanXProp = mean(xProp), meanYScale = mean(yScale)), by = z]
	    dt <- dt[order(dt[["z"]]), ]
	  },
	  times = 10L
	)

	# Show the timing summary.
	print(mbc)