## arunsrinivasan/CologneR.R

## CologneR.R
require(reshape2)

# data.table commit (1048)
require(data.table)
# Loading required package: data.table
# data.table 1.8.11  For help type: help("data.table")

# Pin the RNG state so the synthetic data generated below is reproducible.
set.seed(seed = 1)
# Number of rows in the benchmark table DT.
N <- 2e7

# generate a character vector of length about 1e5
# Returns one random lowercase string of 5 to 9 letters (drawn with replacement).
foo <- function() {
  n_chars <- sample(5:9, 1)
  picked <- sample(letters, n_chars, TRUE)
  paste(picked, collapse = "")
}
# Pool of candidate strings: 1e5 draws from foo() with duplicates dropped.
ch <- unique(replicate(1e5, foo()))

# > length(ch)
# [1] 99982

# DT now
# Four columns of N rows each, all sampled with replacement:
#   a - doubles including NA, Inf, -Inf and NaN
#   b - plain doubles
#   c - integers including NA
#   d - strings drawn from ch
DT <- data.table(
  a = as.numeric(sample(x = c(NA, Inf, -Inf, NaN, rnorm(1e6) * 1e6), size = N, replace = TRUE)),
  b = as.numeric(sample(x = rnorm(1e6), size = N, replace = TRUE)),
  c = sample(x = c(NA_integer_, 1e5:1e6), size = N, replace = TRUE),
  d = sample(x = ch, size = N, replace = TRUE)
)

# Report the data.tables currently in memory and their sizes.
tables()
#      NAME       NROW  MB COLS    KEY
# [1,] DT   20,000,000 539 a,b,c,d
# Total: 539MB

# timing (run 3 times by default) - function borrowed from Hadley.
#
# Arguments:
#   code   expression to time; captured unevaluated and evaluated in the
#          caller's frame, so assignments made by `code` land in the caller.
#   times  number of timed runs (default 3, the original behaviour).
# Value:
#   a matrix with one row per run and the columns of system.time().
benchmark <- function(code, times = 3L) {
  stopifnot(is.numeric(times), length(times) == 1L, !is.na(times), times >= 1)
  code <- substitute(code)
  # Capture the caller's frame once, up front: inside the anonymous function
  # below, parent.frame() would no longer refer to benchmark()'s caller.
  caller <- parent.frame()

  timings <- lapply(seq_len(times), function(i) system.time(eval(code, caller)))
  do.call(rbind, timings)
}

# r-session memory usage - 1GB

## MELT
## ----

# Time melt() on the data.table. The recorded console output is kept as
# comments so that the file still parses when sourced.
benchmark(melt(DT, id.vars="d", measure.vars=1:2))
#      user.self sys.self elapsed user.child sys.child
# [1,]     2.885    0.595   3.554          0         0
# [2,]     1.898    0.579   2.516          0         0
# [3,]     1.894    0.562   2.492          0         0

# compare against reshape2
benchmark(reshape2:::melt.data.frame(DT, id.vars="d", measure.vars=1:2))
#      user.self sys.self elapsed user.child sys.child
# [1,]   227.841   11.725 304.533          0         0
# [2,]   166.293   10.032 190.056          0         0
# [3,]   170.237   10.364 195.621          0         0

######################################################################
# max memory used (mem footprint) - DT (1.7GB) vs reshape2 (4.7GB)
# Speedup of ~ 75x!!!!
######################################################################

## CASTING
## -------
# add a new column (showcase new feature in v1.8.11 for 'set')
# One of ten letters per row; sized by N (not a repeated literal) so the new
# column always has the same length as the columns DT was built with.
smple <- sample(letters[1:10], N, TRUE)
system.time(set(DT, i=NULL, j="e", value=smple)) # new feature in 1.8.11 - adding new column using set
#   user  system elapsed
#  0.108   0.116   0.240

# Cast to wide form: one row per d, one column per level of e, summing b.
benchmark(dcast.data.table(DT, d ~ e, value.var="b", fun.aggregate=sum))
#      user.self sys.self elapsed user.child sys.child
# [1,]    14.253    1.011  15.953          0         0
# [2,]    14.149    1.002  15.648          0         0
# [3,]    14.256    1.031  15.934          0         0

#############################################################################################################
# NOTE: We can't run this on current `reshape2` as it segfaults - because of function `split-numerics.cpp`
# in 'plyr'. I made the changes locally (from Hadley's email) and tested 'reshape2'. Here's the benchmarking
# result. I ran it only once as it takes quite a long time.
#############################################################################################################

# If you're interested in trying this, go here: http://gallery.rcpp.org/articles/plyr-c-to-rcpp/
# Copy the C++ code and replace the function in src/split-numerics.c of "plyr"
# You may also have to modify `plyr_split_indices` from RcppExports.cpp as well:
# here's the code: https://github.com/hadley/dplyr/blob/master/src/RcppExports.cpp
# Replace "dplyr" accordingly. Then compile the package.

# Namespace-qualify reshape2's dcast (as is done for reshape2:::melt.data.frame
# in the MELT section) so this timing cannot silently pick up a `dcast`
# exported by a package attached after reshape2, e.g. newer data.table releases.
system.time(out1 <- reshape2::dcast(DT, d ~ e, value.var="b", fun.aggregate=sum))
#   user  system elapsed
# 41.697   4.941  46.887

# Same cast with data.table, then check both results agree.
out2 <- dcast.data.table(DT, d ~ e, value.var="b", fun.aggregate=sum)
out2.df <- as.data.frame(out2)
setnames(out2.df, names(out1)) # set names same as out1
identical(out1, out2.df) # [1] TRUE

###############################################################
# max memory usage: 1.8GB (data.table) vs 1.8GB (reshape2)
# dcast.data.table is about 3x faster than dcast here.
###############################################################

## more columns on the LHS
# Add an integer grouping column f (values 1..10); sized by N rather than a
# repeated literal so it always matches the number of rows DT was built with.
set(DT, i=NULL, j="f", value=sample(10, N, TRUE))

# Namespace-qualify reshape2's dcast (as is done for reshape2:::melt.data.frame
# in the MELT section) so this timing cannot silently pick up a `dcast`
# exported by a package attached after reshape2, e.g. newer data.table releases.
system.time(out1 <- reshape2::dcast(DT, f+d ~ e, value.var="b", fun.aggregate=length))
#    user  system elapsed
# 174.912   8.143 184.422

system.time(out2 <- dcast.data.table(DT, f+d ~ e, value.var="b", fun.aggregate=length))
#   user  system elapsed
# 25.253   2.249  28.430

# Check that both implementations produce the same table.
out2.df <- as.data.frame(out2)
setnames(out2.df, names(out1)) # set names same as out1
identical(out1, out2.df) # [1] TRUE

###############################################################
# max memory usage: 2.98GB (data.table) vs 3.38GB (reshape2)
# dcast.data.table is about 6.5x faster than dcast here.
###############################################################

# other relevant benchmarks:
# https://gist.github.com/arunsrinivasan/7836512 - benchmarks on (mostly) "setkey" for 1.8.11
# https://gist.github.com/arunsrinivasan/7832436 - benchmarks on (mostly) "setkey" for 1.8.10
# https://gist.github.com/arunsrinivasan/7839002 - dplyr vs data.table 1.8.11 commit 1048
	# NOTE(review): from this line to the end of the file the script above is
	# repeated (tab-indented, with whitespace-collapsed comments), so sourcing
	# the file runs every benchmark twice - confirm whether this is intentional.
	require(reshape2)

	# data.table commit (1048)
	require(data.table)
	# Loading required package: data.table
	# data.table 1.8.11 For help type: help("data.table")

	# Pin the RNG state so the synthetic data generated below is reproducible.
	set.seed(seed = 1)
	# Number of rows in the benchmark table DT.
	N <- 2e7

	# generate a character vector of length about 1e5
	# Returns one random lowercase string of 5 to 9 letters (drawn with replacement).
	foo <- function() {
		n_chars <- sample(5:9, 1)
		picked <- sample(letters, n_chars, TRUE)
		paste(picked, collapse = "")
	}
	# Pool of candidate strings: 1e5 draws from foo() with duplicates dropped.
	ch <- unique(replicate(1e5, foo()))

	# > length(ch)
	# [1] 99982

	# DT now
	# Four columns of N rows each, all sampled with replacement:
	#   a - doubles including NA, Inf, -Inf and NaN
	#   b - plain doubles
	#   c - integers including NA
	#   d - strings drawn from ch
	DT <- data.table(
		a = as.numeric(sample(x = c(NA, Inf, -Inf, NaN, rnorm(1e6) * 1e6), size = N, replace = TRUE)),
		b = as.numeric(sample(x = rnorm(1e6), size = N, replace = TRUE)),
		c = sample(x = c(NA_integer_, 1e5:1e6), size = N, replace = TRUE),
		d = sample(x = ch, size = N, replace = TRUE)
	)

	# Report the data.tables currently in memory and their sizes.
	tables()
	# NAME NROW MB COLS KEY
	# [1,] DT 20,000,000 539 a,b,c,d
	# Total: 539MB

	# timing (run 3 times by default) - function borrowed from Hadley.
	#
	# Arguments:
	#   code   expression to time; captured unevaluated and evaluated in the
	#          caller's frame, so assignments made by `code` land in the caller.
	#   times  number of timed runs (default 3, the original behaviour).
	# Value:
	#   a matrix with one row per run and the columns of system.time().
	benchmark <- function(code, times = 3L) {
		stopifnot(is.numeric(times), length(times) == 1L, !is.na(times), times >= 1)
		code <- substitute(code)
		# Capture the caller's frame once, up front: inside the anonymous function
		# below, parent.frame() would no longer refer to benchmark()'s caller.
		caller <- parent.frame()

		timings <- lapply(seq_len(times), function(i) system.time(eval(code, caller)))
		do.call(rbind, timings)
	}

	# r-session memory usage - 1GB

	## MELT
	## ----

	# Time melt() on the data.table. The recorded console output is kept as
	# comments so that the file still parses when sourced.
	benchmark(melt(DT, id.vars="d", measure.vars=1:2))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 2.885 0.595 3.554 0 0
	# [2,] 1.898 0.579 2.516 0 0
	# [3,] 1.894 0.562 2.492 0 0

	# compare against reshape2
	benchmark(reshape2:::melt.data.frame(DT, id.vars="d", measure.vars=1:2))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 227.841 11.725 304.533 0 0
	# [2,] 166.293 10.032 190.056 0 0
	# [3,] 170.237 10.364 195.621 0 0

	######################################################################
	# max memory used (mem footprint) - DT (1.7GB) vs reshape2 (4.7GB)
	# Speedup of ~ 75x!!!!
	######################################################################

	## CASTING
	## -------
	# add a new column (showcase new feature in v1.8.11 for 'set')
	# One of ten letters per row; sized by N (not a repeated literal) so the new
	# column always has the same length as the columns DT was built with.
	smple <- sample(letters[1:10], N, TRUE)
	system.time(set(DT, i=NULL, j="e", value=smple)) # new feature in 1.8.11 - adding new column using set
	# user system elapsed
	# 0.108 0.116 0.240

	# Cast to wide form: one row per d, one column per level of e, summing b.
	benchmark(dcast.data.table(DT, d ~ e, value.var="b", fun.aggregate=sum))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 14.253 1.011 15.953 0 0
	# [2,] 14.149 1.002 15.648 0 0
	# [3,] 14.256 1.031 15.934 0 0

	#############################################################################################################
	# NOTE: We can't run this on current `reshape2` as it segfaults - because of function `split-numerics.cpp`
	# in 'plyr'. I made the changes locally (from Hadley's email) and tested 'reshape2'. Here's the benchmarking
	# result. I ran it only once as it takes quite a long time.
	#############################################################################################################

	# If you're interested in trying this, go here: http://gallery.rcpp.org/articles/plyr-c-to-rcpp/
	# Copy the C++ code and replace the function in src/split-numerics.c of "plyr"
	# You may also have to modify `plyr_split_indices` from RcppExports.cpp as well:
	# here's the code: https://github.com/hadley/dplyr/blob/master/src/RcppExports.cpp
	# Replace "dplyr" accordingly. Then compile the package.

	# Namespace-qualify reshape2's dcast (as is done for reshape2:::melt.data.frame
	# in the MELT section) so this timing cannot silently pick up a `dcast`
	# exported by a package attached after reshape2, e.g. newer data.table releases.
	system.time(out1 <- reshape2::dcast(DT, d ~ e, value.var="b", fun.aggregate=sum))
	# user system elapsed
	# 41.697 4.941 46.887

	# Same cast with data.table, then check both results agree.
	out2 <- dcast.data.table(DT, d ~ e, value.var="b", fun.aggregate=sum)
	out2.df <- as.data.frame(out2)
	setnames(out2.df, names(out1)) # set names same as out1
	identical(out1, out2.df) # [1] TRUE

	###############################################################
	# max memory usage: 1.8GB (data.table) vs 1.8GB (reshape2)
	# dcast.data.table is about 3x faster than dcast here.
	###############################################################

	## more columns on the LHS
	# Add an integer grouping column f (values 1..10); sized by N rather than a
	# repeated literal so it always matches the number of rows DT was built with.
	set(DT, i=NULL, j="f", value=sample(10, N, TRUE))

	# Namespace-qualify reshape2's dcast (as is done for reshape2:::melt.data.frame
	# in the MELT section) so this timing cannot silently pick up a `dcast`
	# exported by a package attached after reshape2, e.g. newer data.table releases.
	system.time(out1 <- reshape2::dcast(DT, f+d ~ e, value.var="b", fun.aggregate=length))
	# user system elapsed
	# 174.912 8.143 184.422

	system.time(out2 <- dcast.data.table(DT, f+d ~ e, value.var="b", fun.aggregate=length))
	# user system elapsed
	# 25.253 2.249 28.430

	# Check that both implementations produce the same table.
	out2.df <- as.data.frame(out2)
	setnames(out2.df, names(out1)) # set names same as out1
	identical(out1, out2.df) # [1] TRUE

	###############################################################
	# max memory usage: 2.98GB (data.table) vs 3.38GB (reshape2)
	# dcast.data.table is about 6.5x faster than dcast here.
	###############################################################

	# other relevant benchmarks:
	# https://gist.github.com/arunsrinivasan/7836512 - benchmarks on (mostly) "setkey" for 1.8.11
	# https://gist.github.com/arunsrinivasan/7832436 - benchmarks on (mostly) "setkey" for 1.8.10
	# https://gist.github.com/arunsrinivasan/7839002 - dplyr vs data.table 1.8.11 commit 1048