## mrdwab/dummy_bench.R

## dummy_bench.R
library(microbenchmark)


## Change "n" to experiment with different sized `data.frame`s
## Fix the RNG seed so that repeated runs draw the same sample.
set.seed(1)
n <- 100000

## I couldn't think of other constants off the top of my head
## This should give us 26+26+50+50 "dummy variable" columns.
## NOTE(review): `strcol` is a factor under R < 4.0 and a character
##   vector from R 4.0 on (the `stringsAsFactors` default changed).
example <- data.frame(strcol = sample(
  c(LETTERS, letters, state.abb, state.name), n, replace = TRUE))

## This is the existing solution posted at
## http://randyzwitch.com/creating-dummy-variables-data-frame-r/
## Although `ifelse` is vectorized, I believe it checks things twice
##   so it will not be a very efficient approach here.
##   Also, the `data.frame` "grows" by a column on each iteration,
##   and in R, that is never a good coding practice if you are
##   concerned with efficiency.
fun1 <- function() {
  # The distinct values are collected once, before any column is added.
  distinct_values <- unique(example$strcol)
  for (value in distinct_values) {
    # One pass per value: append a 1/0 column named "dummy_<value>".
    column_name <- paste("dummy", value, sep = "_")
    example[column_name] <- ifelse(example$strcol == value, 1, 0)
  }
  # The assignments above work on a function-local copy of `example`;
  # that copy, with the added columns, is what gets returned.
  example
}


## This is the `model.matrix` approach. Very nice syntax,
##   and very nice performance!
fun2 <- function() {
  # `- 1` removes the intercept, so each level of `strcol` gets its own
  # indicator column in the design matrix.
  indicator_matrix <- model.matrix(~ example$strcol - 1)
  cbind(example, indicator_matrix)
}


## This is a very manual approach that involves "matrix indexing".
## `matrix` operations in R are generally quite fast--much faster
##   than the same operations on `data.frame`s.
## An advantage with this approach is that it is easily adaptable
##   to creating other binary matrices with data starting in
##   different forms. See: http://stackoverflow.com/a/20689146/1270695
fun3 <- function() {
  values <- as.character(example$strcol)
  lvls <- sort(unique(values))
  n_rows <- nrow(example)
  # Column position of each row's value among the sorted distinct values.
  col_idx <- match(values, lvls)
  # Start from an all-zero matrix with one "dummy_<value>" column per value.
  indicators <- matrix(
    0,
    nrow = n_rows,
    ncol = length(lvls),
    dimnames = list(NULL, paste("dummy", lvls, sep = "_"))
  )
  # Matrix indexing: each (row, column) pair in the two-column index
  # addresses one cell, so a single assignment sets every 1.
  indicators[cbind(seq_len(n_rows), col_idx)] <- 1L
  cbind(example, indicators)
}


## `table` is sometimes called slow, but most of the slowdown in
##   this approach here is because of the conversion to a `data.frame`
##   before `cbind`ing with the original data.
fun4 <- function() {
  # Cross-tabulate row number against value: the cell for a row and its
  # own value holds a count of 1.
  cross_tab <- table(seq_len(nrow(example)), example$strcol)
  # Convert the table to a data.frame with one column per value.
  indicator_frame <- as.data.frame.matrix(cross_tab)
  cbind(example, indicator_frame)
}

## One-off timings of each approach via `system.time()`. The commented
##   figures under each call look like output pasted from a past run;
##   this script does not regenerate them.
system.time(fun1())
#    user  system elapsed
#  40.236   1.091  43.841

system.time(fun2())
#    user  system elapsed
#   2.057   0.227   2.361

system.time(fun3())
#    user  system elapsed
#   0.843   0.264   1.153

system.time(fun4())
#    user  system elapsed
#   1.169   0.000   1.196

## Repeated timing of `fun2()`, `fun3()` and `fun4()`, 20 evaluations each.
## NOTE(review): `fun1()` is left out of this call, presumably because of
##   its much longer single-run time recorded above -- confirm.
microbenchmark(fun2(), fun3(), fun4(), times = 20)
# Unit: seconds
#    expr      min       lq   median       uq      max neval
#  fun2() 1.488400 1.507040 1.564745 1.664441 1.844389    20
#  fun3() 1.051614 1.091522 1.124513 1.249441 1.325493    20
#  fun4() 1.236552 1.296618 1.311441 1.350969 1.448980    20
	library(microbenchmark)


	## Change "n" to experiment with different sized `data.frame`s
	## Fix the RNG seed so that repeated runs draw the same sample.
	set.seed(1)
	n <- 100000

	## I couldn't think of other constants off the top of my head
	## This should give us 26+26+50+50 "dummy variable" columns.
	## NOTE(review): `strcol` is a factor under R < 4.0 and a character
	## vector from R 4.0 on (the `stringsAsFactors` default changed).
	example <- data.frame(strcol = sample(
	  c(LETTERS, letters, state.abb, state.name), n, replace = TRUE))

	## This is the existing solution posted at
	## http://randyzwitch.com/creating-dummy-variables-data-frame-r/
	## Although `ifelse` is vectorized, I believe it checks things twice
	## so it will not be a very efficient approach here.
	## Also, the `data.frame` "grows" by a column on each iteration,
	## and in R, that is never a good coding practice if you are
	## concerned with efficiency.
	fun1 <- function() {
	  # The distinct values are collected once, before any column is added.
	  distinct_values <- unique(example$strcol)
	  for (value in distinct_values) {
	    # One pass per value: append a 1/0 column named "dummy_<value>".
	    column_name <- paste("dummy", value, sep = "_")
	    example[column_name] <- ifelse(example$strcol == value, 1, 0)
	  }
	  # The assignments above work on a function-local copy of `example`;
	  # that copy, with the added columns, is what gets returned.
	  example
	}


	## This is the `model.matrix` approach. Very nice syntax,
	## and very nice performance!
	fun2 <- function() {
	  # `- 1` removes the intercept, so each level of `strcol` gets its own
	  # indicator column in the design matrix.
	  indicator_matrix <- model.matrix(~ example$strcol - 1)
	  cbind(example, indicator_matrix)
	}


	## This is a very manual approach that involves "matrix indexing".
	## `matrix` operations in R are generally quite fast--much faster
	## than the same operations on `data.frame`s.
	## An advantage with this approach is that it is easily adaptable
	## to creating other binary matrices with data starting in
	## different forms. See: http://stackoverflow.com/a/20689146/1270695
	fun3 <- function() {
	  values <- as.character(example$strcol)
	  lvls <- sort(unique(values))
	  n_rows <- nrow(example)
	  # Column position of each row's value among the sorted distinct values.
	  col_idx <- match(values, lvls)
	  # Start from an all-zero matrix with one "dummy_<value>" column per value.
	  indicators <- matrix(
	    0,
	    nrow = n_rows,
	    ncol = length(lvls),
	    dimnames = list(NULL, paste("dummy", lvls, sep = "_"))
	  )
	  # Matrix indexing: each (row, column) pair in the two-column index
	  # addresses one cell, so a single assignment sets every 1.
	  indicators[cbind(seq_len(n_rows), col_idx)] <- 1L
	  cbind(example, indicators)
	}


	## `table` is sometimes called slow, but most of the slowdown in
	## this approach here is because of the conversion to a `data.frame`
	## before `cbind`ing with the original data.
	fun4 <- function() {
	  # Cross-tabulate row number against value: the cell for a row and its
	  # own value holds a count of 1.
	  cross_tab <- table(seq_len(nrow(example)), example$strcol)
	  # Convert the table to a data.frame with one column per value.
	  indicator_frame <- as.data.frame.matrix(cross_tab)
	  cbind(example, indicator_frame)
	}

	## One-off timings of each approach via `system.time()`. The commented
	## figures under each call look like output pasted from a past run;
	## this script does not regenerate them.
	system.time(fun1())
	# user system elapsed
	# 40.236 1.091 43.841

	system.time(fun2())
	# user system elapsed
	# 2.057 0.227 2.361

	system.time(fun3())
	# user system elapsed
	# 0.843 0.264 1.153

	system.time(fun4())
	# user system elapsed
	# 1.169 0.000 1.196

	## Repeated timing of `fun2()`, `fun3()` and `fun4()`, 20 evaluations each.
	## NOTE(review): `fun1()` is left out of this call, presumably because of
	## its much longer single-run time recorded above -- confirm.
	microbenchmark(fun2(), fun3(), fun4(), times = 20)
	# Unit: seconds
	# expr min lq median uq max neval
	# fun2() 1.488400 1.507040 1.564745 1.664441 1.844389 20
	# fun3() 1.051614 1.091522 1.124513 1.249441 1.325493 20
	# fun4() 1.236552 1.296618 1.311441 1.350969 1.448980 20