Last active
January 2, 2016 03:19
-
-
Save mrdwab/8242632 to your computer and use it in GitHub Desktop.
Results of a benchmark for creating dummy variables from a single column.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(microbenchmark) | |
## Change "n" to experiment with different sized `data.frame`s
set.seed(1)
n <- 100000
## I couldn't think of other constants off the top of my head
## This should give us 26+26+50+50 "dummy variable" columns.
## `example` is the single-column data.frame that every benchmarked
## function below reads: `n` values drawn with replacement from the
## pool of upper/lower-case letters, state abbreviations and state names.
example <- data.frame(
  strcol = sample(
    c(LETTERS, letters, state.abb, state.name),
    n,
    replace = TRUE
  )
)
## This is the existing solution posted at
## http://randyzwitch.com/creating-dummy-variables-data-frame-r/
## Although `ifelse` is vectorized, I believe it checks things twice,
## so it will not be a very efficient approach here.
## Also, the `data.frame` "grows" by a column on each iteration,
## and in R, that is never a good coding practice if you are
## concerned with efficiency.
##
## Takes no arguments; reads the top-level `example` data.frame.
## Returns a copy of `example` with one extra 0/1 column per distinct
## value of `strcol`, named "dummy_<value>", in order of first
## appearance. The top-level `example` itself is left untouched.
fun1 <- function() {
  ## Work on a local copy so the loop never shadows the global object.
  result <- example
  for (level in unique(example$strcol)) {
    ## Deliberately kept as the original `ifelse` + append-a-column
    ## pattern: that pattern is what this benchmark entry measures.
    result[paste("dummy", level, sep = "_")] <-
      ifelse(example$strcol == level, 1, 0)
  }
  result
}
## This is the `model.matrix` approach. Very nice syntax,
## and very nice performance!
##
## Takes no arguments; reads the top-level `example` data.frame.
## The `- 1` drops the intercept so that every distinct value of
## `strcol` gets its own indicator column. Returns `example` with
## those indicator columns bound on the right; the column names are
## whatever `model.matrix` derives from the formula term.
fun2 <- function() {
  indicators <- model.matrix(~ example$strcol - 1)
  cbind(example, indicators)
}
## This is a very manual approach that involves "matrix indexing".
## `matrix` operations in R are generally quite fast--much faster
## than the same operations on `data.frame`s.
## An advantage with this approach is that it is easily adaptable
## to creating other binary matrices with data starting in
## different forms. See: http://stackoverflow.com/a/20689146/1270695
##
## Takes no arguments; reads the top-level `example` data.frame.
## Returns `example` with one numeric 0/1 column per distinct value
## of `strcol`, named "dummy_<value>" and ordered by sorted value.
fun3 <- function() {
  values <- as.character(example$strcol)
  levs <- sort(unique(values))
  n_rows <- nrow(example)
  ## Start from an all-zero matrix with one column per distinct value.
  m <- matrix(0, nrow = n_rows, ncol = length(levs),
              dimnames = list(NULL, paste("dummy", levs, sep = "_")))
  ## Two-column index matrix: row i pairs observation i with the
  ## column of its value, so a single assignment sets every 1.
  m[cbind(seq_len(n_rows), match(values, levs))] <- 1
  cbind(example, m)
}
## `table` is sometimes called slow, but most of the slowdown in
## this approach here is because of the conversion to a `data.frame`
## before `cbind`ing with the original data.
##
## Takes no arguments; reads the top-level `example` data.frame.
## Cross-tabulates row number against `strcol`, which yields one
## count column per distinct value (each row holds a single 1).
## Returns `example` with those columns bound on the right; the new
## columns are named by the values themselves (no "dummy_" prefix).
fun4 <- function() {
  counts <- table(seq_len(nrow(example)), example$strcol)
  cbind(example, as.data.frame.matrix(counts))
}
## Single-run timings. The commented figures are the results recorded
## by the original author; they will differ on other machines.
system.time(fun1())
#    user  system elapsed
#  40.236   1.091  43.841
system.time(fun2())
#    user  system elapsed
#   2.057   0.227   2.361
system.time(fun3())
#    user  system elapsed
#   0.843   0.264   1.153
system.time(fun4())
#    user  system elapsed
#   1.169   0.000   1.196
## Repeated timings for the three faster approaches only; `fun1` is
## left out of this call.
microbenchmark(fun2(), fun3(), fun4(), times = 20)
# Unit: seconds
#    expr      min       lq   median       uq      max neval
#  fun2() 1.488400 1.507040 1.564745 1.664441 1.844389    20
#  fun3() 1.051614 1.091522 1.124513 1.249441 1.325493    20
#  fun4() 1.236552 1.296618 1.311441 1.350969 1.448980    20
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment