Last active
August 29, 2015 14:05
-
-
Save mrdwab/507c5f31ffe97e19faa2 to your computer and use it in GitHub Desktop.
Benchmarking options for http://stackoverflow.com/q/25347739/1270695
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
df <- structure(list(id = 1:11, date = c("2013-01-08", "2013-01-09", | |
"2013-01-10", "2013-01-11", "2013-01-12", "2013-01-13", "2013-01-14", | |
"2013-01-15", "2013-01-16", "2013-01-17", "2013-01-18"), Events = structure(c(7L, | |
2L, 1L, 3L, 10L, 8L, 9L, 11L, 4L, 6L, 5L), .Label = c("", "Fog", | |
"Fog-Rain", "Fog-Rain-Thunderstorm", "Fog-Rain-Thunderstorm-Snow", | |
"Fog-Thunderstorm", "Rain", "Rain-Snow", "Rain-Thunderstorm", | |
"Snow", "Thunderstorm"), class = "factor")), .Names = c("id", | |
"date", "Events"), row.names = c(NA, -11L), class = "data.frame") | |
df | |
# id date Events | |
# 1 1 2013-01-08 Rain | |
# 2 2 2013-01-09 Fog | |
# 3 3 2013-01-10 | |
# 4 4 2013-01-11 Fog-Rain | |
# 5 5 2013-01-12 Snow | |
# 6 6 2013-01-13 Rain-Snow | |
# 7 7 2013-01-14 Rain-Thunderstorm | |
# 8 8 2013-01-15 Thunderstorm | |
# 9 9 2013-01-16 Fog-Rain-Thunderstorm | |
# 10 10 2013-01-17 Fog-Thunderstorm | |
# 11 11 2013-01-18 Fog-Rain-Thunderstorm-Snow | |
fun1 <- function(indf) { | |
concat.split.expanded(indf, "Events", "-", type = "character", | |
fill = 0, drop = TRUE) | |
} | |
fun1b <- function(indf) { | |
cbind(indf[, c("id", "date")], | |
splitstackshape:::charMat( | |
strsplit(as.character(indf[, "Events"]), "-", fixed = TRUE), | |
fill = 0)) | |
} | |
fun2 <- function(indf) { | |
lst <- strsplit(as.character(indf[, "Events"]), "-", fixed = TRUE) | |
lvl <- unique(unlist(lst)) | |
data.frame(date=indf[, "date"], | |
do.call(rbind,lapply(lst, function(x) | |
table(factor(x, levels=lvl)))), | |
stringsAsFactors=FALSE) | |
} | |
fun2b <- function(indf) { | |
lst <- strsplit(as.character(indf[, "Events"]), "-", fixed = TRUE) | |
lvl <- unique(unlist(lst)) | |
setNames( | |
data.frame(indf[, c("id", "date")], | |
do.call(rbind, lapply(lst, function(x) | |
as.integer(lvl %in% x)) )), c("id", "date", lvl)) | |
} | |
fun3 <- function(indt) { | |
res1 <- dcast.data.table( | |
cSplit(indt, "Events", "-", "long"), id + date ~ Events, | |
value.var = "Events") | |
res2 <- merge(subset(indt, select = c("id", "date")), | |
res1, by = c("id", "date"), all = TRUE) | |
res2 <- as.data.frame(res2) | |
res2[, -c(1, 2)] <- (!is.na(res2[, -c(1, 2)])) + 0 | |
res2 | |
} | |
fun3b <- function(indt) { | |
ids <- c("id", "date") | |
res1 <- merge( | |
dcast.data.table( | |
cSplit(indt, "Events", "-", "long"), id + date ~ Events, | |
value.var = "Events"), | |
indt[, ids, with = FALSE], all = TRUE) | |
cbind(res1[, ids, with = FALSE], | |
(!is.na(res1[, Cols, with = FALSE])) + 0) | |
} | |
fun4 <- function(indf) { | |
cbind(indf[c("id", "date")], | |
mtabulate(strsplit(as.character(indf[, "Events"]), | |
"-", fixed = TRUE))) | |
} | |
fun5 <- function(indf) { | |
ddf <- data.frame(id = indf[, "id"], date = indf[, "date"], | |
Rain = 0, Fog = 0, Snow = 0, Thunderstorm = 0) | |
for(i in 3:6) ddf[grep(names(ddf)[i], indf[, 3]), i] = 1 | |
ddf | |
} | |
df10k <- do.call(rbind, replicate(910, df, FALSE)) | |
df10k$id <- 1:nrow(df10k) | |
dt10k <- as.data.table(df10k) | |
library(microbenchmark) | |
microbenchmark(fun1(df10k), fun1b(df10k), | |
fun2(df10k), fun2b(df10k), | |
fun3(dt10k), fun3b(dt10k), | |
fun4(df10k), fun5(df10k), | |
times = 10) | |
# Unit: milliseconds | |
# expr min lq median uq max neval | |
# fun1(df10k) 586.15365 590.24610 598.26167 608.44320 681.25938 10 | |
# fun1b(df10k) 31.10052 32.70029 33.42214 34.05589 35.88232 10 | |
# fun2(df10k) 3549.38998 3610.45949 3885.59388 3998.80305 4083.12558 10 | |
# fun2b(df10k) 96.99245 99.50138 107.32045 115.07238 124.64923 10 | |
# fun3(dt10k) 69.97883 71.37335 72.42597 76.87956 133.06592 10 | |
# fun3b(dt10k) 67.05681 69.75133 74.55515 82.37037 93.77665 10 | |
# fun4(df10k) 521.32609 535.79038 569.82296 610.18880 711.66799 10 | |
# fun5(df10k) 25.15641 25.19575 25.53700 26.11388 44.86690 10 | |
df1M <- do.call(rbind, replicate(100, df10k, FALSE)) | |
df1M$id <- 1:nrow(df1M) | |
dt1M <- as.data.table(df1M) | |
microbenchmark(fun1b(df1M), fun2b(df1M), fun3(dt1M), | |
fun3b(dt1M), fun5(df1M), times = 5) | |
# Unit: seconds | |
# expr min lq median uq max neval | |
# fun1b(df1M) 2.115184 2.286237 2.387344 2.529347 2.805951 5 | |
# fun2b(df1M) 15.684816 26.414540 26.544937 27.491509 27.709156 5 | |
# fun3(dt1M) 7.246204 7.266766 8.166459 8.388421 9.097704 5 | |
# fun3b(dt1M) 7.398405 7.421722 7.674666 7.740538 7.780481 5 | |
# fun5(df1M) 2.395482 2.413317 2.425882 2.468739 2.611830 5 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment