Skip to content

Instantly share code, notes, and snippets.

@mrdwab
Last active August 29, 2015 14:05
Show Gist options
  • Save mrdwab/507c5f31ffe97e19faa2 to your computer and use it in GitHub Desktop.
Save mrdwab/507c5f31ffe97e19faa2 to your computer and use it in GitHub Desktop.
df <- structure(list(id = 1:11, date = c("2013-01-08", "2013-01-09",
"2013-01-10", "2013-01-11", "2013-01-12", "2013-01-13", "2013-01-14",
"2013-01-15", "2013-01-16", "2013-01-17", "2013-01-18"), Events = structure(c(7L,
2L, 1L, 3L, 10L, 8L, 9L, 11L, 4L, 6L, 5L), .Label = c("", "Fog",
"Fog-Rain", "Fog-Rain-Thunderstorm", "Fog-Rain-Thunderstorm-Snow",
"Fog-Thunderstorm", "Rain", "Rain-Snow", "Rain-Thunderstorm",
"Snow", "Thunderstorm"), class = "factor")), .Names = c("id",
"date", "Events"), row.names = c(NA, -11L), class = "data.frame")
df
# id date Events
# 1 1 2013-01-08 Rain
# 2 2 2013-01-09 Fog
# 3 3 2013-01-10
# 4 4 2013-01-11 Fog-Rain
# 5 5 2013-01-12 Snow
# 6 6 2013-01-13 Rain-Snow
# 7 7 2013-01-14 Rain-Thunderstorm
# 8 8 2013-01-15 Thunderstorm
# 9 9 2013-01-16 Fog-Rain-Thunderstorm
# 10 10 2013-01-17 Fog-Thunderstorm
# 11 11 2013-01-18 Fog-Rain-Thunderstorm-Snow
fun1 <- function(indf) {
concat.split.expanded(indf, "Events", "-", type = "character",
fill = 0, drop = TRUE)
}
fun1b <- function(indf) {
cbind(indf[, c("id", "date")],
splitstackshape:::charMat(
strsplit(as.character(indf[, "Events"]), "-", fixed = TRUE),
fill = 0))
}
fun2 <- function(indf) {
lst <- strsplit(as.character(indf[, "Events"]), "-", fixed = TRUE)
lvl <- unique(unlist(lst))
data.frame(date=indf[, "date"],
do.call(rbind,lapply(lst, function(x)
table(factor(x, levels=lvl)))),
stringsAsFactors=FALSE)
}
fun2b <- function(indf) {
lst <- strsplit(as.character(indf[, "Events"]), "-", fixed = TRUE)
lvl <- unique(unlist(lst))
setNames(
data.frame(indf[, c("id", "date")],
do.call(rbind, lapply(lst, function(x)
as.integer(lvl %in% x)) )), c("id", "date", lvl))
}
fun3 <- function(indt) {
res1 <- dcast.data.table(
cSplit(indt, "Events", "-", "long"), id + date ~ Events,
value.var = "Events")
res2 <- merge(subset(indt, select = c("id", "date")),
res1, by = c("id", "date"), all = TRUE)
res2 <- as.data.frame(res2)
res2[, -c(1, 2)] <- (!is.na(res2[, -c(1, 2)])) + 0
res2
}
fun3b <- function(indt) {
ids <- c("id", "date")
res1 <- merge(
dcast.data.table(
cSplit(indt, "Events", "-", "long"), id + date ~ Events,
value.var = "Events"),
indt[, ids, with = FALSE], all = TRUE)
cbind(res1[, ids, with = FALSE],
(!is.na(res1[, Cols, with = FALSE])) + 0)
}
fun4 <- function(indf) {
cbind(indf[c("id", "date")],
mtabulate(strsplit(as.character(indf[, "Events"]),
"-", fixed = TRUE)))
}
fun5 <- function(indf) {
ddf <- data.frame(id = indf[, "id"], date = indf[, "date"],
Rain = 0, Fog = 0, Snow = 0, Thunderstorm = 0)
for(i in 3:6) ddf[grep(names(ddf)[i], indf[, 3]), i] = 1
ddf
}
df10k <- do.call(rbind, replicate(910, df, FALSE))
df10k$id <- 1:nrow(df10k)
dt10k <- as.data.table(df10k)
library(microbenchmark)
microbenchmark(fun1(df10k), fun1b(df10k),
fun2(df10k), fun2b(df10k),
fun3(dt10k), fun3b(dt10k),
fun4(df10k), fun5(df10k),
times = 10)
# Unit: milliseconds
# expr min lq median uq max neval
# fun1(df10k) 586.15365 590.24610 598.26167 608.44320 681.25938 10
# fun1b(df10k) 31.10052 32.70029 33.42214 34.05589 35.88232 10
# fun2(df10k) 3549.38998 3610.45949 3885.59388 3998.80305 4083.12558 10
# fun2b(df10k) 96.99245 99.50138 107.32045 115.07238 124.64923 10
# fun3(dt10k) 69.97883 71.37335 72.42597 76.87956 133.06592 10
# fun3b(dt10k) 67.05681 69.75133 74.55515 82.37037 93.77665 10
# fun4(df10k) 521.32609 535.79038 569.82296 610.18880 711.66799 10
# fun5(df10k) 25.15641 25.19575 25.53700 26.11388 44.86690 10
df1M <- do.call(rbind, replicate(100, df10k, FALSE))
df1M$id <- 1:nrow(df1M)
dt1M <- as.data.table(df1M)
microbenchmark(fun1b(df1M), fun2b(df1M), fun3(dt1M),
fun3b(dt1M), fun5(df1M), times = 5)
# Unit: seconds
# expr min lq median uq max neval
# fun1b(df1M) 2.115184 2.286237 2.387344 2.529347 2.805951 5
# fun2b(df1M) 15.684816 26.414540 26.544937 27.491509 27.709156 5
# fun3(dt1M) 7.246204 7.266766 8.166459 8.388421 9.097704 5
# fun3b(dt1M) 7.398405 7.421722 7.674666 7.740538 7.780481 5
# fun5(df1M) 2.395482 2.413317 2.425882 2.468739 2.611830 5
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment