mrdwab/SO25347739.R

## SO25347739.R
df <- structure(list(id = 1:11, date = c("2013-01-08", "2013-01-09",
"2013-01-10", "2013-01-11", "2013-01-12", "2013-01-13", "2013-01-14",
"2013-01-15", "2013-01-16", "2013-01-17", "2013-01-18"), Events = structure(c(7L,
2L, 1L, 3L, 10L, 8L, 9L, 11L, 4L, 6L, 5L), .Label = c("", "Fog",
"Fog-Rain", "Fog-Rain-Thunderstorm", "Fog-Rain-Thunderstorm-Snow",
"Fog-Thunderstorm", "Rain", "Rain-Snow", "Rain-Thunderstorm",
"Snow", "Thunderstorm"), class = "factor")), .Names = c("id",
"date", "Events"), row.names = c(NA, -11L), class = "data.frame")

df
#    id       date                     Events
# 1   1 2013-01-08                       Rain
# 2   2 2013-01-09                        Fog
# 3   3 2013-01-10
# 4   4 2013-01-11                   Fog-Rain
# 5   5 2013-01-12                       Snow
# 6   6 2013-01-13                  Rain-Snow
# 7   7 2013-01-14          Rain-Thunderstorm
# 8   8 2013-01-15               Thunderstorm
# 9   9 2013-01-16      Fog-Rain-Thunderstorm
# 10 10 2013-01-17           Fog-Thunderstorm
# 11 11 2013-01-18 Fog-Rain-Thunderstorm-Snow

fun1 <- function(indf) {
  concat.split.expanded(indf, "Events", "-", type = "character",
                        fill = 0, drop = TRUE)
}

fun1b <- function(indf) {
  cbind(indf[, c("id", "date")],
        splitstackshape:::charMat(
          strsplit(as.character(indf[, "Events"]), "-", fixed = TRUE),
          fill = 0))
}

fun2 <- function(indf) {
  lst <- strsplit(as.character(indf[, "Events"]), "-", fixed = TRUE)
  lvl <- unique(unlist(lst))
  data.frame(date=indf[, "date"],
             do.call(rbind,lapply(lst, function(x)
               table(factor(x, levels=lvl)))),
             stringsAsFactors=FALSE)
}

fun2b <- function(indf) {
  lst <- strsplit(as.character(indf[, "Events"]), "-", fixed = TRUE)
  lvl <- unique(unlist(lst))
  setNames(
    data.frame(indf[, c("id", "date")],
               do.call(rbind, lapply(lst, function(x)
                 as.integer(lvl %in% x)) )), c("id", "date", lvl))
}

fun3 <- function(indt) {
  res1 <- dcast.data.table(
    cSplit(indt, "Events", "-", "long"), id + date ~ Events,
    value.var = "Events")
  res2 <- merge(subset(indt, select = c("id", "date")),
                res1, by = c("id", "date"), all = TRUE)
  res2 <- as.data.frame(res2)
  res2[, -c(1, 2)] <- (!is.na(res2[, -c(1, 2)])) + 0
  res2
}

fun3b <- function(indt) {
  ids <- c("id", "date")
  res1 <- merge(
    dcast.data.table(
      cSplit(indt, "Events", "-", "long"), id + date ~ Events,
      value.var = "Events"),
    indt[, ids, with = FALSE], all = TRUE)
  cbind(res1[, ids, with = FALSE],
        (!is.na(res1[, Cols, with = FALSE])) + 0)
}

fun4 <- function(indf) {
  cbind(indf[c("id", "date")],
        mtabulate(strsplit(as.character(indf[, "Events"]),
                           "-", fixed = TRUE)))
}

fun5 <- function(indf) {
  ddf <- data.frame(id = indf[, "id"], date = indf[, "date"],
                    Rain = 0, Fog = 0, Snow = 0, Thunderstorm = 0)
  for(i in 3:6) ddf[grep(names(ddf)[i], indf[, 3]), i] = 1
  ddf
}

df10k <- do.call(rbind, replicate(910, df, FALSE))
df10k$id <- 1:nrow(df10k)
dt10k <- as.data.table(df10k)

library(microbenchmark)
microbenchmark(fun1(df10k), fun1b(df10k),
               fun2(df10k), fun2b(df10k),
               fun3(dt10k), fun3b(dt10k),
               fun4(df10k), fun5(df10k),
               times = 10)
# Unit: milliseconds
#          expr        min         lq     median         uq        max neval
#   fun1(df10k)  586.15365  590.24610  598.26167  608.44320  681.25938    10
#  fun1b(df10k)   31.10052   32.70029   33.42214   34.05589   35.88232    10
#   fun2(df10k) 3549.38998 3610.45949 3885.59388 3998.80305 4083.12558    10
#  fun2b(df10k)   96.99245   99.50138  107.32045  115.07238  124.64923    10
#   fun3(dt10k)   69.97883   71.37335   72.42597   76.87956  133.06592    10
#  fun3b(dt10k)   67.05681   69.75133   74.55515   82.37037   93.77665    10
#   fun4(df10k)  521.32609  535.79038  569.82296  610.18880  711.66799    10
#   fun5(df10k)   25.15641   25.19575   25.53700   26.11388   44.86690    10

df1M <- do.call(rbind, replicate(100, df10k, FALSE))
df1M$id <- 1:nrow(df1M)
dt1M <- as.data.table(df1M)

microbenchmark(fun1b(df1M), fun2b(df1M), fun3(dt1M),
               fun3b(dt1M), fun5(df1M), times = 5)
# Unit: seconds
#         expr       min        lq    median        uq       max neval
#  fun1b(df1M)  2.115184  2.286237  2.387344  2.529347  2.805951     5
#  fun2b(df1M) 15.684816 26.414540 26.544937 27.491509 27.709156     5
#   fun3(dt1M)  7.246204  7.266766  8.166459  8.388421  9.097704     5
#  fun3b(dt1M)  7.398405  7.421722  7.674666  7.740538  7.780481     5
#   fun5(df1M)  2.395482  2.413317  2.425882  2.468739  2.611830     5
	df <- structure(list(id = 1:11, date = c("2013-01-08", "2013-01-09",
	"2013-01-10", "2013-01-11", "2013-01-12", "2013-01-13", "2013-01-14",
	"2013-01-15", "2013-01-16", "2013-01-17", "2013-01-18"), Events = structure(c(7L,
	2L, 1L, 3L, 10L, 8L, 9L, 11L, 4L, 6L, 5L), .Label = c("", "Fog",
	"Fog-Rain", "Fog-Rain-Thunderstorm", "Fog-Rain-Thunderstorm-Snow",
	"Fog-Thunderstorm", "Rain", "Rain-Snow", "Rain-Thunderstorm",
	"Snow", "Thunderstorm"), class = "factor")), .Names = c("id",
	"date", "Events"), row.names = c(NA, -11L), class = "data.frame")

	df
	# id date Events
	# 1 1 2013-01-08 Rain
	# 2 2 2013-01-09 Fog
	# 3 3 2013-01-10
	# 4 4 2013-01-11 Fog-Rain
	# 5 5 2013-01-12 Snow
	# 6 6 2013-01-13 Rain-Snow
	# 7 7 2013-01-14 Rain-Thunderstorm
	# 8 8 2013-01-15 Thunderstorm
	# 9 9 2013-01-16 Fog-Rain-Thunderstorm
	# 10 10 2013-01-17 Fog-Thunderstorm
	# 11 11 2013-01-18 Fog-Rain-Thunderstorm-Snow

	fun1 <- function(indf) {
	concat.split.expanded(indf, "Events", "-", type = "character",
	fill = 0, drop = TRUE)
	}

	fun1b <- function(indf) {
	cbind(indf[, c("id", "date")],
	splitstackshape:::charMat(
	strsplit(as.character(indf[, "Events"]), "-", fixed = TRUE),
	fill = 0))
	}

	fun2 <- function(indf) {
	lst <- strsplit(as.character(indf[, "Events"]), "-", fixed = TRUE)
	lvl <- unique(unlist(lst))
	data.frame(date=indf[, "date"],
	do.call(rbind,lapply(lst, function(x)
	table(factor(x, levels=lvl)))),
	stringsAsFactors=FALSE)
	}

	fun2b <- function(indf) {
	lst <- strsplit(as.character(indf[, "Events"]), "-", fixed = TRUE)
	lvl <- unique(unlist(lst))
	setNames(
	data.frame(indf[, c("id", "date")],
	do.call(rbind, lapply(lst, function(x)
	as.integer(lvl %in% x)) )), c("id", "date", lvl))
	}

	fun3 <- function(indt) {
	res1 <- dcast.data.table(
	cSplit(indt, "Events", "-", "long"), id + date ~ Events,
	value.var = "Events")
	res2 <- merge(subset(indt, select = c("id", "date")),
	res1, by = c("id", "date"), all = TRUE)
	res2 <- as.data.frame(res2)
	res2[, -c(1, 2)] <- (!is.na(res2[, -c(1, 2)])) + 0
	res2
	}

	fun3b <- function(indt) {
	ids <- c("id", "date")
	res1 <- merge(
	dcast.data.table(
	cSplit(indt, "Events", "-", "long"), id + date ~ Events,
	value.var = "Events"),
	indt[, ids, with = FALSE], all = TRUE)
	cbind(res1[, ids, with = FALSE],
	(!is.na(res1[, Cols, with = FALSE])) + 0)
	}

	fun4 <- function(indf) {
	cbind(indf[c("id", "date")],
	mtabulate(strsplit(as.character(indf[, "Events"]),
	"-", fixed = TRUE)))
	}

	fun5 <- function(indf) {
	ddf <- data.frame(id = indf[, "id"], date = indf[, "date"],
	Rain = 0, Fog = 0, Snow = 0, Thunderstorm = 0)
	for(i in 3:6) ddf[grep(names(ddf)[i], indf[, 3]), i] = 1
	ddf
	}

	df10k <- do.call(rbind, replicate(910, df, FALSE))
	df10k$id <- 1:nrow(df10k)
	dt10k <- as.data.table(df10k)

	library(microbenchmark)
	microbenchmark(fun1(df10k), fun1b(df10k),
	fun2(df10k), fun2b(df10k),
	fun3(dt10k), fun3b(dt10k),
	fun4(df10k), fun5(df10k),
	times = 10)
	# Unit: milliseconds
	# expr min lq median uq max neval
	# fun1(df10k) 586.15365 590.24610 598.26167 608.44320 681.25938 10
	# fun1b(df10k) 31.10052 32.70029 33.42214 34.05589 35.88232 10
	# fun2(df10k) 3549.38998 3610.45949 3885.59388 3998.80305 4083.12558 10
	# fun2b(df10k) 96.99245 99.50138 107.32045 115.07238 124.64923 10
	# fun3(dt10k) 69.97883 71.37335 72.42597 76.87956 133.06592 10
	# fun3b(dt10k) 67.05681 69.75133 74.55515 82.37037 93.77665 10
	# fun4(df10k) 521.32609 535.79038 569.82296 610.18880 711.66799 10
	# fun5(df10k) 25.15641 25.19575 25.53700 26.11388 44.86690 10

	df1M <- do.call(rbind, replicate(100, df10k, FALSE))
	df1M$id <- 1:nrow(df1M)
	dt1M <- as.data.table(df1M)

	microbenchmark(fun1b(df1M), fun2b(df1M), fun3(dt1M),
	fun3b(dt1M), fun5(df1M), times = 5)
	# Unit: seconds
	# expr min lq median uq max neval
	# fun1b(df1M) 2.115184 2.286237 2.387344 2.529347 2.805951 5
	# fun2b(df1M) 15.684816 26.414540 26.544937 27.491509 27.709156 5
	# fun3(dt1M) 7.246204 7.266766 8.166459 8.388421 9.097704 5
	# fun3b(dt1M) 7.398405 7.421722 7.674666 7.740538 7.780481 5
	# fun5(df1M) 2.395482 2.413317 2.425882 2.468739 2.611830 5