ptoche/benchmark-diffs.R

## benchmark-diffs.R
# benchmark computing time differences
n = 1e6
set.seed(1)
Date = sort(replicate(n, paste0(Sys.Date(), " ",
    sample(0:23, 1), ":", sample(0:59, 1), ":", sample(0:59, 1))))
Date <- as.POSIXct(Date, format="%Y-%m-%d %H:%M:%S", tz = "Europe/Paris")


diff_seq_along <- function(x) {
    x[seq_along(x)+1]-x[seq_along(x)]
}

diff_head_tail <- function(x) {
    head(tail(x, -1) - head(x, -1), -1)
}

diff_as_matrix <- function(x) {
    data.frame(diff(as.matrix(as.numeric(x))))
}

diff_length <- function(x) {
    x[-1] - x[-length(x)]
}

diff_data_table <- function(x) {
    library(data.table)
    dt <- as.data.table(x)
    setDT(dt)[, lapply(.SD, function(x) (x- shift(x))[-1])]
}

diff_dplyr <- function(x) {
    x = as.numeric(x)
    library(dplyr)
    x - lag(x) %>%
    na.omit()
}

library(microbenchmark)
bench <- microbenchmark(diff_seq_along(Date), diff_head_tail(Date), diff_as_matrix(Date), diff_length(Date), diff_data_table(Date), times = 1000)

remove_outliers <- function(x, probs = c(.25, .75), whiskers = 1.5, na.rm = FALSE, ...) {
  qnt <- quantile(x, probs = probs, na.rm = na.rm, ...)
  H <- whiskers * IQR(x, na.rm = na.rm)
  y <- x
  y[x < (qnt[1] - H)] <- NA
  y[x > (qnt[2] + H)] <- NA
  y
}
bench$timing <- sapply(bench$time, remove_outliers)


## library dplyr to compute cumulated sums
library(dplyr)
df <- bench %>%
  select(expr, timing) %>%
  group_by(expr) %>%
  mutate(timing = cumsum(timing)) %>%
  slice(which.max(timing)) %>%
  arrange(desc(timing)) # doesn't help in ggplot


# Clean names
# Vectorized version of gsub
gsub2 <- function(pattern, replacement, x, ...) {
    for(i in 1:length(pattern))
    x <- gsub(pattern[i], replacement[i], x, ...)
    x
}
key <- list("diff_seq_along(Date)" = "seq_along", "diff_head_tail(Date)" = "head_tail","diff_as_matrix(Date)" = "as_matrix", "diff_length(Date)" = "diff_length",  "diff_data_table(Date)" = "data_table")  # gsub needs fixed=TRUE to handle parenthesis
pattern <- names(key)
replacement <- unname(unlist(key))
df$expr <- gsub2(pattern, replacement, df$expr, fixed = TRUE)


library(ggplot2)
library(scales)
ggplot(data = df, aes(x = reorder(expr, -timing), y = timing, fill = expr)) +
    geom_col() +
    scale_y_continuous(breaks = NULL) +
    labs(x = "\nmethod", y = "times") -> p

# Fix x-labels
is.odd <- function(x) x%%2 != 0
labelz <- function(x) paste0(ifelse(is.odd(seq_along(x)), "", "\n\n"), x)
labels <- labelz(df$expr)

p + scale_x_discrete(labels = labels)

ggsave("benchmark-diff.pdf")
	# benchmark computing time differences
	n = 1e6
	set.seed(1)
	Date = sort(replicate(n, paste0(Sys.Date(), " ",
	sample(0:23, 1), ":", sample(0:59, 1), ":", sample(0:59, 1))))
	Date <- as.POSIXct(Date, format="%Y-%m-%d %H:%M:%S", tz = "Europe/Paris")


	diff_seq_along <- function(x) {
	x[seq_along(x)+1]-x[seq_along(x)]
	}

	diff_head_tail <- function(x) {
	head(tail(x, -1) - head(x, -1), -1)
	}

	diff_as_matrix <- function(x) {
	data.frame(diff(as.matrix(as.numeric(x))))
	}

	diff_length <- function(x) {
	x[-1] - x[-length(x)]
	}

	diff_data_table <- function(x) {
	library(data.table)
	dt <- as.data.table(x)
	setDT(dt)[, lapply(.SD, function(x) (x- shift(x))[-1])]
	}

	diff_dplyr <- function(x) {
	x = as.numeric(x)
	library(dplyr)
	x - lag(x) %>%
	na.omit()
	}

	library(microbenchmark)
	bench <- microbenchmark(diff_seq_along(Date), diff_head_tail(Date), diff_as_matrix(Date), diff_length(Date), diff_data_table(Date), times = 1000)

	remove_outliers <- function(x, probs = c(.25, .75), whiskers = 1.5, na.rm = FALSE, ...) {
	qnt <- quantile(x, probs = probs, na.rm = na.rm, ...)
	H <- whiskers * IQR(x, na.rm = na.rm)
	y <- x
	y[x < (qnt[1] - H)] <- NA
	y[x > (qnt[2] + H)] <- NA
	y
	}
	bench$timing <- sapply(bench$time, remove_outliers)


	## library dplyr to compute cumulated sums
	library(dplyr)
	df <- bench %>%
	select(expr, timing) %>%
	group_by(expr) %>%
	mutate(timing = cumsum(timing)) %>%
	slice(which.max(timing)) %>%
	arrange(desc(timing)) # doesn't help in ggplot


	# Clean names
	# Vectorized version of gsub
	gsub2 <- function(pattern, replacement, x, ...) {
	for(i in 1:length(pattern))
	x <- gsub(pattern[i], replacement[i], x, ...)
	x
	}
	key <- list("diff_seq_along(Date)" = "seq_along", "diff_head_tail(Date)" = "head_tail","diff_as_matrix(Date)" = "as_matrix", "diff_length(Date)" = "diff_length", "diff_data_table(Date)" = "data_table") # gsub needs fixed=TRUE to handle parenthesis
	pattern <- names(key)
	replacement <- unname(unlist(key))
	df$expr <- gsub2(pattern, replacement, df$expr, fixed = TRUE)


	library(ggplot2)
	library(scales)
	ggplot(data = df, aes(x = reorder(expr, -timing), y = timing, fill = expr)) +
	geom_col() +
	scale_y_continuous(breaks = NULL) +
	labs(x = "\nmethod", y = "times") -> p

	# Fix x-labels
	is.odd <- function(x) x%%2 != 0
	labelz <- function(x) paste0(ifelse(is.odd(seq_along(x)), "", "\n\n"), x)
	labels <- labelz(df$expr)

	p + scale_x_discrete(labels = labels)

	ggsave("benchmark-diff.pdf")