jonocarroll/sortdf.R

## sortdf.R
## Inspired by http://www.milanor.net/blog/how-to-sort-a-list-of-dataframes-in-r/
## as seen on R-bloggers http://www.r-bloggers.com/how-to-sort-a-list-of-dataframes/
##
## Blogged @ http://jcarroll.com.au/2016/04/14/simpler-isnt-always-faster/
##

## input data.frame
df_list_in <- list (
  df_1 = data.frame(x = 1:5,   y = 5:1),
  df_2 = data.frame(x = 6:10,  y = 10:6),
  df_3 = data.frame(x = 11:15, y = 15:11)
)

## Michy's function
sort_list_df <- function(df_l)
{
  out <- do.call(cbind, df_l)
  number_of_vars <- ncol(df_l[[1]])
  name_of_dfs <- names(out)
  out_l <- list()

  for(i in 1:number_of_vars)
  {
    index <- seq(i,ncol(out),number_of_vars)
    tempdf = out[, names(out) %in% name_of_dfs[index]]
    names(tempdf) <- names(df_l)
    out_l[[i]] = tempdf
  }

  names(out_l) <- names(df_l[[1]])
  return(out_l)
}
out1 <- sort_list_df(df_list_in)

## Michy's simpler, slower function
arrange_col <- function(dl)
{
  n_col <- ncol(dl[[1]])
  out_names <- names(dl[[1]])
  out <- lapply(1:n_col, function(i, x) as.data.frame(lapply(x, '[[' , i = i)) , dl)
  names(out) <- out_names
  return(out)
}
out2 <- arrange_col(df_list_in)

## my much simpler, and not quite as slow function.
## originally I wrote this with dplyr %>% pipes
## but performance was a metric here, and it's only
## two calls, so it's not too bad to have it inside-out.
## the dplyr version would be
## df_l %>% transpose %>% lapply(as.data.frame)
library(purrr)
one_line <- function(df_l)
{
  return(lapply(transpose(df_l), as.data.frame))
}
out3 <- one_line(df_list_in)

out3
# $x
# df_1 df_2 df_3
# 1    1    6   11
# 2    2    7   12
# 3    3    8   13
# 4    4    9   14
# 5    5   10   15
#
# $y
# df_1 df_2 df_3
# 1    5   10   15
# 2    4    9   14
# 3    3    8   13
# 4    2    7   12
# 5    1    6   11

## check that everything is identical
identical(out1, out2) # TRUE
identical(out2, out3) # TRUE
identical(out1, out3) # TRUE

## compare runtimes
library(microbenchmark)
benc <- microbenchmark(
  sort_list_df(df_list_in),
  arrange_col(df_list_in),
  one_line(df_list_in),
  times=500
)
print(benc)
## Unit: microseconds
##                     expr     min       lq     mean   median       uq       max neval cld
## sort_list_df(df_list_in) 240.222 274.2015 296.7311 288.8200 311.1435   540.105   500 a
## arrange_col(df_list_in)  593.445 636.9060 731.2037 660.0195 691.8250 24563.550   500   c
## one_line(df_list_in)     513.238 549.9830 580.8978 568.1580 593.4445  1205.855   500  b

## Michy's plot, re-formatted
library(ggplot2)
gg <- ggplot(benc, aes(factor(expr),log(time)))
gg <- gg + geom_violin(aes(color=expr), alpha=0.5)
gg <- gg + theme(panel.background=element_rect(fill='white', colour='black'))
gg

## add a pirateplot, because why not?
# devtools::install_github("ndphillips/yarrr")
library(yarrr)
pirateplot(formula = log(time) ~ factor(expr),
           data    = benc,
           main    = "microbenchmark:\nsort a list of data.frame",
           theme.o = 2,
           pal     = "southpark",
           ylim    = c(12,14.5))

## what if it's a much bigger list of data.frames? list of 100 data.frames with 1000 obs of 50 variables
big_df_list_in <- lapply(1:100, function(x) data.frame(sapply(1:50, function(w) runif(1:1000))))
names(big_df_list_in) <- paste0("df_",1:100)

dplyr::glimpse(big_df_list_in[[1]])
# Observations: 1,000
# Variables: 50
# $ X1  (dbl) 0.2966716231, 0.3538330414, 0.0448659470, 0.2937828905, 0.7930346071, 0.4137793423, 0.7743210872, 0.3127177814, 0.656974...
# $ X2  (dbl) 0.82962028, 0.21320742, 0.95326983, 0.89924338, 0.75857729, 0.14837233, 0.74098594, 0.54081206, 0.49517866, 0.02357297, ...
# $ X3  (dbl) 0.173777247, 0.171110514, 0.189419500, 0.592488737, 0.902025229, 0.072496757, 0.948060831, 0.628953351, 0.512456346, 0.4...
# $ X4  (dbl) 0.07345945, 0.98106424, 0.26032020, 0.02724460, 0.52261013, 0.34855762, 0.93784573, 0.75675636, 0.95289076, 0.44867368, ...
# $ X5  (dbl) 0.34220361, 0.16708164, 0.04796200, 0.56744041, 0.49101299, 0.65740350, 0.16193917, 0.41767614, 0.25781740, 0.94494895, ...
# $ X6  (dbl) 0.16496589, 0.91162822, 0.75505293, 0.09460809, 0.54367284, 0.63851894, 0.35411227, 0.77503809, 0.82819200, 0.03405401, ...

big_out1 <- sort_list_df(big_df_list_in)
big_out2 <- arrange_col(big_df_list_in)
big_out3 <- one_line(big_df_list_in)

## check that everything is still identical
identical(big_out1, big_out2) # TRUE
identical(big_out2, big_out3) # TRUE
identical(big_out1, big_out3) # TRUE

## re-do the benchmark
big_benc <- microbenchmark(
  sort_list_df(big_df_list_in),
  arrange_col(big_df_list_in),
  one_line(big_df_list_in),
  times=500
)
print(big_benc)
# Unit: milliseconds
#                         expr       min       lq      mean    median        uq       max neval
# sort_list_df(big_df_list_in)  19.49822  20.0301  20.80944  20.18902  20.49014  46.14253   500
#  arrange_col(big_df_list_in) 411.76263 419.7032 436.01017 423.05405 429.35310 860.03367   500
#     one_line(big_df_list_in) 359.03099 368.0472 382.02442 372.35207 376.92105 794.06048   500

## add a pirateplot, because why not?
# devtools::install_github("ndphillips/yarrr")
library(yarrr)
pirateplot(formula = log(time) ~ factor(expr),
           data    = big_benc,
           main    = "microbenchmark:\nsort a list of big data.frames",
           theme.o = 2,
           pal     = "southpark",
           ylim    = c(16.5,21))
	## Inspired by http://www.milanor.net/blog/how-to-sort-a-list-of-dataframes-in-r/
	## as seen on R-bloggers http://www.r-bloggers.com/how-to-sort-a-list-of-dataframes/
	##
	## Blogged @ http://jcarroll.com.au/2016/04/14/simpler-isnt-always-faster/
	##

	## input data.frame
	df_list_in <- list (
	df_1 = data.frame(x = 1:5, y = 5:1),
	df_2 = data.frame(x = 6:10, y = 10:6),
	df_3 = data.frame(x = 11:15, y = 15:11)
	)

	## Michy's function
	sort_list_df <- function(df_l)
	{
	out <- do.call(cbind, df_l)
	number_of_vars <- ncol(df_l[[1]])
	name_of_dfs <- names(out)
	out_l <- list()

	for(i in 1:number_of_vars)
	{
	index <- seq(i,ncol(out),number_of_vars)
	tempdf = out[, names(out) %in% name_of_dfs[index]]
	names(tempdf) <- names(df_l)
	out_l[[i]] = tempdf
	}

	names(out_l) <- names(df_l[[1]])
	return(out_l)
	}
	out1 <- sort_list_df(df_list_in)

	## Michy's simpler, slower function
	arrange_col <- function(dl)
	{
	n_col <- ncol(dl[[1]])
	out_names <- names(dl[[1]])
	out <- lapply(1:n_col, function(i, x) as.data.frame(lapply(x, '[[' , i = i)) , dl)
	names(out) <- out_names
	return(out)
	}
	out2 <- arrange_col(df_list_in)

	## my much simpler, and not quite as slow function.
	## originally I wrote this with dplyr %>% pipes
	## but performance was a metric here, and it's only
	## two calls, so it's not too bad to have it inside-out.
	## the dplyr version would be
	## df_l %>% transpose %>% lapply(as.data.frame)
	library(purrr)
	one_line <- function(df_l)
	{
	return(lapply(transpose(df_l), as.data.frame))
	}
	out3 <- one_line(df_list_in)

	out3
	# $x
	# df_1 df_2 df_3
	# 1 1 6 11
	# 2 2 7 12
	# 3 3 8 13
	# 4 4 9 14
	# 5 5 10 15
	#
	# $y
	# df_1 df_2 df_3
	# 1 5 10 15
	# 2 4 9 14
	# 3 3 8 13
	# 4 2 7 12
	# 5 1 6 11

	## check that everything is identical
	identical(out1, out2) # TRUE
	identical(out2, out3) # TRUE
	identical(out1, out3) # TRUE

	## compare runtimes
	library(microbenchmark)
	benc <- microbenchmark(
	sort_list_df(df_list_in),
	arrange_col(df_list_in),
	one_line(df_list_in),
	times=500
	)
	print(benc)
	## Unit: microseconds
	## expr min lq mean median uq max neval cld
	## sort_list_df(df_list_in) 240.222 274.2015 296.7311 288.8200 311.1435 540.105 500 a
	## arrange_col(df_list_in) 593.445 636.9060 731.2037 660.0195 691.8250 24563.550 500 c
	## one_line(df_list_in) 513.238 549.9830 580.8978 568.1580 593.4445 1205.855 500 b

	## Michy's plot, re-formatted
	library(ggplot2)
	gg <- ggplot(benc, aes(factor(expr),log(time)))
	gg <- gg + geom_violin(aes(color=expr), alpha=0.5)
	gg <- gg + theme(panel.background=element_rect(fill='white', colour='black'))
	gg

	## add a pirateplot, because why not?
	# devtools::install_github("ndphillips/yarrr")
	library(yarrr)
	pirateplot(formula = log(time) ~ factor(expr),
	data = benc,
	main = "microbenchmark:\nsort a list of data.frame",
	theme.o = 2,
	pal = "southpark",
	ylim = c(12,14.5))

	## what if it's a much bigger list of data.frames? list of 100 data.frames with 1000 obs of 50 variables
	big_df_list_in <- lapply(1:100, function(x) data.frame(sapply(1:50, function(w) runif(1:1000))))
	names(big_df_list_in) <- paste0("df_",1:100)

	dplyr::glimpse(big_df_list_in[[1]])
	# Observations: 1,000
	# Variables: 50
	# $ X1 (dbl) 0.2966716231, 0.3538330414, 0.0448659470, 0.2937828905, 0.7930346071, 0.4137793423, 0.7743210872, 0.3127177814, 0.656974...
	# $ X2 (dbl) 0.82962028, 0.21320742, 0.95326983, 0.89924338, 0.75857729, 0.14837233, 0.74098594, 0.54081206, 0.49517866, 0.02357297, ...
	# $ X3 (dbl) 0.173777247, 0.171110514, 0.189419500, 0.592488737, 0.902025229, 0.072496757, 0.948060831, 0.628953351, 0.512456346, 0.4...
	# $ X4 (dbl) 0.07345945, 0.98106424, 0.26032020, 0.02724460, 0.52261013, 0.34855762, 0.93784573, 0.75675636, 0.95289076, 0.44867368, ...
	# $ X5 (dbl) 0.34220361, 0.16708164, 0.04796200, 0.56744041, 0.49101299, 0.65740350, 0.16193917, 0.41767614, 0.25781740, 0.94494895, ...
	# $ X6 (dbl) 0.16496589, 0.91162822, 0.75505293, 0.09460809, 0.54367284, 0.63851894, 0.35411227, 0.77503809, 0.82819200, 0.03405401, ...

	big_out1 <- sort_list_df(big_df_list_in)
	big_out2 <- arrange_col(big_df_list_in)
	big_out3 <- one_line(big_df_list_in)

	## check that everything is still identical
	identical(big_out1, big_out2) # TRUE
	identical(big_out2, big_out3) # TRUE
	identical(big_out1, big_out3) # TRUE

	## re-do the benchmark
	big_benc <- microbenchmark(
	sort_list_df(big_df_list_in),
	arrange_col(big_df_list_in),
	one_line(big_df_list_in),
	times=500
	)
	print(big_benc)
	# Unit: milliseconds
	# expr min lq mean median uq max neval
	# sort_list_df(big_df_list_in) 19.49822 20.0301 20.80944 20.18902 20.49014 46.14253 500
	# arrange_col(big_df_list_in) 411.76263 419.7032 436.01017 423.05405 429.35310 860.03367 500
	# one_line(big_df_list_in) 359.03099 368.0472 382.02442 372.35207 376.92105 794.06048 500

	## add a pirateplot, because why not?
	# devtools::install_github("ndphillips/yarrr")
	library(yarrr)
	pirateplot(formula = log(time) ~ factor(expr),
	data = big_benc,
	main = "microbenchmark:\nsort a list of big data.frames",
	theme.o = 2,
	pal = "southpark",
	ylim = c(16.5,21))