Skip to content

Instantly share code, notes, and snippets.

@jonocarroll
Last active April 14, 2016 12:13
Show Gist options
  • Save jonocarroll/119e9db260783d7b459fd8fe4636150d to your computer and use it in GitHub Desktop.
Save jonocarroll/119e9db260783d7b459fd8fe4636150d to your computer and use it in GitHub Desktop.
## Inspired by http://www.milanor.net/blog/how-to-sort-a-list-of-dataframes-in-r/
## as seen on R-bloggers http://www.r-bloggers.com/how-to-sort-a-list-of-dataframes/
##
## Blogged @ http://jcarroll.com.au/2016/04/14/simpler-isnt-always-faster/
##
## input data.frame
df_list_in <- list (
df_1 = data.frame(x = 1:5, y = 5:1),
df_2 = data.frame(x = 6:10, y = 10:6),
df_3 = data.frame(x = 11:15, y = 15:11)
)
## Michy's function
sort_list_df <- function(df_l)
{
out <- do.call(cbind, df_l)
number_of_vars <- ncol(df_l[[1]])
name_of_dfs <- names(out)
out_l <- list()
for(i in 1:number_of_vars)
{
index <- seq(i,ncol(out),number_of_vars)
tempdf = out[, names(out) %in% name_of_dfs[index]]
names(tempdf) <- names(df_l)
out_l[[i]] = tempdf
}
names(out_l) <- names(df_l[[1]])
return(out_l)
}
out1 <- sort_list_df(df_list_in)
## Michy's simpler, slower function
arrange_col <- function(dl)
{
n_col <- ncol(dl[[1]])
out_names <- names(dl[[1]])
out <- lapply(1:n_col, function(i, x) as.data.frame(lapply(x, '[[' , i = i)) , dl)
names(out) <- out_names
return(out)
}
out2 <- arrange_col(df_list_in)
## my much simpler, and not quite as slow function.
## originally I wrote this with dplyr %>% pipes
## but performance was a metric here, and it's only
## two calls, so it's not too bad to have it inside-out.
## the dplyr version would be
## df_l %>% transpose %>% lapply(as.data.frame)
library(purrr)
one_line <- function(df_l)
{
return(lapply(transpose(df_l), as.data.frame))
}
out3 <- one_line(df_list_in)
out3
# $x
# df_1 df_2 df_3
# 1 1 6 11
# 2 2 7 12
# 3 3 8 13
# 4 4 9 14
# 5 5 10 15
#
# $y
# df_1 df_2 df_3
# 1 5 10 15
# 2 4 9 14
# 3 3 8 13
# 4 2 7 12
# 5 1 6 11
## check that everything is identical
identical(out1, out2) # TRUE
identical(out2, out3) # TRUE
identical(out1, out3) # TRUE
## compare runtimes
library(microbenchmark)
benc <- microbenchmark(
sort_list_df(df_list_in),
arrange_col(df_list_in),
one_line(df_list_in),
times=500
)
print(benc)
## Unit: microseconds
## expr min lq mean median uq max neval cld
## sort_list_df(df_list_in) 240.222 274.2015 296.7311 288.8200 311.1435 540.105 500 a
## arrange_col(df_list_in) 593.445 636.9060 731.2037 660.0195 691.8250 24563.550 500 c
## one_line(df_list_in) 513.238 549.9830 580.8978 568.1580 593.4445 1205.855 500 b
## Michy's plot, re-formatted
library(ggplot2)
gg <- ggplot(benc, aes(factor(expr),log(time)))
gg <- gg + geom_violin(aes(color=expr), alpha=0.5)
gg <- gg + theme(panel.background=element_rect(fill='white', colour='black'))
gg
## add a pirateplot, because why not?
# devtools::install_github("ndphillips/yarrr")
library(yarrr)
pirateplot(formula = log(time) ~ factor(expr),
data = benc,
main = "microbenchmark:\nsort a list of data.frame",
theme.o = 2,
pal = "southpark",
ylim = c(12,14.5))
## what if it's a much bigger list of data.frames? list of 100 data.frames with 1000 obs of 50 variables
big_df_list_in <- lapply(1:100, function(x) data.frame(sapply(1:50, function(w) runif(1:1000))))
names(big_df_list_in) <- paste0("df_",1:100)
dplyr::glimpse(big_df_list_in[[1]])
# Observations: 1,000
# Variables: 50
# $ X1 (dbl) 0.2966716231, 0.3538330414, 0.0448659470, 0.2937828905, 0.7930346071, 0.4137793423, 0.7743210872, 0.3127177814, 0.656974...
# $ X2 (dbl) 0.82962028, 0.21320742, 0.95326983, 0.89924338, 0.75857729, 0.14837233, 0.74098594, 0.54081206, 0.49517866, 0.02357297, ...
# $ X3 (dbl) 0.173777247, 0.171110514, 0.189419500, 0.592488737, 0.902025229, 0.072496757, 0.948060831, 0.628953351, 0.512456346, 0.4...
# $ X4 (dbl) 0.07345945, 0.98106424, 0.26032020, 0.02724460, 0.52261013, 0.34855762, 0.93784573, 0.75675636, 0.95289076, 0.44867368, ...
# $ X5 (dbl) 0.34220361, 0.16708164, 0.04796200, 0.56744041, 0.49101299, 0.65740350, 0.16193917, 0.41767614, 0.25781740, 0.94494895, ...
# $ X6 (dbl) 0.16496589, 0.91162822, 0.75505293, 0.09460809, 0.54367284, 0.63851894, 0.35411227, 0.77503809, 0.82819200, 0.03405401, ...
big_out1 <- sort_list_df(big_df_list_in)
big_out2 <- arrange_col(big_df_list_in)
big_out3 <- one_line(big_df_list_in)
## check that everything is still identical
identical(big_out1, big_out2) # TRUE
identical(big_out2, big_out3) # TRUE
identical(big_out1, big_out3) # TRUE
## re-do the benchmark
big_benc <- microbenchmark(
sort_list_df(big_df_list_in),
arrange_col(big_df_list_in),
one_line(big_df_list_in),
times=500
)
print(big_benc)
# Unit: milliseconds
# expr min lq mean median uq max neval
# sort_list_df(big_df_list_in) 19.49822 20.0301 20.80944 20.18902 20.49014 46.14253 500
# arrange_col(big_df_list_in) 411.76263 419.7032 436.01017 423.05405 429.35310 860.03367 500
# one_line(big_df_list_in) 359.03099 368.0472 382.02442 372.35207 376.92105 794.06048 500
## add a pirateplot, because why not?
# devtools::install_github("ndphillips/yarrr")
library(yarrr)
pirateplot(formula = log(time) ~ factor(expr),
data = big_benc,
main = "microbenchmark:\nsort a list of big data.frames",
theme.o = 2,
pal = "southpark",
ylim = c(16.5,21))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment