Last active
April 14, 2016 12:13
-
-
Save jonocarroll/119e9db260783d7b459fd8fe4636150d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Inspired by http://www.milanor.net/blog/how-to-sort-a-list-of-dataframes-in-r/ | |
## as seen on R-bloggers http://www.r-bloggers.com/how-to-sort-a-list-of-dataframes/ | |
## | |
## Blogged @ http://jcarroll.com.au/2016/04/14/simpler-isnt-always-faster/ | |
## | |
## input data.frame | |
df_list_in <- list ( | |
df_1 = data.frame(x = 1:5, y = 5:1), | |
df_2 = data.frame(x = 6:10, y = 10:6), | |
df_3 = data.frame(x = 11:15, y = 15:11) | |
) | |
## Michy's function | |
sort_list_df <- function(df_l) | |
{ | |
out <- do.call(cbind, df_l) | |
number_of_vars <- ncol(df_l[[1]]) | |
name_of_dfs <- names(out) | |
out_l <- list() | |
for(i in 1:number_of_vars) | |
{ | |
index <- seq(i,ncol(out),number_of_vars) | |
tempdf = out[, names(out) %in% name_of_dfs[index]] | |
names(tempdf) <- names(df_l) | |
out_l[[i]] = tempdf | |
} | |
names(out_l) <- names(df_l[[1]]) | |
return(out_l) | |
} | |
out1 <- sort_list_df(df_list_in) | |
## Michy's simpler, slower function | |
arrange_col <- function(dl) | |
{ | |
n_col <- ncol(dl[[1]]) | |
out_names <- names(dl[[1]]) | |
out <- lapply(1:n_col, function(i, x) as.data.frame(lapply(x, '[[' , i = i)) , dl) | |
names(out) <- out_names | |
return(out) | |
} | |
out2 <- arrange_col(df_list_in) | |
## my much simpler, and not quite as slow function. | |
## originally I wrote this with dplyr %>% pipes | |
## but performance was a metric here, and it's only | |
## two calls, so it's not too bad to have it inside-out. | |
## the dplyr version would be | |
## df_l %>% transpose %>% lapply(as.data.frame) | |
library(purrr) | |
one_line <- function(df_l) | |
{ | |
return(lapply(transpose(df_l), as.data.frame)) | |
} | |
out3 <- one_line(df_list_in) | |
out3 | |
# $x | |
# df_1 df_2 df_3 | |
# 1 1 6 11 | |
# 2 2 7 12 | |
# 3 3 8 13 | |
# 4 4 9 14 | |
# 5 5 10 15 | |
# | |
# $y | |
# df_1 df_2 df_3 | |
# 1 5 10 15 | |
# 2 4 9 14 | |
# 3 3 8 13 | |
# 4 2 7 12 | |
# 5 1 6 11 | |
## check that everything is identical | |
identical(out1, out2) # TRUE | |
identical(out2, out3) # TRUE | |
identical(out1, out3) # TRUE | |
## compare runtimes | |
library(microbenchmark) | |
benc <- microbenchmark( | |
sort_list_df(df_list_in), | |
arrange_col(df_list_in), | |
one_line(df_list_in), | |
times=500 | |
) | |
print(benc) | |
## Unit: microseconds | |
## expr min lq mean median uq max neval cld | |
## sort_list_df(df_list_in) 240.222 274.2015 296.7311 288.8200 311.1435 540.105 500 a | |
## arrange_col(df_list_in) 593.445 636.9060 731.2037 660.0195 691.8250 24563.550 500 c | |
## one_line(df_list_in) 513.238 549.9830 580.8978 568.1580 593.4445 1205.855 500 b | |
## Michy's plot, re-formatted | |
library(ggplot2) | |
gg <- ggplot(benc, aes(factor(expr),log(time))) | |
gg <- gg + geom_violin(aes(color=expr), alpha=0.5) | |
gg <- gg + theme(panel.background=element_rect(fill='white', colour='black')) | |
gg | |
## add a pirateplot, because why not? | |
# devtools::install_github("ndphillips/yarrr") | |
library(yarrr) | |
pirateplot(formula = log(time) ~ factor(expr), | |
data = benc, | |
main = "microbenchmark:\nsort a list of data.frame", | |
theme.o = 2, | |
pal = "southpark", | |
ylim = c(12,14.5)) | |
## what if it's a much bigger list of data.frames? list of 100 data.frames with 1000 obs of 50 variables | |
big_df_list_in <- lapply(1:100, function(x) data.frame(sapply(1:50, function(w) runif(1:1000)))) | |
names(big_df_list_in) <- paste0("df_",1:100) | |
dplyr::glimpse(big_df_list_in[[1]]) | |
# Observations: 1,000 | |
# Variables: 50 | |
# $ X1 (dbl) 0.2966716231, 0.3538330414, 0.0448659470, 0.2937828905, 0.7930346071, 0.4137793423, 0.7743210872, 0.3127177814, 0.656974... | |
# $ X2 (dbl) 0.82962028, 0.21320742, 0.95326983, 0.89924338, 0.75857729, 0.14837233, 0.74098594, 0.54081206, 0.49517866, 0.02357297, ... | |
# $ X3 (dbl) 0.173777247, 0.171110514, 0.189419500, 0.592488737, 0.902025229, 0.072496757, 0.948060831, 0.628953351, 0.512456346, 0.4... | |
# $ X4 (dbl) 0.07345945, 0.98106424, 0.26032020, 0.02724460, 0.52261013, 0.34855762, 0.93784573, 0.75675636, 0.95289076, 0.44867368, ... | |
# $ X5 (dbl) 0.34220361, 0.16708164, 0.04796200, 0.56744041, 0.49101299, 0.65740350, 0.16193917, 0.41767614, 0.25781740, 0.94494895, ... | |
# $ X6 (dbl) 0.16496589, 0.91162822, 0.75505293, 0.09460809, 0.54367284, 0.63851894, 0.35411227, 0.77503809, 0.82819200, 0.03405401, ... | |
big_out1 <- sort_list_df(big_df_list_in) | |
big_out2 <- arrange_col(big_df_list_in) | |
big_out3 <- one_line(big_df_list_in) | |
## check that everything is still identical | |
identical(big_out1, big_out2) # TRUE | |
identical(big_out2, big_out3) # TRUE | |
identical(big_out1, big_out3) # TRUE | |
## re-do the benchmark | |
big_benc <- microbenchmark( | |
sort_list_df(big_df_list_in), | |
arrange_col(big_df_list_in), | |
one_line(big_df_list_in), | |
times=500 | |
) | |
print(big_benc) | |
# Unit: milliseconds | |
# expr min lq mean median uq max neval | |
# sort_list_df(big_df_list_in) 19.49822 20.0301 20.80944 20.18902 20.49014 46.14253 500 | |
# arrange_col(big_df_list_in) 411.76263 419.7032 436.01017 423.05405 429.35310 860.03367 500 | |
# one_line(big_df_list_in) 359.03099 368.0472 382.02442 372.35207 376.92105 794.06048 500 | |
## add a pirateplot, because why not? | |
# devtools::install_github("ndphillips/yarrr") | |
library(yarrr) | |
pirateplot(formula = log(time) ~ factor(expr), | |
data = big_benc, | |
main = "microbenchmark:\nsort a list of big data.frames", | |
theme.o = 2, | |
pal = "southpark", | |
ylim = c(16.5,21)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment