Skip to content

Instantly share code, notes, and snippets.

@gongcastro
Created July 5, 2020 22:31
Show Gist options
  • Save gongcastro/35aa0ae28380fc0ff48c7df71b387b20 to your computer and use it in GitHub Desktop.
Save gongcastro/35aa0ae28380fc0ff48c7df71b387b20 to your computer and use it in GitHub Desktop.
Accompanying code for my 2020-07-05 post on how to efficiently import and merge multiple datasets in R, comparing several combinations of available functions.
#### 2020-07-05_import-multiple ###########################
# Gonzalo García-Castro, gonzalo.garciadecastro@upf.edu
# Center for Brain and Cognition, Universitat Pompeu Fabra
#### set up ###############################################
# load packages
library(dplyr)
library(tidyr)
library(ggplot2)
library(data.table)
library(here)
# set params
reps <- 100 # number of replications of the import-merge operation
n <- 50 # number of files to import
n_obs <- 10000 # number of rows per file
#### generate data ###############
# create dataset names (seq_len() stays empty when n is 0, unlike 1:n)
filenames <- sprintf("dataset%03d", seq_len(n))
# create list of dataframes: a `dataset` label column followed by 10 columns
# of random 0/1 draws, n_obs rows each
files <- lapply(
  filenames,
  function(x) {
    data.frame(
      dataset = x,
      replicate(10, sample(0:1, n_obs, replace = TRUE))
    )
  }
)
# export every dataframe to tempdir() twice: tab-separated (.txt) and
# comma-separated (.csv); a plain loop is used because only the side effect
# (the written file) matters
for (k in seq_along(files)) {
  write.table(
    files[[k]],
    file.path(tempdir(), paste0(filenames[k], ".txt")),
    sep = "\t", dec = ".", row.names = FALSE
  )
  write.table(
    files[[k]],
    file.path(tempdir(), paste0(filenames[k], ".csv")),
    sep = ",", dec = ".", row.names = FALSE
  )
}
#### for loop ####################
# Time repeated import-and-merge runs that read the files one by one inside
# an explicit for loop.
#   ext        file extension without the dot ("txt" or "csv"); every file in
#              tempdir() whose name ends in that extension is imported
#   read_fun   function that reads one file path into a data frame
#   merge_with how the list of data frames is combined: "docall" for
#              do.call(rbind, ...), "bindrows" for dplyr::bind_rows(),
#              "rbind" for data.table::rbindlist()
#   n_reps     number of repetitions (defaults to the script-level `reps`)
# Returns a numeric vector with the elapsed seconds of each repetition.
time_forloop_import <- function(ext,
                                read_fun,
                                merge_with = c("docall", "bindrows", "rbind"),
                                n_reps = reps) {
  force(read_fun)
  merge_fun <- switch(
    match.arg(merge_with),
    docall = function(x) do.call(rbind, x),
    bindrows = bind_rows,
    rbind = rbindlist
  )
  # anchored regex: a bare ".txt" would match any character followed by "txt"
  # anywhere in the file name
  pattern <- paste0("\\.", ext, "$")
  elapsed <- numeric(n_reps)
  for (i in seq_len(n_reps)) {
    tic <- Sys.time()
    data <- list()
    filepaths <- list.files(tempdir(), full.names = TRUE, pattern = pattern)
    for (j in seq_along(filepaths)) {
      data[[j]] <- read_fun(filepaths[j])
    }
    data <- merge_fun(data)
    # pin the unit: `Sys.time() - tic` chooses secs/mins/hours on its own and
    # the unit is lost once the value is stored in a numeric vector
    elapsed[i] <- as.numeric(difftime(Sys.time(), tic, units = "secs"))
  }
  elapsed
}
# for loop + base
time_forloop_base_docall_txt <-
  time_forloop_import("txt", read.delim, "docall")
time_forloop_base_docall_csv <-
  time_forloop_import("csv", read.csv, "docall")
time_forloop_base_bindrows_txt <-
  time_forloop_import("txt", read.delim, "bindrows")
time_forloop_base_bindrows_csv <-
  time_forloop_import("csv", read.csv, "bindrows")
time_forloop_base_rbind_txt <-
  time_forloop_import("txt", read.delim, "rbind")
time_forloop_base_rbind_csv <-
  time_forloop_import("csv", read.csv, "rbind")
# for loop + readr
time_forloop_readr_docall_txt <-
  time_forloop_import("txt", readr::read_tsv, "docall")
time_forloop_readr_docall_csv <-
  time_forloop_import("csv", readr::read_csv, "docall")
time_forloop_readr_bindrows_txt <-
  time_forloop_import("txt", readr::read_tsv, "bindrows")
time_forloop_readr_bindrows_csv <-
  time_forloop_import("csv", readr::read_csv, "bindrows")
time_forloop_readr_rbind_txt <-
  time_forloop_import("txt", readr::read_tsv, "rbind")
time_forloop_readr_rbind_csv <-
  time_forloop_import("csv", readr::read_csv, "rbind")
# for loop + data.table
time_forloop_datatable_docall_txt <-
  time_forloop_import("txt", data.table::fread, "docall")
time_forloop_datatable_docall_csv <-
  time_forloop_import("csv", data.table::fread, "docall")
time_forloop_datatable_bindrows_txt <-
  time_forloop_import("txt", data.table::fread, "bindrows")
time_forloop_datatable_bindrows_csv <-
  time_forloop_import("csv", data.table::fread, "bindrows")
time_forloop_datatable_rbind_txt <-
  time_forloop_import("txt", data.table::fread, "rbind")
time_forloop_datatable_rbind_csv <-
  time_forloop_import("csv", data.table::fread, "rbind")
#### lapply ####################
# Time repeated import-and-merge runs that read the files with lapply().
#   ext        file extension without the dot ("txt" or "csv"); every file in
#              tempdir() whose name ends in that extension is imported
#   read_fun   function that reads one file path into a data frame
#   merge_with how the list of data frames is combined: "docall" for
#              do.call(rbind, ...), "bindrows" for dplyr::bind_rows(),
#              "rbind" for data.table::rbindlist()
#   n_reps     number of repetitions (defaults to the script-level `reps`)
# Returns a numeric vector with the elapsed seconds of each repetition.
time_lapply_import <- function(ext,
                               read_fun,
                               merge_with = c("docall", "bindrows", "rbind"),
                               n_reps = reps) {
  force(read_fun)
  merge_fun <- switch(
    match.arg(merge_with),
    docall = function(x) do.call(rbind, x),
    bindrows = bind_rows,
    rbind = rbindlist
  )
  # anchored regex: a bare ".txt" would match any character followed by "txt"
  # anywhere in the file name
  pattern <- paste0("\\.", ext, "$")
  elapsed <- numeric(n_reps)
  for (i in seq_len(n_reps)) {
    tic <- Sys.time()
    filepaths <- list.files(tempdir(), full.names = TRUE, pattern = pattern)
    data <- lapply(filepaths, read_fun)
    data <- merge_fun(data)
    # pin the unit: `Sys.time() - tic` chooses secs/mins/hours on its own and
    # the unit is lost once the value is stored in a numeric vector
    elapsed[i] <- as.numeric(difftime(Sys.time(), tic, units = "secs"))
  }
  elapsed
}
# lapply + base
time_lapply_base_docall_txt <-
  time_lapply_import("txt", read.delim, "docall")
time_lapply_base_docall_csv <-
  time_lapply_import("csv", read.csv, "docall")
time_lapply_base_bindrows_txt <-
  time_lapply_import("txt", read.delim, "bindrows")
time_lapply_base_bindrows_csv <-
  time_lapply_import("csv", read.csv, "bindrows")
time_lapply_base_rbind_txt <-
  time_lapply_import("txt", read.delim, "rbind")
time_lapply_base_rbind_csv <-
  time_lapply_import("csv", read.csv, "rbind")
# lapply + readr
time_lapply_readr_docall_txt <-
  time_lapply_import("txt", readr::read_tsv, "docall")
time_lapply_readr_docall_csv <-
  time_lapply_import("csv", readr::read_csv, "docall")
time_lapply_readr_bindrows_txt <-
  time_lapply_import("txt", readr::read_tsv, "bindrows")
time_lapply_readr_bindrows_csv <-
  time_lapply_import("csv", readr::read_csv, "bindrows")
time_lapply_readr_rbind_txt <-
  time_lapply_import("txt", readr::read_tsv, "rbind")
time_lapply_readr_rbind_csv <-
  time_lapply_import("csv", readr::read_csv, "rbind")
# lapply + data.table
time_lapply_datatable_docall_txt <-
  time_lapply_import("txt", data.table::fread, "docall")
time_lapply_datatable_docall_csv <-
  time_lapply_import("csv", data.table::fread, "docall")
time_lapply_datatable_bindrows_txt <-
  time_lapply_import("txt", data.table::fread, "bindrows")
time_lapply_datatable_bindrows_csv <-
  time_lapply_import("csv", data.table::fread, "bindrows")
time_lapply_datatable_rbind_txt <-
  time_lapply_import("txt", data.table::fread, "rbind")
time_lapply_datatable_rbind_csv <-
  time_lapply_import("csv", data.table::fread, "rbind")
#### tidy #############################
# Time repeated import-and-merge runs that read the files with purrr::map()
# and pipe the resulting list into the merge step.
#   ext        file extension without the dot ("txt" or "csv"); every file in
#              tempdir() whose name ends in that extension is imported
#   read_fun   function that reads one file path into a data frame
#   merge_with how the list of data frames is combined: "docall" for
#              do.call(rbind, ...), "bindrows" for dplyr::bind_rows(),
#              "rbind" for data.table::rbindlist()
#   n_reps     number of repetitions (defaults to the script-level `reps`)
# Returns a numeric vector with the elapsed seconds of each repetition.
time_map_import <- function(ext,
                            read_fun,
                            merge_with = c("docall", "bindrows", "rbind"),
                            n_reps = reps) {
  force(read_fun)
  merge_fun <- switch(
    match.arg(merge_with),
    docall = function(x) do.call(rbind, x),
    bindrows = bind_rows,
    rbind = rbindlist
  )
  # anchored regex: a bare ".txt" would match any character followed by "txt"
  # anywhere in the file name
  pattern <- paste0("\\.", ext, "$")
  elapsed <- numeric(n_reps)
  for (i in seq_len(n_reps)) {
    tic <- Sys.time()
    filepaths <- list.files(tempdir(), full.names = TRUE, pattern = pattern)
    # map() is called through its namespace because purrr is not attached by
    # the library() calls at the top of the script
    data <- purrr::map(filepaths, read_fun) %>%
      merge_fun()
    # pin the unit: `Sys.time() - tic` chooses secs/mins/hours on its own and
    # the unit is lost once the value is stored in a numeric vector
    elapsed[i] <- as.numeric(difftime(Sys.time(), tic, units = "secs"))
  }
  elapsed
}
# tidy + base
time_map_base_docall_txt <-
  time_map_import("txt", read.delim, "docall")
time_map_base_docall_csv <-
  time_map_import("csv", read.csv, "docall")
time_map_base_bindrows_txt <-
  time_map_import("txt", read.delim, "bindrows")
time_map_base_bindrows_csv <-
  time_map_import("csv", read.csv, "bindrows")
time_map_base_rbind_txt <-
  time_map_import("txt", read.delim, "rbind")
time_map_base_rbind_csv <-
  time_map_import("csv", read.csv, "rbind")
# tidy + readr
# the .txt files are tab-separated, so they go through read_tsv (not read_csv)
time_map_readr_docall_txt <-
  time_map_import("txt", readr::read_tsv, "docall")
time_map_readr_docall_csv <-
  time_map_import("csv", readr::read_csv, "docall")
time_map_readr_bindrows_txt <-
  time_map_import("txt", readr::read_tsv, "bindrows")
time_map_readr_bindrows_csv <-
  time_map_import("csv", readr::read_csv, "bindrows")
time_map_readr_rbind_txt <-
  time_map_import("txt", readr::read_tsv, "rbind")
time_map_readr_rbind_csv <-
  time_map_import("csv", readr::read_csv, "rbind")
# tidy + data.table
time_map_datatable_docall_txt <-
  time_map_import("txt", data.table::fread, "docall")
time_map_datatable_docall_csv <-
  time_map_import("csv", data.table::fread, "docall")
time_map_datatable_bindrows_txt <-
  time_map_import("txt", data.table::fread, "bindrows")
time_map_datatable_bindrows_csv <-
  time_map_import("csv", data.table::fread, "bindrows")
time_map_datatable_rbind_txt <-
  time_map_import("txt", data.table::fread, "rbind")
time_map_datatable_rbind_csv <-
  time_map_import("csv", data.table::fread, "rbind")
# Collect every timing vector into one list. The order is fixed and matters
# for labelling: within each vectorisation (for loop, lapply, map) come the
# packages (base, readr, data.table), within each package the merge methods
# (do.call, bind_rows, rbindlist), and within each merge method the two file
# formats (txt first, then csv).
scores <- list(
  time_forloop_base_docall_txt, time_forloop_base_docall_csv,
  time_forloop_base_bindrows_txt, time_forloop_base_bindrows_csv,
  time_forloop_base_rbind_txt, time_forloop_base_rbind_csv,
  time_forloop_readr_docall_txt, time_forloop_readr_docall_csv,
  time_forloop_readr_bindrows_txt, time_forloop_readr_bindrows_csv,
  time_forloop_readr_rbind_txt, time_forloop_readr_rbind_csv,
  time_forloop_datatable_docall_txt, time_forloop_datatable_docall_csv,
  time_forloop_datatable_bindrows_txt, time_forloop_datatable_bindrows_csv,
  time_forloop_datatable_rbind_txt, time_forloop_datatable_rbind_csv,
  time_lapply_base_docall_txt, time_lapply_base_docall_csv,
  time_lapply_base_bindrows_txt, time_lapply_base_bindrows_csv,
  time_lapply_base_rbind_txt, time_lapply_base_rbind_csv,
  time_lapply_readr_docall_txt, time_lapply_readr_docall_csv,
  time_lapply_readr_bindrows_txt, time_lapply_readr_bindrows_csv,
  time_lapply_readr_rbind_txt, time_lapply_readr_rbind_csv,
  time_lapply_datatable_docall_txt, time_lapply_datatable_docall_csv,
  time_lapply_datatable_bindrows_txt, time_lapply_datatable_bindrows_csv,
  time_lapply_datatable_rbind_txt, time_lapply_datatable_rbind_csv,
  time_map_base_docall_txt, time_map_base_docall_csv,
  time_map_base_bindrows_txt, time_map_base_bindrows_csv,
  time_map_base_rbind_txt, time_map_base_rbind_csv,
  time_map_readr_docall_txt, time_map_readr_docall_csv,
  time_map_readr_bindrows_txt, time_map_readr_bindrows_csv,
  time_map_readr_rbind_txt, time_map_readr_rbind_csv,
  time_map_datatable_docall_txt, time_map_datatable_docall_csv,
  time_map_datatable_bindrows_txt, time_map_datatable_bindrows_csv,
  time_map_datatable_rbind_txt, time_map_datatable_rbind_csv
)
#### merge all scores #################
# Label every timing vector in `scores` with its design cell and expand to
# one row per replication. expand.grid() varies its FIRST argument fastest,
# so the factors are listed in the nesting order of `scores`: format changes
# fastest (txt, then csv), then merge method, then package, then
# vectorisation. Listing them in any other order attaches the wrong labels.
# NOTE(review): the "data.table::rbindList" label is kept verbatim (the
# function itself is spelled rbindlist) in case downstream code matches on it.
times <- expand.grid(
  format = c(".txt", ".csv"),
  merge = c("do.call", "dplyr::bind_rows", "data.table::rbindList"),
  package = c("base", "readr", "data.table"),
  vectorisation = c("for loop", "lapply", "purrr::map")
) %>%
  mutate(time = scores) %>%
  unnest(time) %>%
  # keep the column order of the exported file unchanged
  select(package, format, vectorisation, merge, time)
#### export data ######################
# save the long-format timings as a tab-separated file under static/data,
# resolved relative to the project root by here()
output_path <- here("static", "data", "2020-07-05_import-multiple.txt")
fwrite(x = times, file = output_path, sep = "\t")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment