@arthurgailes
Created April 3, 2024 16:13
duckplyr csv reading benchmark
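
Benchmarks the time to read a ~1GB CSV of mixed string and numeric columns with readr, data.table, and duckplyr, then charts the median times.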
# Load required packages (pacman installs any that are missing; hrbrthemes is
# added here because the plotting code below calls hrbrthemes::theme_ipsum())
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
pacman::p_load(
  stringi, data.table, duckplyr, readr, dplyr, collapse, duckdb, dbplyr, bench,
  ggplot2, hrbrthemes
)
# Function to generate a dataframe chunk of random word and numeric columns
generate_data_chunk <- function(num_rows = 1000, num_cols = 100) {
  # Generate numeric columns
  numeric_cols <- replicate(n = num_cols / 2, expr = runif(num_rows, 1, 10000), simplify = FALSE)
  # Generate word columns
  word_cols <- replicate(n = num_cols / 2, expr = stri_rand_strings(num_rows, 10), simplify = FALSE)
  # Combine word and numeric columns into a single data frame
  data <- data.frame(matrix(ncol = num_cols, nrow = num_rows))
  colnames(data) <- c(paste0("WordColumn", 1:(num_cols / 2)), paste0("NumericColumn", 1:(num_cols / 2)))
  data[, paste0("WordColumn", 1:(num_cols / 2))] <- word_cols
  data[, paste0("NumericColumn", 1:(num_cols / 2))] <- numeric_cols
  return(data)
}
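# Quick sanity check (an addition, not in the original gist): confirm a small
# chunk has the expected shape and column types before generating 1GB of data.
check_chunk <- generate_data_chunk(num_rows = 10, num_cols = 4)
stopifnot(
  identical(dim(check_chunk), c(10L, 4L)),
  is.character(check_chunk$WordColumn1),
  is.numeric(check_chunk$NumericColumn1)
)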
# Target file size in bytes (1GB)
target_file_size <- 1 * 1024^3
# File to accumulate data (create the output directory if it doesn't exist)
dir.create("data", showWarnings = FALSE)
temp_file_path <- "data/large_dataset.csv"
# Write an initial chunk to establish the file with the correct headers
initial_chunk <- generate_data_chunk()
fwrite(initial_chunk, file = temp_file_path)
# Keep appending data until the file size exceeds the target size
while (file.info(temp_file_path)$size < target_file_size) {
  chunk <- generate_data_chunk()
  # Append data without repeating the header row
  fwrite(chunk, file = temp_file_path, append = TRUE, col.names = FALSE)
}
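# Report the size actually written (an added convenience check, not in the
# original gist).
cat(sprintf("Generated %.2f GB at %s\n", file.info(temp_file_path)$size / 1024^3, temp_file_path))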
# Time how long each reader takes to load the file (the group_by/summarize
# step from the original comment is sketched after this benchmark)
read_times <- bench::mark(
  readr = {
    readr::read_csv(temp_file_path, show_col_types = FALSE)
  },
  readr_lazy = {
    readr::read_csv(temp_file_path, lazy = TRUE, show_col_types = FALSE)
  },
  data.table = {
    data.table::fread(temp_file_path)
  },
  duckplyr = {
    duckplyr::duckplyr_df_from_csv(temp_file_path)
  },
  check = FALSE # results differ in class across readers, so skip equality checks
)
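# A minimal sketch (not part of the original benchmark) of the read +
# group_by + summarize timing the comment above alludes to. The column names
# (WordColumn1, NumericColumn1) come from the generated schema; the choice of
# engines and verbs here is an assumption.
summary_times <- bench::mark(
  duckplyr = {
    duckplyr::duckplyr_df_from_csv(temp_file_path) |>
      group_by(WordColumn1) |>
      summarize(mean_val = mean(NumericColumn1)) |>
      as.data.frame()
  },
  data.table = {
    fread(temp_file_path)[, .(mean_val = mean(NumericColumn1)), by = WordColumn1]
  },
  check = FALSE
)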
p <- ggplot(read_times) +
  geom_col(aes(x = expression, y = median), fill = "#ff1f51", alpha = 0.5) +
  hrbrthemes::theme_ipsum() +
  scale_y_continuous(breaks = c(1, 5, 10, 30)) +
  theme(
    axis.text = element_text(size = 14),
    plot.background = element_rect(fill = "#e6e6e6")
  ) +
  labs(
    title = "Time to read 1GB CSV",
    x = "", y = "Total time (s)"
  )
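# Display and save the chart (an addition; the filename and dimensions are
# assumptions, not from the original gist).
print(p)
ggsave("duckplyr_read_benchmark.png", plot = p, width = 8, height = 5)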