@arthurgailes
Created April 3, 2024 16:13
duckplyr csv reading benchmark
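
Benchmarks the time to read a ~1GB CSV of mixed string and numeric columns with readr, data.table, and duckplyr, then charts the median times.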
# Load required packages (pacman installs any that are missing; hrbrthemes is
# added here because the plotting code below calls hrbrthemes::theme_ipsum())
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
pacman::p_load(
  stringi, data.table, duckplyr, readr, dplyr, collapse, duckdb, dbplyr, bench,
  ggplot2, hrbrthemes
)
# Function to generate a dataframe chunk of random word and numeric columns
generate_data_chunk <- function(num_rows = 1000, num_cols = 100) {
  # Generate numeric columns
  numeric_cols <- replicate(n = num_cols / 2, expr = runif(num_rows, 1, 10000), simplify = FALSE)
  # Generate word columns
  word_cols <- replicate(n = num_cols / 2, expr = stri_rand_strings(num_rows, 10), simplify = FALSE)
  # Combine word and numeric columns into a single data frame
  data <- data.frame(matrix(ncol = num_cols, nrow = num_rows))
  colnames(data) <- c(paste0("WordColumn", 1:(num_cols / 2)), paste0("NumericColumn", 1:(num_cols / 2)))
  data[, paste0("WordColumn", 1:(num_cols / 2))] <- word_cols
  data[, paste0("NumericColumn", 1:(num_cols / 2))] <- numeric_cols
  return(data)
}
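# Quick sanity check (an addition, not in the original gist): confirm a small
# chunk has the expected shape and column types before generating 1GB of data.
check_chunk <- generate_data_chunk(num_rows = 10, num_cols = 4)
stopifnot(
  identical(dim(check_chunk), c(10L, 4L)),
  is.character(check_chunk$WordColumn1),
  is.numeric(check_chunk$NumericColumn1)
)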
# Target file size in bytes (1GB)
target_file_size <- 1 * 1024^3
# File to accumulate data (create the output directory if it doesn't exist)
dir.create("data", showWarnings = FALSE)
temp_file_path <- "data/large_dataset.csv"
# Write an initial chunk to establish the file with the correct headers
initial_chunk <- generate_data_chunk()
fwrite(initial_chunk, file = temp_file_path)
# Keep appending data until the file size exceeds the target size
while (file.info(temp_file_path)$size < target_file_size) {
  chunk <- generate_data_chunk()
  # Append data without repeating the header row
  fwrite(chunk, file = temp_file_path, append = TRUE, col.names = FALSE)
}
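# Report the size actually written (an added convenience check, not in the
# original gist).
cat(sprintf("Generated %.2f GB at %s\n", file.info(temp_file_path)$size / 1024^3, temp_file_path))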
# Time how long each reader takes to load the file (the group_by/summarize
# step from the original comment is sketched after this benchmark)
read_times <- bench::mark(
  readr = {
    readr::read_csv(temp_file_path, show_col_types = FALSE)
  },
  readr_lazy = {
    readr::read_csv(temp_file_path, lazy = TRUE, show_col_types = FALSE)
  },
  data.table = {
    data.table::fread(temp_file_path)
  },
  duckplyr = {
    duckplyr::duckplyr_df_from_csv(temp_file_path)
  },
  check = FALSE # results differ in class across readers, so skip equality checks
)
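# A minimal sketch (not part of the original benchmark) of the read +
# group_by + summarize timing the comment above alludes to. The column names
# (WordColumn1, NumericColumn1) come from the generated schema; the choice of
# engines and verbs here is an assumption.
summary_times <- bench::mark(
  duckplyr = {
    duckplyr::duckplyr_df_from_csv(temp_file_path) |>
      group_by(WordColumn1) |>
      summarize(mean_val = mean(NumericColumn1)) |>
      as.data.frame()
  },
  data.table = {
    fread(temp_file_path)[, .(mean_val = mean(NumericColumn1)), by = WordColumn1]
  },
  check = FALSE
)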
p <- ggplot(read_times) +
  geom_col(aes(x = expression, y = median), fill = "#ff1f51", alpha = 0.5) +
  hrbrthemes::theme_ipsum() +
  scale_y_continuous(breaks = c(1, 5, 10, 30)) +
  theme(
    axis.text = element_text(size = 14),
    plot.background = element_rect(fill = "#e6e6e6")
  ) +
  labs(
    title = "Time to read 1GB CSV",
    x = "", y = "Total time (s)"
  )
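# Display and save the chart (an addition; the filename and dimensions are
# assumptions, not from the original gist).
print(p)
ggsave("duckplyr_read_benchmark.png", plot = p, width = 8, height = 5)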