Skip to content

Instantly share code, notes, and snippets.

@mrdwab
Last active December 6, 2020 00:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mrdwab/97243716dd7a0c11f395820a28b92e10 to your computer and use it in GitHub Desktop.
Save mrdwab/97243716dd7a0c11f395820a28b92e10 to your computer and use it in GitHub Desktop.
Testing the different options shared at https://stackoverflow.com/q/65151555/1270695
## SETUP: Sample data and packages
library(data.table)
library(readr)
library(dplyr)
library(iotools)
# Build the benchmark input: `n` rows, each holding a ";"-joined string of
# between 1 and 2000 random integers drawn from 1:100.
n <- 5000
set.seed(1)  # fixed seed so every run benchmarks the same data
vals_row <- sample(2000, n, TRUE)
DT <- data.table(
  ID = seq_len(n),
  # vapply() pins the result to exactly one string per row; sapply() would
  # silently change its return type on empty or ragged input.
  vals = vapply(
    vals_row,
    function(x) paste(sample(100, x, TRUE), collapse = ";"),
    character(1)
  )
)
# Blank out 5% of the rows so every candidate also has to cope with NA.
DT[sample(n, n * 0.05), vals := NA_character_]
## FUNCTIONS TO TEST
### Custom function to split, sum, and add the column back into the data.table
# split_sum(): total of the ";"-separated numbers held in one string.
#   string: a character value such as "1;2;3"; only the first element of the
#           split result is used, so pass one string at a time.
#   Returns the sum as a double. No `na.rm` is applied, so an NA piece
#   propagates to the result.
split_sum <- function(string) {
  pieces <- stringr::str_split(string, ";")[[1]]
  sum(as.double(pieces))
}
# fun_baraliuh(): dplyr/purrr approach. Reads the global `DT` and replaces its
# `vals` column with the per-row total computed by split_sum().
# Takes no arguments; returns the table produced by mutate() (not a bare
# vector like the other candidates).
fun_baraliuh <- function() {
  DT %>%
    # map_dbl() is namespace-qualified because purrr is not attached by any of
    # the library() calls in this script.
    mutate(vals = purrr::map_dbl(vals, split_sum))
}
### fread, forced to read a specified number of columns
# fun_a5(): per-record sums of delimited values, parsed with fread().
#   col: character vector, one delimited record per element.
#   sep: field delimiter used inside each record.
# A synthetic header line ("V_1", ..., "V_k"), as wide as the longest record,
# is prepended so that k columns are read; `fill = TRUE` pads shorter records.
# Returns rowSums() over the parsed table with NAs ignored.
fun_a5 <- function(col, sep) {
  n_fields <- max(stringi::stri_count_fixed(col, sep), na.rm = TRUE) + 1
  header_line <- paste0("V_", sequence(n_fields), collapse = sep)
  parsed <- fread(
    text = c(header_line, col),
    sep = sep,
    fill = TRUE,
    header = TRUE
  )
  rowSums(parsed, na.rm = TRUE)
}
### read_delim, forced to read a specified number of columns as a specific type
# fun_dave(): per-record sums of delimited integers, parsed with read_delim().
#   col: character vector, one delimited record per element.
#   sep: field delimiter used inside each record.
# A synthetic header line ("V_1", ..., "V_k"), as wide as the longest record,
# is prepended and every column is declared as integer ("i").
# Returns rowSums() over the parsed table with NAs ignored.
fun_dave <- function(col, sep) {
  cols <- max(stringi::stri_count_fixed(col, sep), na.rm = TRUE) + 1
  header_line <- paste0("V_", sequence(cols), collapse = sep)
  parsed <- read_delim(
    # I() marks the vector as literal data; without it readr >= 2.0 treats a
    # multi-element character vector as a set of file paths.
    I(c(header_line, col)),
    # Use the caller's delimiter: the header and the column count above are
    # both derived from `sep`, so a hard-coded ";" breaks any other separator.
    delim = sep,
    col_names = TRUE,
    col_types = paste(rep("i", cols), collapse = "")
  )
  rowSums(parsed, na.rm = TRUE)
}
### read.table, forced to read a specified number of columns and with other
### arguments specified for optimization of reading speed
# fun_base(): per-record sums of delimited integers, parsed with read.table().
#   col: character vector, one delimited record per element.
#   sep: field delimiter used inside each record.
# A synthetic header line ("V_1", ..., "V_k"), as wide as the longest record,
# is prepended; `fill = TRUE` pads shorter records and every column is read
# as integer.
# Returns rowSums() over the parsed data frame with NAs ignored.
fun_base <- function(col, sep) {
  n_fields <- max(stringi::stri_count_fixed(col, sep), na.rm = TRUE) + 1
  header_line <- paste0("V_", sequence(n_fields), collapse = sep)
  parsed <- read.table(
    text = c(header_line, col),
    sep = sep,
    fill = TRUE,
    header = TRUE,
    blank.lines.skip = FALSE,
    colClasses = "integer"
  )
  rowSums(parsed, na.rm = TRUE)
}
### dstrsplit, forced to read a specific number of columns
# fun_iotools_d(): per-record sums of delimited integers, parsed with
# dstrsplit().
#   col: character vector, one delimited record per element.
#   sep: field delimiter used inside each record.
# One "integer" column type is supplied per field of the longest record.
# Returns rowSums() over the split result with NAs ignored.
fun_iotools_d <- function(col, sep) {
  n_fields <- max(stringi::stri_count_fixed(col, sep), na.rm = TRUE) + 1
  column_types <- rep("integer", n_fields)
  split_df <- dstrsplit(col, col_types = column_types, sep = sep)
  rowSums(split_df, na.rm = TRUE)
}
### mstrsplit, forced to read a specified number of columns
# fun_iotools_m(): per-record sums of delimited integers, parsed with
# mstrsplit().
#   col: character vector, one delimited record per element.
#   sep: field delimiter used inside each record.
# The column count is taken from the longest record.
# Returns rowSums() over the split result with NAs ignored.
fun_iotools_m <- function(col, sep) {
  cols <- max(stringi::stri_count_fixed(col, sep), na.rm = TRUE) + 1
  # Split on the caller's delimiter: `cols` is counted with `sep`, so a
  # hard-coded ";" would mis-parse input that uses any other separator.
  parsed <- mstrsplit(col, sep = sep, type = "integer", ncol = cols)
  rowSums(parsed, na.rm = TRUE)
}
## BENCHMARKING
# Time every candidate on the same `DT$vals` column. `check = FALSE` is used
# because the candidates do not return identical objects: fun_baraliuh() gives
# back the whole table and applies no `na.rm`, while the others return a bare
# vector of row sums computed with `na.rm = TRUE`.
bench::mark(
  fun_baraliuh(),
  fun_a5(DT$vals, ";"),
  fun_base(DT$vals, ";"),
  fun_dave(DT$vals, ";"),
  fun_iotools_d(DT$vals, ";"),
  fun_iotools_m(DT$vals, ";"),
  check = FALSE
)
# # A tibble: 6 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
# 1 fun_baraliuh() 1.06s 1.06s 0.939 71.9MB 0 1 0
# 2 fun_a5(DT$vals, ";") 268.64ms 280.84ms 3.56 151.7MB 1.78 2 1
# 3 fun_base(DT$vals, ";") 622.49ms 622.49ms 1.61 192.1MB 1.61 1 1
# 4 fun_dave(DT$vals, ";") 692.94ms 692.94ms 1.44 89.5MB 1.44 1 1
# 5 fun_iotools_d(DT$vals, ";") 528.12ms 528.12ms 1.89 77MB 0 1 0
# 6 fun_iotools_m(DT$vals, ";") 281.8ms 285.08ms 3.51 38.2MB 0 2 0
# # … with 5 more variables: total_time <bch:tm>, result <list>, memory <list>, time <list>,
# # gc <list>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment