Skip to content

Instantly share code, notes, and snippets.

@topepo
Created May 7, 2020 17:15
Show Gist options
  • Save topepo/95cc2ac90d3342f6b4cfd30d24b04bc8 to your computer and use it in GitHub Desktop.
Save topepo/95cc2ac90d3342f6b4cfd30d24b04bc8 to your computer and use it in GitHub Desktop.
library(tidymodels)
library(Matrix)
library(lobstr)
# ------------------------------------------------------------------------------
# Generate one random label of the form "<prefix>_<chars>".
#
# Arguments:
#   prefix    String placed in front of the underscore.
#   len       Number of characters drawn (with replacement) for the suffix.
#   num_vals  Upper bound on how many candidate characters may be used. The
#             pool is ordered a-z, A-Z, 0-9 (62 symbols), so a smaller value
#             restricts sampling to the leading part of the pool; `Inf` uses
#             the whole pool.
#
# Returns a length-one character vector. The draw uses R's RNG, so call
# set.seed() beforehand for reproducible output.
rand_values <- function(prefix = "step", len = 3, num_vals = Inf) {
  candidates <- c(letters, LETTERS, as.character(0:9))
  pool_size <- min(length(candidates), num_vals)
  # `1:pool_size` would count down to c(1, 0) for a pool size of zero and
  # silently sample from "a" only; reject unusable sizes explicitly instead.
  if (is.na(pool_size) || pool_size < 1) {
    stop("`num_vals` must be a number >= 1.", call. = FALSE)
  }
  # floor() keeps the truncating behaviour that `1:pool_size` had for
  # fractional sizes.
  candidates <- candidates[seq_len(floor(pool_size))]
  suffix <- paste0(sample(candidates, len, replace = TRUE), collapse = "")
  paste(prefix, suffix, sep = "_")
}
# ------------------------------------------------------------------------------
# Simulate `n` random labels and encode them as a single factor column.
n <- 10000
set.seed(3462)
dat <- data.frame(
  x = factor(
    map_chr(seq_len(n), function(i) rand_values("x", num_vals = Inf))
  )
)
# Expand the factor into indicator columns (no intercept): once as a
# non-sparse matrix and once as a sparse matrix.
dn_mat <- model.matrix(~ x - 1, data = dat)
sp_mat <- sparse.model.matrix(~ x - 1, data = dat)
# Re-express the sparse matrix in compressed row-oriented form.
sp_mat_row <- as(sp_mat, "RsparseMatrix")
dim(sp_mat_row)
# ------------------------------------------------------------------------------
# Pull the row(s) selected by `ind` out of the matrix `x`, keeping the
# result two-dimensional, and return it with its names removed.
sbset <- function(ind, x) {
  unname(x[ind, , drop = FALSE])
}
# Break the row-oriented sparse matrix into a list holding one single-row
# matrix per row. seq_len() is used instead of 1:nrow() so that a zero-row
# matrix yields an empty list rather than indexing rows 1 and 0.
listed <- purrr::map(seq_len(nrow(sp_mat_row)), ~ sbset(.x, sp_mat_row))
# Keep the pieces as a list-column of a tibble.
sp_tbl <- tibble(indicators = listed)
sp_tbl
# Tabulate the column counts and row counts of the pieces.
map_int(listed, ncol) %>% table()
map_int(listed, nrow) %>% table()
# ------------------------------------------------------------------------------
# Memory cost of storing the rows separately, relative to the single
# row-oriented sparse matrix: first for the tibble, then for the bare list.
as.numeric(obj_size(sp_tbl)) / as.numeric(obj_size(sp_mat_row))
as.numeric(obj_size(listed)) / as.numeric(obj_size(sp_mat_row))
# Size of the list of rows relative to the non-sparse matrix.
as.numeric(obj_size(listed)) / as.numeric(obj_size(dn_mat))
# ------------------------------------------------------------------------------
# Time how long it takes to stack the single-row pieces back together.
system.time({
  reassembled <- do.call("rbind", listed)
})
# Compare the rebuilt matrix with the original sparse matrix, ignoring
# attributes.
all.equal(reassembled, sp_mat, check.attributes = FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment