Skip to content

Instantly share code, notes, and snippets.

@zachary-waller
Last active November 1, 2020 01:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save zachary-waller/7e725e3d6b3155da15b485c73439a783 to your computer and use it in GitHub Desktop.
Save zachary-waller/7e725e3d6b3155da15b485c73439a783 to your computer and use it in GitHub Desktop.
Recursive Split: Split a dataframe into a nested list and reassemble back into a dataframe
library(purrr)
library(rrapply)
# Split a data frame into a nested list using a different column for each level.
# This uses rrapply::rrapply() to avoid having to do any nested loops (map, lapply,
# for loop, whatever).
# Nested lists can be useful for avoiding searching through your data: the data
# has already been indexed in the list. This can be pretty handy for saving time
# if you need to do lots of filtering stuff.
# You can index nested lists in a number of ways:
# imaginary_list$level_1$level_2$level_3
# imaginary_list[["level_1"]][["level_2"]][["level_3"]]
# imaginary_list[[c("level_1", "level_2", "level_3")]]
# Functions --------------------------------------------------------------------
# Just the usual split function from base R, but takes the split factor from
# a column in the dataframe
split_function <- function(x, column){
split(x, f = x[[column]])
}
# rrapply will apply our split function to our list no matter how nested it is.
# this means we don't have to mess around with nested loops.
# Works like base rapply but is way more flexible
recursive_split <- function(data, column){
rrapply(
data,
f = split_function,
dfaslist = FALSE,
how = "replace",
column = column
)
}
# apply the recursive split function for each column.
# have to put the data in a list at first to avoid treating the data.frame as a
# list. the last step subsets the answer to cancel out this initial list-ing.
# slightly sneaky use of .init argument to make it clear that argument 1 and 2
# are different. we're essentially doing:
# f(data, column_1) %>% f(column_2) %>% f(column_3) ....
apply_recursive_split <- function(data, columns){
data_list <- list(data)
result <- reduce(
.x = columns,
.f = recursive_split,
.init = data_list
)
result[[1]]
}
# rebind the data into a data.frame
rebind <- function(data){
# we can flatten our nested list into one list of data.frames and bind together
flat_data_list <- rrapply(
data,
f = identity,
dfaslist = FALSE,
how = "flatten"
)
# go along the list binding everything up
rebound_data <- reduce(flat_data_list, rbind)
# the original split keeps row names which we can use to reorder and be
# identical to the original data.frame
# pretty sneaky. won't work with tibbles I imagine
order <- as.character(1:nrow(rebound_data))
rebound_data[order, ]
}
# Example ----------------------------------------------------------------------
size <- 500
data <- data.frame(
x = sample(letters[1:3], size = size, replace = TRUE),
y = sample(letters[24:26], size = size, replace = TRUE),
z = sample(letters[13:14], size = size, replace = TRUE),
a = sample(letters[9:12], size = size, replace = TRUE),
value1 = rnorm(n = size),
value2 = rnorm(n = size, mean = 10),
stringsAsFactors = FALSE
)
split_columns <- c("x", "y", "z", "a")
# make a big recursive list with all our data in
data_list <- apply_recursive_split(data, split_columns)
# you can do some fun things here to index it:
data_list[[c("a", "x", "m", "l")]]
# rebind the data together into a data.frame
rebound_data <- rebind(data_list)
identical(rebound_data, data)
# Tibbles ----------------------------------------------------------------------
library(tibble)
# with tibbles we lose order, but the data is still the same
data_tibble <- tibble(
x = sample(letters[1:3], size = size, replace = TRUE),
y = sample(letters[24:26], size = size, replace = TRUE),
z = sample(letters[13:14], size = size, replace = TRUE),
a = sample(letters[9:12], size = size, replace = TRUE),
value1 = rnorm(n = size),
value2 = rnorm(n = size, mean = 10)
)
rebound_tibble <- apply_recursive_split(data_tibble, split_columns) %>%
rebind()
# tibbles have no row names, so we lose order!
identical(data_tibble, rebound_tibble)
# the data is still the same though
identical(
arrange(rebound_tibble, x, y, z, a, value1, value2),
arrange(data_tibble, x, y, z, a, value1, value2)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment