# zachary-waller/recursive_split

## recursive_split
library(purrr)
library(rrapply)

# Split a data frame into a nested list using a different column for each level.
# This uses rrapply::rrapply() to avoid having to do any nested loops (map, lapply,
# for loop, whatever).

# Nested lists can be useful for avoiding searching through your data: the data
# has already been indexed in the list. This can be pretty handy for saving time
# if you need to do lots of filtering stuff.
# You can index nested lists in a number of ways:
# imaginary_list$level_1$level_2$level_3
# imaginary_list[["level_1"]][["level_2"]][["level_3"]]
# imaginary_list[[c("level_1", "level_2", "level_3")]]

# Functions --------------------------------------------------------------------
# Split a data frame into a named list of data frames, one per distinct value
# of a single column. Thin wrapper around base split() that takes the grouping
# values from a column of `x` itself.
#
# x:      a data.frame (or tibble) to split.
# column: name (or position) of the column in `x` to group by.
#
# Returns a list of data frames named after the group values. As with base
# split(), rows whose grouping value is NA are dropped.
split_function <- function(x, column){

  groups <- x[[column]]

  # `[[` returns NULL for a column name that doesn't exist, which would make
  # split() fail with an error that never mentions the column. fail clearly.
  if (is.null(groups)) {
    stop("column '", column, "' not found in data", call. = FALSE)
  }

  split(x, f = groups)

}

# Push every data frame found in `data` one level deeper by splitting it on
# `column`.
#
# data:   a (possibly nested) list whose leaves are data frames.
# column: column to split each leaf on; forwarded to split_function().
#
# rrapply() does the tree walk, so no hand-written nested loops are needed
# however deep the list already is. how = "replace" puts the value returned by
# split_function() where the leaf used to be, and dfaslist = FALSE is there so
# that data frames are handed over whole instead of being treated as lists.
# NOTE(review): newer rrapply releases appear to express this with
# classes = "data.frame" instead of dfaslist -- confirm against the installed
# version.
recursive_split <- function(data, column){
  split_leaves <- rrapply(
    data,
    f = split_function,
    column = column,
    how = "replace",
    dfaslist = FALSE
  )

  split_leaves
}

# Build the nested list: split `data` on each of `columns` in turn, so the
# first column gives the top level of the list, the second the level below it,
# and so on.
#
# data:    the data frame to split.
# columns: columns to split on, outermost level first.
#
# In effect this computes
#   f(f(f(list(data), columns[1]), columns[2]), columns[3]) ...
# with f = recursive_split. The data frame is wrapped in a one-element list
# before folding so that it is an element of the list being walked rather than
# being treated as a list itself; the final `[[1]]` removes that wrapper again.
apply_recursive_split <- function(data, columns){

  wrapped <- list(data)

  # left fold: the accumulated nested list is always the first argument and
  # the next column the second
  nested <- Reduce(f = recursive_split, x = columns, init = wrapped)

  nested[[1]]
}

# Collapse a nested list of data frames (as built by apply_recursive_split())
# back into a single data frame.
#
# data: nested list whose leaves are data frames.
#
# Returns one data frame holding the rows of every leaf. For plain data.frames
# the rows come back in their original order, because split() keeps the
# original row names on each piece. Tibbles don't keep row names, so for them
# the content is preserved but the original order is not.
# Stops with an error if `data` contains no data frames.
rebind <- function(data){

  # flatten the nested list into one flat list of data.frames; dfaslist = FALSE
  # so each data.frame is kept whole rather than treated as a list
  flat_data_list <- rrapply(
    data,
    f = identity,
    dfaslist = FALSE,
    how = "flatten"
  )

  if (length(flat_data_list) == 0) {
    stop("`data` contains no data frames to bind", call. = FALSE)
  }

  # bind all the pieces in one rbind() call instead of pairwise, which would
  # re-copy the growing result at every step. the list is unnamed first because
  # rbind() on data frames works argument names into the row names, and the
  # reordering below needs the original row names untouched.
  rebound_data <- do.call(rbind, unname(flat_data_list))

  # the original split keeps row names which we can use to reorder and be
  # identical to the original data.frame.
  # seq_len() rather than 1:n so a zero-row result doesn't ask for rows
  # "1" and "0".
  row_order <- as.character(seq_len(nrow(rebound_data)))

  rebound_data[row_order, ]

}

# Example ----------------------------------------------------------------------
# number of rows in the toy data set
size <- 500
# four character columns with a handful of distinct values each (these are the
# columns we split on) plus two numeric value columns. no seed is set, so the
# data differs from run to run.
data <- data.frame(
  x = sample(letters[1:3], size = size, replace = TRUE),
  y = sample(letters[24:26], size = size, replace = TRUE),
  z = sample(letters[13:14], size = size, replace = TRUE),
  a = sample(letters[9:12], size = size, replace = TRUE),
  value1 = rnorm(n = size),
  value2 = rnorm(n = size, mean = 10),
  stringsAsFactors = FALSE
)

# columns to nest by, outermost level first
split_columns <- c("x", "y", "z", "a")

# make a big recursive list with all our data in
data_list <- apply_recursive_split(data, split_columns)

# you can do some fun things here to index it: give one value per split
# column, in the same order as split_columns
# (here x == "a", y == "x", z == "m", a == "l")
data_list[[c("a", "x", "m", "l")]]

# rebind the data together into a data.frame
rebound_data <- rebind(data_list)

# round-trip check: is the rebound data exactly the data we started with?
identical(rebound_data, data)


# Tibbles ----------------------------------------------------------------------
library(tibble)

# Same round trip with a tibble: with tibbles we lose order, but the data is
# still the same
data_tibble <- tibble(
  x = sample(letters[1:3], size = size, replace = TRUE),
  y = sample(letters[24:26], size = size, replace = TRUE),
  z = sample(letters[13:14], size = size, replace = TRUE),
  a = sample(letters[9:12], size = size, replace = TRUE),
  value1 = rnorm(n = size),
  value2 = rnorm(n = size, mean = 10)
)

rebound_tibble <- apply_recursive_split(data_tibble, split_columns) %>%
  rebind()

# tibbles have no row names, so we lose order!
identical(data_tibble, rebound_tibble)

# the data is still the same though: sort both by every column, left to right
# (x, y, z, a, value1, value2), and compare.
# base order() is used here because this script only attaches purrr, rrapply
# and tibble, none of which provides arrange().
sort_rows <- function(tbl){
  # unname() so column names are never mistaken for arguments of order()
  tbl[do.call(order, unname(as.list(tbl))), ]
}

identical(
  sort_rows(rebound_tibble),
  sort_rows(data_tibble)
)
	library(purrr)
	library(rrapply)

	# Split a data frame into a nested list using a different column for each level.
	# This uses rrapply::rrapply() to avoid having to do any nested loops (map, lapply,
	# for loop, whatever).

	# Nested lists can be useful for avoiding searching through your data: the data
	# has already been indexed in the list. This can be pretty handy for saving time
	# if you need to do lots of filtering stuff.
	# You can index nested lists in a number of ways:
	# imaginary_list$level_1$level_2$level_3
	# imaginary_list[["level_1"]][["level_2"]][["level_3"]]
	# imaginary_list[[c("level_1", "level_2", "level_3")]]

	# Functions --------------------------------------------------------------------
	# Split a data frame into a named list of data frames, one per distinct value
	# of a single column. Thin wrapper around base split() that takes the grouping
	# values from a column of `x` itself.
	#
	# x:      a data.frame (or tibble) to split.
	# column: name (or position) of the column in `x` to group by.
	#
	# Returns a list of data frames named after the group values. As with base
	# split(), rows whose grouping value is NA are dropped.
	split_function <- function(x, column){

	  groups <- x[[column]]

	  # `[[` returns NULL for a column name that doesn't exist, which would make
	  # split() fail with an error that never mentions the column. fail clearly.
	  if (is.null(groups)) {
	    stop("column '", column, "' not found in data", call. = FALSE)
	  }

	  split(x, f = groups)

	}

	# Push every data frame found in `data` one level deeper by splitting it on
	# `column`.
	#
	# data:   a (possibly nested) list whose leaves are data frames.
	# column: column to split each leaf on; forwarded to split_function().
	#
	# rrapply() does the tree walk, so no hand-written nested loops are needed
	# however deep the list already is. how = "replace" puts the value returned by
	# split_function() where the leaf used to be, and dfaslist = FALSE is there so
	# that data frames are handed over whole instead of being treated as lists.
	# NOTE(review): newer rrapply releases appear to express this with
	# classes = "data.frame" instead of dfaslist -- confirm against the installed
	# version.
	recursive_split <- function(data, column){
	  split_leaves <- rrapply(
	    data,
	    f = split_function,
	    column = column,
	    how = "replace",
	    dfaslist = FALSE
	  )

	  split_leaves
	}

	# Build the nested list: split `data` on each of `columns` in turn, so the
	# first column gives the top level of the list, the second the level below it,
	# and so on.
	#
	# data:    the data frame to split.
	# columns: columns to split on, outermost level first.
	#
	# In effect this computes
	#   f(f(f(list(data), columns[1]), columns[2]), columns[3]) ...
	# with f = recursive_split. The data frame is wrapped in a one-element list
	# before folding so that it is an element of the list being walked rather than
	# being treated as a list itself; the final `[[1]]` removes that wrapper again.
	apply_recursive_split <- function(data, columns){

	  wrapped <- list(data)

	  # left fold: the accumulated nested list is always the first argument and
	  # the next column the second
	  nested <- Reduce(f = recursive_split, x = columns, init = wrapped)

	  nested[[1]]
	}

	# Collapse a nested list of data frames (as built by apply_recursive_split())
	# back into a single data frame.
	#
	# data: nested list whose leaves are data frames.
	#
	# Returns one data frame holding the rows of every leaf. For plain data.frames
	# the rows come back in their original order, because split() keeps the
	# original row names on each piece. Tibbles don't keep row names, so for them
	# the content is preserved but the original order is not.
	# Stops with an error if `data` contains no data frames.
	rebind <- function(data){

	  # flatten the nested list into one flat list of data.frames; dfaslist = FALSE
	  # so each data.frame is kept whole rather than treated as a list
	  flat_data_list <- rrapply(
	    data,
	    f = identity,
	    dfaslist = FALSE,
	    how = "flatten"
	  )

	  if (length(flat_data_list) == 0) {
	    stop("`data` contains no data frames to bind", call. = FALSE)
	  }

	  # bind all the pieces in one rbind() call instead of pairwise, which would
	  # re-copy the growing result at every step. the list is unnamed first because
	  # rbind() on data frames works argument names into the row names, and the
	  # reordering below needs the original row names untouched.
	  rebound_data <- do.call(rbind, unname(flat_data_list))

	  # the original split keeps row names which we can use to reorder and be
	  # identical to the original data.frame.
	  # seq_len() rather than 1:n so a zero-row result doesn't ask for rows
	  # "1" and "0".
	  row_order <- as.character(seq_len(nrow(rebound_data)))

	  rebound_data[row_order, ]

	}

	# Example ----------------------------------------------------------------------
	# number of rows in the toy data set
	size <- 500
	# four character columns with a handful of distinct values each (these are the
	# columns we split on) plus two numeric value columns. no seed is set, so the
	# data differs from run to run.
	data <- data.frame(
	x = sample(letters[1:3], size = size, replace = TRUE),
	y = sample(letters[24:26], size = size, replace = TRUE),
	z = sample(letters[13:14], size = size, replace = TRUE),
	a = sample(letters[9:12], size = size, replace = TRUE),
	value1 = rnorm(n = size),
	value2 = rnorm(n = size, mean = 10),
	stringsAsFactors = FALSE
	)

	# columns to nest by, outermost level first
	split_columns <- c("x", "y", "z", "a")

	# make a big recursive list with all our data in
	data_list <- apply_recursive_split(data, split_columns)

	# you can do some fun things here to index it: give one value per split
	# column, in the same order as split_columns
	# (here x == "a", y == "x", z == "m", a == "l")
	data_list[[c("a", "x", "m", "l")]]

	# rebind the data together into a data.frame
	rebound_data <- rebind(data_list)

	# round-trip check: is the rebound data exactly the data we started with?
	identical(rebound_data, data)


	# Tibbles ----------------------------------------------------------------------
	library(tibble)

	# Same round trip with a tibble: with tibbles we lose order, but the data is
	# still the same
	data_tibble <- tibble(
	  x = sample(letters[1:3], size = size, replace = TRUE),
	  y = sample(letters[24:26], size = size, replace = TRUE),
	  z = sample(letters[13:14], size = size, replace = TRUE),
	  a = sample(letters[9:12], size = size, replace = TRUE),
	  value1 = rnorm(n = size),
	  value2 = rnorm(n = size, mean = 10)
	)

	rebound_tibble <- apply_recursive_split(data_tibble, split_columns) %>%
	  rebind()

	# tibbles have no row names, so we lose order!
	identical(data_tibble, rebound_tibble)

	# the data is still the same though: sort both by every column, left to right
	# (x, y, z, a, value1, value2), and compare.
	# base order() is used here because this script only attaches purrr, rrapply
	# and tibble, none of which provides arrange().
	sort_rows <- function(tbl){
	  # unname() so column names are never mistaken for arguments of order()
	  tbl[do.call(order, unname(as.list(tbl))), ]
	}

	identical(
	  sort_rows(rebound_tibble),
	  sort_rows(data_tibble)
	)