# mpettis/r-purrr-named-lists-of-dataframes-to-single-dataframe.R

## r-purrr-named-lists-of-dataframes-to-single-dataframe.R
# Ref: https://jennybc.github.io/purrr-tutorial/ls02_map-extraction-advanced.html#list_inside_a_data_frame
# Ref: https://github.com/tidyverse/tidyr/issues/22

suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(purrr))

# Break `iris` apart into a named list holding one data frame per level of
# `Species`, then show the structure of the resulting list.
my_list <- split(x = iris, f = iris[["Species"]])
str(object = my_list)
#> List of 3
#>  $ setosa    :'data.frame':  50 obs. of  5 variables:
#>   ..$ Sepal.Length: num [1:50] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
#>   ..$ Sepal.Width : num [1:50] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
#>   ..$ Petal.Length: num [1:50] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
#>   ..$ Petal.Width : num [1:50] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
#>   ..$ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
#>  $ versicolor:'data.frame':  50 obs. of  5 variables:
#>   ..$ Sepal.Length: num [1:50] 7 6.4 6.9 5.5 6.5 5.7 6.3 4.9 6.6 5.2 ...
#>   ..$ Sepal.Width : num [1:50] 3.2 3.2 3.1 2.3 2.8 2.8 3.3 2.4 2.9 2.7 ...
#>   ..$ Petal.Length: num [1:50] 4.7 4.5 4.9 4 4.6 4.5 4.7 3.3 4.6 3.9 ...
#>   ..$ Petal.Width : num [1:50] 1.4 1.5 1.5 1.3 1.5 1.3 1.6 1 1.3 1.4 ...
#>   ..$ Species     : Factor w/ 3 levels "setosa","versicolor",..: 2 2 2 2 2 2 2 2 2 2 ...
#>  $ virginica :'data.frame':  50 obs. of  5 variables:
#>   ..$ Sepal.Length: num [1:50] 6.3 5.8 7.1 6.3 6.5 7.6 4.9 7.3 6.7 7.2 ...
#>   ..$ Sepal.Width : num [1:50] 3.3 2.7 3 2.9 3 3 2.5 2.9 2.5 3.6 ...
#>   ..$ Petal.Length: num [1:50] 6 5.1 5.9 5.6 5.8 6.6 4.5 6.3 5.8 6.1 ...
#>   ..$ Petal.Width : num [1:50] 2.5 1.9 2.1 1.8 2.2 2.1 1.7 1.8 1.8 2.5 ...
#>   ..$ Species     : Factor w/ 3 levels "setosa","versicolor",..: 3 3 3 3 3 3 3 3 3 3 ...


# I'd like to compute a `fivenum()` on each of the numeric columns.
# First, `fivenum()` gives a vector of 5 numbers as output, but
# doesn't label them.  The documentation says what they are, but I'd
# like them in the output as well.  Here we make a unary function
# that will compute the fivenum function, but the output structure is
# a data frame with useful column names.
# Unary function (magrittr functional sequence): takes a numeric vector and
# returns its `fivenum()` summary as a one-row tibble with labelled columns.
# The names follow the order of the values returned by `fivenum()`:
# minimum, lower hinge, median, upper hinge, maximum.
# `as_tibble()` is used instead of `as_data_frame()`, which tibble has
# deprecated; dplyr re-exports `as_tibble()`, so no extra library is needed.
fivenum2df <- . %>%
  fivenum() %>%
  set_names(c("min", "lower_hinge", "median", "upper_hinge", "max")) %>%
  as.list() %>%
  as_tibble()


# Here we do a nested map.  The inner map (`map(fivenum2df)`) walks over the supplied
# list of columns and computes the fivenum2df function on them, creating a list of
# data frames.
#
# The outer map walks all of the different source data frames (one per Species)
# and feeds that data frame to the anonymous function in the outer map.  That
# anonymous function looks at each data frame, keeps only the numeric columns,
# feeds it to the inner map which computes a list of fivenum data frames,
# and then recombines it into a single data frame with `bind_rows()`, with an
# added column called 'col_nm'.
#
# Finally, each of the previous data frames (one per Species) is assembled into
# a single data frame, with a column called 'species_nm' to record which species
# it came from.
dat_fivenum <- map(my_list, function(species_df) {
  # Restrict to the numeric columns, summarise each one with `fivenum2df`,
  # then stack the one-row summaries; `col_nm` records the source column.
  numeric_cols <- keep(species_df, is.numeric)
  column_summaries <- map(numeric_cols, fivenum2df)
  bind_rows(column_summaries, .id = "col_nm")
}) %>%
  # Stack the per-species tables; `species_nm` records the list element name.
  bind_rows(.id = "species_nm")

dat_fivenum
#> # A tibble: 12 x 7
#>    species_nm col_nm         min lower_hinge median upper_hinge   max
#>    <chr>      <chr>        <dbl>       <dbl>  <dbl>       <dbl> <dbl>
#>  1 setosa     Sepal.Length 4.30        4.80   5.00        5.20  5.80
#>  2 setosa     Sepal.Width  2.30        3.20   3.40        3.70  4.40
#>  3 setosa     Petal.Length 1.00        1.40   1.50        1.60  1.90
#>  4 setosa     Petal.Width  0.100       0.200  0.200       0.300 0.600
#>  5 versicolor Sepal.Length 4.90        5.60   5.90        6.30  7.00
#>  6 versicolor Sepal.Width  2.00        2.50   2.80        3.00  3.40
#>  7 versicolor Petal.Length 3.00        4.00   4.35        4.60  5.10
#>  8 versicolor Petal.Width  1.00        1.20   1.30        1.50  1.80
#>  9 virginica  Sepal.Length 4.90        6.20   6.50        6.90  7.90
#> 10 virginica  Sepal.Width  2.20        2.80   3.00        3.20  3.80
#> 11 virginica  Petal.Length 4.50        5.10   5.55        5.90  6.90
#> 12 virginica  Petal.Width  1.40        1.80   2.00        2.30  2.50
	# Ref: https://jennybc.github.io/purrr-tutorial/ls02_map-extraction-advanced.html#list_inside_a_data_frame
	# Ref: https://github.com/tidyverse/tidyr/issues/22

	suppressPackageStartupMessages(library(dplyr))
	suppressPackageStartupMessages(library(purrr))

	# Break `iris` apart into a named list holding one data frame per level of
	# `Species`, then show the structure of the resulting list.
	my_list <- split(x = iris, f = iris[["Species"]])
	str(object = my_list)
	#> List of 3
	#> $ setosa :'data.frame': 50 obs. of 5 variables:
	#> ..$ Sepal.Length: num [1:50] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
	#> ..$ Sepal.Width : num [1:50] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
	#> ..$ Petal.Length: num [1:50] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
	#> ..$ Petal.Width : num [1:50] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
	#> ..$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
	#> $ versicolor:'data.frame': 50 obs. of 5 variables:
	#> ..$ Sepal.Length: num [1:50] 7 6.4 6.9 5.5 6.5 5.7 6.3 4.9 6.6 5.2 ...
	#> ..$ Sepal.Width : num [1:50] 3.2 3.2 3.1 2.3 2.8 2.8 3.3 2.4 2.9 2.7 ...
	#> ..$ Petal.Length: num [1:50] 4.7 4.5 4.9 4 4.6 4.5 4.7 3.3 4.6 3.9 ...
	#> ..$ Petal.Width : num [1:50] 1.4 1.5 1.5 1.3 1.5 1.3 1.6 1 1.3 1.4 ...
	#> ..$ Species : Factor w/ 3 levels "setosa","versicolor",..: 2 2 2 2 2 2 2 2 2 2 ...
	#> $ virginica :'data.frame': 50 obs. of 5 variables:
	#> ..$ Sepal.Length: num [1:50] 6.3 5.8 7.1 6.3 6.5 7.6 4.9 7.3 6.7 7.2 ...
	#> ..$ Sepal.Width : num [1:50] 3.3 2.7 3 2.9 3 3 2.5 2.9 2.5 3.6 ...
	#> ..$ Petal.Length: num [1:50] 6 5.1 5.9 5.6 5.8 6.6 4.5 6.3 5.8 6.1 ...
	#> ..$ Petal.Width : num [1:50] 2.5 1.9 2.1 1.8 2.2 2.1 1.7 1.8 1.8 2.5 ...
	#> ..$ Species : Factor w/ 3 levels "setosa","versicolor",..: 3 3 3 3 3 3 3 3 3 3 ...



	# I'd like to compute a `fivenum()` on each of the numeric columns.
	# First, `fivenum()` gives a vector of 5 numbers as output, but
	# doesn't label them. The documentation says what they are, but I'd
	# like them in the output as well. Here we make a unary function
	# that will compute the fivenum function, but the output structure is
	# a data frame with useful column names.
	# Unary function (magrittr functional sequence): takes a numeric vector and
	# returns its `fivenum()` summary as a one-row tibble with labelled columns.
	# The names follow the order of the values returned by `fivenum()`:
	# minimum, lower hinge, median, upper hinge, maximum.
	# `as_tibble()` is used instead of `as_data_frame()`, which tibble has
	# deprecated; dplyr re-exports `as_tibble()`, so no extra library is needed.
	fivenum2df <- . %>%
	  fivenum() %>%
	  set_names(c("min", "lower_hinge", "median", "upper_hinge", "max")) %>%
	  as.list() %>%
	  as_tibble()



	# Here we do a nested map. The inner map (`map(fivenum2df)`) walks over the supplied
	# list of columns and computes the fivenum2df function on them, creating a list of
	# data frames.
	#
	# The outer map walks all of the different source data frames (one per Species)
	# and feeds that data frame to the anonymous function in the outer map. That
	# anonymous function looks at each data frame, keeps only the numeric columns,
	# feeds it to the inner map which computes a list of fivenum data frames,
	# and then recombines it into a single data frame with `bind_rows()`, with an
	# added column called 'col_nm'.
	#
	# Finally, each of the previous data frames (one per Species) is assembled into
	# a single data frame, with a column called 'species_nm' to record which species
	# it came from.
	dat_fivenum <- map(my_list, function(species_df) {
	  # Restrict to the numeric columns, summarise each one with `fivenum2df`,
	  # then stack the one-row summaries; `col_nm` records the source column.
	  numeric_cols <- keep(species_df, is.numeric)
	  column_summaries <- map(numeric_cols, fivenum2df)
	  bind_rows(column_summaries, .id = "col_nm")
	}) %>%
	  # Stack the per-species tables; `species_nm` records the list element name.
	  bind_rows(.id = "species_nm")

	dat_fivenum
	#> # A tibble: 12 x 7
	#> species_nm col_nm min lower_hinge median upper_hinge max
	#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
	#> 1 setosa Sepal.Length 4.30 4.80 5.00 5.20 5.80
	#> 2 setosa Sepal.Width 2.30 3.20 3.40 3.70 4.40
	#> 3 setosa Petal.Length 1.00 1.40 1.50 1.60 1.90
	#> 4 setosa Petal.Width 0.100 0.200 0.200 0.300 0.600
	#> 5 versicolor Sepal.Length 4.90 5.60 5.90 6.30 7.00
	#> 6 versicolor Sepal.Width 2.00 2.50 2.80 3.00 3.40
	#> 7 versicolor Petal.Length 3.00 4.00 4.35 4.60 5.10
	#> 8 versicolor Petal.Width 1.00 1.20 1.30 1.50 1.80
	#> 9 virginica Sepal.Length 4.90 6.20 6.50 6.90 7.90
	#> 10 virginica Sepal.Width 2.20 2.80 3.00 3.20 3.80
	#> 11 virginica Petal.Length 4.50 5.10 5.55 5.90 6.90
	#> 12 virginica Petal.Width 1.40 1.80 2.00 2.30 2.50