Robert Kubinec June 7, 2018
This Rmarkdown describes the create_array
function that can be used to convert long data frames to arrays. This function is pipe-able and is made to work within tidy
workflows.
Function code is:
create_array <- function(input_matrix,arr_dim=2,row_var=NULL,
col_var_name=NULL,
col_var_value,third_dim_var=NULL) {
if('matrix' %in% class(input_matrix)) {
# if just a matrix, rep it to hit array dims
rep_matrix <- rep(c(input_matrix),arr_dim)
out_array <- array(rep_matrix,dim=c(dim(input_matrix),arr_dim))
} else if('data.frame' %in% class(input_matrix)) {
# assuming data is in long form, select and then spread the bugger
row_var <- enquo(row_var)
col_var_name <- enquo(col_var_name)
col_var_value <- enquo(col_var_value)
third_dim_var <- enquo(third_dim_var)
to_spread <- ungroup(input_matrix) %>% select(!!row_var,!!col_var_name,!!third_dim_var,!!col_var_value)
# figure out how big this array should be
arr_dim <- length(unique(pull(to_spread,!!third_dim_var)))
if(!(nrow(distinct(to_spread))==nrow(to_spread))) stop('Each row in the data must be uniquely identified given row_var, col_var and third_dim_var.')
to_array <- lapply(split(to_spread,pull(to_spread,!!third_dim_var)), function(this_data) {
# spread and stuff into a list
spread_it <- spread(this_data,key=!!col_var_name,value=!!col_var_value) %>%
select(-!!row_var,-!!third_dim_var) %>% as.matrix
row.names(spread_it) <- unique(pull(this_data,!!row_var))
return(spread_it)
})
# convert to a vector before array-ing it
long_vec <- c(do.call(c,to_array))
# BOOM
out_array <- array(long_vec,
dim=c(dim(to_array[[1]]),arr_dim),
dimnames=list(row.names=row.names(to_array[[1]]),
colnames=colnames(to_array[[1]]),
stack=unique(pull(to_spread,!!third_dim_var))))
}
return(out_array)
}
I demonstrate how it works with the iris
data:
data('iris')
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
The main issue with this data is there is no unique row identifier within species. We'll add one, which could indicate where each plant is located.
iris2 <- group_by(iris,Species) %>%
mutate(id=1:n())
head(iris2)
## # A tibble: 6 x 6
## # Groups: Species [1]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species id
## <dbl> <dbl> <dbl> <dbl> <fct> <int>
## 1 5.10 3.50 1.40 0.200 setosa 1
## 2 4.90 3.00 1.40 0.200 setosa 2
## 3 4.70 3.20 1.30 0.200 setosa 3
## 4 4.60 3.10 1.50 0.200 setosa 4
## 5 5.00 3.60 1.40 0.200 setosa 5
## 6 5.40 3.90 1.70 0.400 setosa 6
We will then select all Sepal
columns to use for the cell values of the array by converting them to long mode while we allow id
and Species
to repeat for each row:
iris_long <- gather(iris2,key=columns,value=array_cell_value,-Species,-id)
head(iris_long)
## # A tibble: 6 x 4
## # Groups: Species [1]
## Species id columns array_cell_value
## <fct> <int> <chr> <dbl>
## 1 setosa 1 Sepal.Length 5.10
## 2 setosa 2 Sepal.Length 4.90
## 3 setosa 3 Sepal.Length 4.70
## 4 setosa 4 Sepal.Length 4.60
## 5 setosa 5 Sepal.Length 5.00
## 6 setosa 6 Sepal.Length 5.40
We now have 4 columns, which is what we need to convert to an array. The first dimension of the array will be id
, the second columns
, the third Species
, while array_cell_value
will populate the values of the array. Once we have these four columns we can just plug and play into the create_array
function:
iris_array <- create_array(input_matrix = iris_long,row_var = id,
col_var_name = columns,
col_var_value = array_cell_value,
third_dim_var = Species)
We can test that these are the same by selecting a column from the array and from the long data and testing for equality:
# test that one Species-column combination is the same for all locations
test_col <- filter(iris_long,Species=='setosa',columns=='Sepal.Length') %>% pull(array_cell_value)
all(test_col==iris_array[,'Sepal.Length','setosa'])
## [1] TRUE
# test that one location id is the same
test_col <- filter(iris_long,id==1) %>% arrange(Species,columns) %>% pull(array_cell_value)
all(test_col==c(iris_array[1,,]))
## [1] TRUE
We can see that the array has the correct dimensions and also dimension names:
str(iris_array)
## num [1:50, 1:4, 1:3] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## - attr(*, "dimnames")=List of 3
## ..$ row.names: chr [1:50] "1" "2" "3" "4" ...
## ..$ colnames : chr [1:4] "Petal.Length" "Petal.Width" "Sepal.Length" "Sepal.Width"
## ..$ stack : chr [1:3] "setosa" "versicolor" "virginica"
We can then combine all these steps using pipes:
iris %>% group_by(Species) %>%
mutate(id=1:n()) %>%
gather(key=columns,value=array_cell_value,-Species,-id) %>%
create_array(row_var = id,
col_var_name = columns,
col_var_value = array_cell_value,
third_dim_var = Species) %>%
str
## num [1:50, 1:4, 1:3] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## - attr(*, "dimnames")=List of 3
## ..$ row.names: chr [1:50] "1" "2" "3" "4" ...
## ..$ colnames : chr [1:4] "Petal.Length" "Petal.Width" "Sepal.Length" "Sepal.Width"
## ..$ stack : chr [1:3] "setosa" "versicolor" "virginica"