Last active
March 28, 2021 22:59
-
-
Save erikerhardt/afd8995346e4316d9a6687dfbd07e93e to your computer and use it in GitHub Desktop.
R Key variables completion, Efficient completion of keys from multiple data sources
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# key_variables_completion.Rmd at https://statacumen.com/2021/03/28/r-key-variables-completion/ | |
library(tidyverse) | |
# Combine rows with different data values | |
f_coalesce_by_column <- | |
function( | |
dat | |
) { | |
# for matching group_by() keys, keeps first non-NA value for each column | |
# combine rows in data frame containing NA to make complete row | |
# https://stackoverflow.com/questions/45515218/combine-rows-in-data-frame-containing-na-to-make-complete-row | |
## use: | |
## dat %>% | |
## group_by(A) %>% | |
## summarise_all(f_coalesce_by_column) | |
## TESTING | |
# dat <- | |
# # A = 1, easy | |
# # A = 2, easy | |
# # A = 3, 3rd row has a dup and NA | |
# # A = 4, 3rd row has conflucing info | |
# tribble( | |
# ~A, ~B, ~C, ~D, ~E | |
# , 1, NA, 3, NA, 5 | |
# , 1, 2, NA, 2, NA | |
# , 2, NA, NA, 3, NA | |
# , 2, 4, 5, NA, 4 | |
# , 3, NA, NA, 3, NA | |
# , 3, 4, 5, NA, 4 | |
# , 3, 4, NA, NA, 4 | |
# , 4, NA, NA, 3, NA | |
# , 4, 4, 5, NA, 4 | |
# , 4, 99, 99, 99, NA | |
# ) | |
dplyr::coalesce(!!! as.list(dat)) %>% | |
return() | |
} | |
# For a set of key variables that are unique for participants, | |
# identify sets of rows with unique key variables in common | |
# and collapse them together to have a single "most complete" row | |
# of key values by removing the NAs. | |
# Each key variable must have a unique value for each participant. | |
f_coalesce_column_set <- | |
function( | |
dat | |
) { | |
names_var <- | |
names(dat) | |
for (i_key in seq_along(names_var)) { | |
## i_key = 1 | |
# rows for current column without NAs | |
dat_rows_process <- | |
dat[!is.na(dat[[i_key]]), ] | |
# bind to end to process later | |
dat_rows_skip_NA <- | |
dat[is.na(dat[[i_key]]), ] | |
# coalesce for this key column | |
if (nrow(dat_rows_process) > 0) { | |
dat_rows_process <- | |
dat_rows_process %>% | |
group_by_at( | |
i_key | |
) %>% | |
summarise_all( | |
f_coalesce_by_column | |
) %>% | |
ungroup() | |
} | |
# combine, assure original column order for iteration | |
dat <- | |
bind_rows( | |
dat_rows_process | |
, dat_rows_skip_NA | |
) %>% | |
select( | |
one_of(names_var) | |
) | |
} | |
return(dat) | |
} | |
# match and replace less complete rows with most complete row | |
f_replace_keys_less_with_most_complete <- | |
function( | |
dat_data | |
, dat_most_complete | |
, col_keys = c("a", "b", "c") | |
) { | |
# combination to match key variables, from most to least | |
dat_combination_var <- | |
expand.grid( | |
replicate( | |
length(col_keys) | |
, 0:1 | |
, simplify = FALSE) | |
) | |
dat_combination_var$s <- | |
rowSums(dat_combination_var) | |
dat_combination_var <- | |
dat_combination_var %>% | |
arrange( | |
-s | |
) %>% | |
select( | |
-s | |
) | |
colnames(dat_combination_var) <- col_keys | |
# dat_combination_var | |
# list to fill with matches, then bind at end | |
dat_data_out <- list() | |
dat_data_in <- list() | |
for (i_combination in 1:nrow(dat_combination_var)) { | |
## i_combination = 1 | |
## i_combination = 2 | |
# keep track of which rows to include and exclude for this combination | |
ind_all <- matrix(NA, nrow = nrow(dat_data), ncol = length(col_keys)) | |
keys_include <- (dat_combination_var[i_combination, ] == 1) | |
for (i_col_keys in seq_along(col_keys)) { | |
## i_col_keys = 1 | |
if (keys_include[i_col_keys]) { | |
# values for these keys | |
ind_all[, i_col_keys] <- !is.na(dat_data[[ col_keys[i_col_keys] ]]) | |
} | |
if (!keys_include[i_col_keys]) { | |
# NAs for these col_keys | |
ind_all[, i_col_keys] <- is.na(dat_data[[ col_keys[i_col_keys] ]]) | |
} | |
} | |
# identify rows for this combination | |
ind_combination <- rowSums(ind_all) == length(col_keys) | |
dat_data_in[[i_combination]] <- | |
dat_data[ind_combination,] | |
if(sum(keys_include) == 0) { | |
dat_data_out[[i_combination]] <- | |
dat_data_in[[i_combination]] | |
} else { | |
dat_data_out[[i_combination]] <- | |
right_join( | |
dat_most_complete | |
, dat_data_in[[i_combination]] | |
, by = col_keys[which(keys_include)] | |
, suffix = c("", ".EMPTY") | |
, keep = FALSE | |
) %>% | |
select( | |
-ends_with(".EMPTY") | |
) | |
} | |
} | |
# combine all results | |
dat_data_updated <- | |
bind_rows( | |
dat_data_out | |
) | |
return(dat_data_updated) | |
} | |
#' Complete multiple key ID variables | |
#' | |
#' When data are combined from multiple sources and multiple identifier keys (ID) | |
#' are used, sometimes in combinations, then we may want to provide the | |
#' most complete set of the IDs for each observation. | |
#' By definition, each key value is unique within its column to identify an observational unit. | |
#' | |
#' @param dat_data data frame with a set of variables that are keys | |
#' @param dat_keys optional data frame with only key variables, more complete than set in dat_data | |
#' @param col_keys key columns names | |
#' | |
#' @return dat_data with updated key columns | |
#' @export | |
#' | |
#' @examples | |
#' | |
#' dat_data <- | |
#' tribble( | |
#' ~a, ~b, ~c, ~d1, ~d2 | |
#' , 1, 11, NA, 1, 20 | |
#' , 1, NA, 10, 2, 21 | |
#' , 2, NA, NA, 3, 22 | |
#' , NA, 22, 20, 4, 23 | |
#' , 2, NA, 20, 5, 24 | |
#' , 3, 33, NA, 6, 25 | |
#' , 4, NA, 40, 7, 26 | |
#' , 5, NA, NA, 8, 27 | |
#' , 6, NA, 60, 9, 28 | |
#' , 6, NA, 60, 10, 29 | |
#' , NA, 77, NA, 11, 30 | |
#' , NA, 88, 80, 12, 31 | |
#' , NA, 88, NA, 13, 32 | |
#' , NA, NA, NA, 14, 33 | |
#' ) | |
#' | |
#' dat_data_updated <- | |
#' f_complete_multiple_keys( | |
#' dat_data | |
#' , dat_keys = NULL | |
#' , col_keys = c("a", "b", "c") | |
#' ) | |
#' | |
#' dat_data_updated %>% print(n=Inf) | |
#' | |
f_complete_multiple_keys <- | |
function( | |
dat_data | |
, dat_keys = NULL | |
, col_keys = c("a", "b", "c") | |
) { | |
# create ID to restore original row order at end | |
dat_data <- | |
dat_data %>% | |
mutate( | |
.ID. = 1:n() | |
) | |
# create complete keys | |
if(is.null(dat_keys)) { | |
dat_keys <- | |
f_coalesce_column_set( | |
dat_data %>% | |
select( | |
one_of(col_keys) | |
) | |
) | |
} | |
# update data with complete keys | |
dat_data_updated <- | |
f_replace_keys_less_with_most_complete( | |
dat_data = dat_data | |
, dat_most_complete = dat_keys | |
, col_keys = col_keys | |
) %>% | |
# restore original row order | |
arrange( | |
.ID. | |
) %>% | |
select( | |
-.ID. | |
) | |
return(dat_data_updated) | |
} | |
## TESTING | |
dat_data <- | |
tribble( | |
~a, ~b, ~c, ~x, ~d1, ~d2 | |
, 1, 11, NA, NA, 1, 20 | |
, 1, NA, 10, NA, 2, 21 | |
, 2, NA, NA, NA, 3, 22 | |
, NA, 22, 20, NA, 4, 23 | |
, 2, NA, 20, NA, 5, 24 | |
, 3, 33, NA, NA, 6, 25 | |
, 4, NA, 40, NA, 7, 26 | |
, 5, NA, NA, NA, 8, 27 | |
, 6, NA, 60, NA, 9, 28 | |
, 6, NA, 60, NA, 10, 29 | |
, NA, 77, NA, NA, 11, 30 | |
, NA, 88, 80, NA, 12, 31 | |
, NA, 88, NA, NA, 13, 32 | |
, NA, NA, NA, NA, 14, 33 | |
) | |
dat_data_updated <- | |
f_complete_multiple_keys( | |
dat_data | |
, dat_keys = NULL | |
, col_keys = c("a", "b", "c", "x") | |
) | |
dat_data_updated %>% print(n=Inf) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment