Skip to content

Instantly share code, notes, and snippets.

@erikerhardt
Last active March 28, 2021 22:59
Show Gist options
  • Save erikerhardt/afd8995346e4316d9a6687dfbd07e93e to your computer and use it in GitHub Desktop.
Save erikerhardt/afd8995346e4316d9a6687dfbd07e93e to your computer and use it in GitHub Desktop.
R Key variables completion, Efficient completion of keys from multiple data sources
# key_variables_completion.Rmd at https://statacumen.com/2021/03/28/r-key-variables-completion/
library(tidyverse)
# Combine rows with different data values
f_coalesce_by_column <-
function(
dat
) {
# for matching group_by() keys, keeps first non-NA value for each column
# combine rows in data frame containing NA to make complete row
# https://stackoverflow.com/questions/45515218/combine-rows-in-data-frame-containing-na-to-make-complete-row
## use:
## dat %>%
## group_by(A) %>%
## summarise_all(f_coalesce_by_column)
## TESTING
# dat <-
# # A = 1, easy
# # A = 2, easy
# # A = 3, 3rd row has a dup and NA
# # A = 4, 3rd row has conflucing info
# tribble(
# ~A, ~B, ~C, ~D, ~E
# , 1, NA, 3, NA, 5
# , 1, 2, NA, 2, NA
# , 2, NA, NA, 3, NA
# , 2, 4, 5, NA, 4
# , 3, NA, NA, 3, NA
# , 3, 4, 5, NA, 4
# , 3, 4, NA, NA, 4
# , 4, NA, NA, 3, NA
# , 4, 4, 5, NA, 4
# , 4, 99, 99, 99, NA
# )
dplyr::coalesce(!!! as.list(dat)) %>%
return()
}
# For a set of key variables that are unique for participants,
# identify sets of rows with unique key variables in common
# and collapse them together to have a single "most complete" row
# of key values by removing the NAs.
# Each key variable must have a unique value for each participant.
f_coalesce_column_set <-
function(
dat
) {
names_var <-
names(dat)
for (i_key in seq_along(names_var)) {
## i_key = 1
# rows for current column without NAs
dat_rows_process <-
dat[!is.na(dat[[i_key]]), ]
# bind to end to process later
dat_rows_skip_NA <-
dat[is.na(dat[[i_key]]), ]
# coalesce for this key column
if (nrow(dat_rows_process) > 0) {
dat_rows_process <-
dat_rows_process %>%
group_by_at(
i_key
) %>%
summarise_all(
f_coalesce_by_column
) %>%
ungroup()
}
# combine, assure original column order for iteration
dat <-
bind_rows(
dat_rows_process
, dat_rows_skip_NA
) %>%
select(
one_of(names_var)
)
}
return(dat)
}
# match and replace less complete rows with most complete row
f_replace_keys_less_with_most_complete <-
function(
dat_data
, dat_most_complete
, col_keys = c("a", "b", "c")
) {
# combination to match key variables, from most to least
dat_combination_var <-
expand.grid(
replicate(
length(col_keys)
, 0:1
, simplify = FALSE)
)
dat_combination_var$s <-
rowSums(dat_combination_var)
dat_combination_var <-
dat_combination_var %>%
arrange(
-s
) %>%
select(
-s
)
colnames(dat_combination_var) <- col_keys
# dat_combination_var
# list to fill with matches, then bind at end
dat_data_out <- list()
dat_data_in <- list()
for (i_combination in 1:nrow(dat_combination_var)) {
## i_combination = 1
## i_combination = 2
# keep track of which rows to include and exclude for this combination
ind_all <- matrix(NA, nrow = nrow(dat_data), ncol = length(col_keys))
keys_include <- (dat_combination_var[i_combination, ] == 1)
for (i_col_keys in seq_along(col_keys)) {
## i_col_keys = 1
if (keys_include[i_col_keys]) {
# values for these keys
ind_all[, i_col_keys] <- !is.na(dat_data[[ col_keys[i_col_keys] ]])
}
if (!keys_include[i_col_keys]) {
# NAs for these col_keys
ind_all[, i_col_keys] <- is.na(dat_data[[ col_keys[i_col_keys] ]])
}
}
# identify rows for this combination
ind_combination <- rowSums(ind_all) == length(col_keys)
dat_data_in[[i_combination]] <-
dat_data[ind_combination,]
if(sum(keys_include) == 0) {
dat_data_out[[i_combination]] <-
dat_data_in[[i_combination]]
} else {
dat_data_out[[i_combination]] <-
right_join(
dat_most_complete
, dat_data_in[[i_combination]]
, by = col_keys[which(keys_include)]
, suffix = c("", ".EMPTY")
, keep = FALSE
) %>%
select(
-ends_with(".EMPTY")
)
}
}
# combine all results
dat_data_updated <-
bind_rows(
dat_data_out
)
return(dat_data_updated)
}
#' Complete multiple key ID variables
#'
#' When data are combined from multiple sources and multiple identifier keys (ID)
#' are used, sometimes in combinations, then we may want to provide the
#' most complete set of the IDs for each observation.
#' By definition, each key value is unique within its column to identify an observational unit.
#'
#' @param dat_data data frame with a set of variables that are keys
#' @param dat_keys optional data frame with only key variables, more complete than set in dat_data
#' @param col_keys key columns names
#'
#' @return dat_data with updated key columns
#' @export
#'
#' @examples
#'
#' dat_data <-
#' tribble(
#' ~a, ~b, ~c, ~d1, ~d2
#' , 1, 11, NA, 1, 20
#' , 1, NA, 10, 2, 21
#' , 2, NA, NA, 3, 22
#' , NA, 22, 20, 4, 23
#' , 2, NA, 20, 5, 24
#' , 3, 33, NA, 6, 25
#' , 4, NA, 40, 7, 26
#' , 5, NA, NA, 8, 27
#' , 6, NA, 60, 9, 28
#' , 6, NA, 60, 10, 29
#' , NA, 77, NA, 11, 30
#' , NA, 88, 80, 12, 31
#' , NA, 88, NA, 13, 32
#' , NA, NA, NA, 14, 33
#' )
#'
#' dat_data_updated <-
#' f_complete_multiple_keys(
#' dat_data
#' , dat_keys = NULL
#' , col_keys = c("a", "b", "c")
#' )
#'
#' dat_data_updated %>% print(n=Inf)
#'
f_complete_multiple_keys <-
function(
dat_data
, dat_keys = NULL
, col_keys = c("a", "b", "c")
) {
# create ID to restore original row order at end
dat_data <-
dat_data %>%
mutate(
.ID. = 1:n()
)
# create complete keys
if(is.null(dat_keys)) {
dat_keys <-
f_coalesce_column_set(
dat_data %>%
select(
one_of(col_keys)
)
)
}
# update data with complete keys
dat_data_updated <-
f_replace_keys_less_with_most_complete(
dat_data = dat_data
, dat_most_complete = dat_keys
, col_keys = col_keys
) %>%
# restore original row order
arrange(
.ID.
) %>%
select(
-.ID.
)
return(dat_data_updated)
}
## TESTING
dat_data <-
tribble(
~a, ~b, ~c, ~x, ~d1, ~d2
, 1, 11, NA, NA, 1, 20
, 1, NA, 10, NA, 2, 21
, 2, NA, NA, NA, 3, 22
, NA, 22, 20, NA, 4, 23
, 2, NA, 20, NA, 5, 24
, 3, 33, NA, NA, 6, 25
, 4, NA, 40, NA, 7, 26
, 5, NA, NA, NA, 8, 27
, 6, NA, 60, NA, 9, 28
, 6, NA, 60, NA, 10, 29
, NA, 77, NA, NA, 11, 30
, NA, 88, 80, NA, 12, 31
, NA, 88, NA, NA, 13, 32
, NA, NA, NA, NA, 14, 33
)
dat_data_updated <-
f_complete_multiple_keys(
dat_data
, dat_keys = NULL
, col_keys = c("a", "b", "c", "x")
)
dat_data_updated %>% print(n=Inf)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment