Last active
May 11, 2024 14:38
-
-
Save tluquez/a8b1bf03b686a51693bf8db9dd4a8401 to your computer and use it in GitHub Desktop.
get data frame of proportion of clusters per id and supercluster
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
summarize_by_group <- function(data, group, columns = NULL) { | |
#' Summarize data by group | |
#' | |
#' This function aggregates data by group and summarizes specified columns. | |
#' | |
#' @param data A data frame. | |
#' @param group A character vector specifying the grouping column(s). | |
#' @param columns Optional character vector specifying columns to summarize. | |
#' @return A data frame with summarized data. | |
#' @examples | |
#' mixed_df <- data.frame( | |
#' Group = rep(letters[1:3], each = 3), | |
#' Numeric_Value = c(1, 2, 3, 4, 5, 6, 7, 8, 9), | |
#' Factor_Value = factor(c("low", "medium", "high"), | |
#' levels = c("low", "medium", "high")), | |
#' Character_Value = c("apple", "banana", "apple", "banana", "apple", | |
#' "banana", "apple", "banana", "apple"), | |
#' Logical_Value = c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, | |
#' TRUE), | |
#' Date_Value = as.Date("2022-01-01") + 0:8, | |
#' Complex_Value = as.complex(1:9) | |
#' ) | |
#' | |
#' # Summarize all columns (default) by Group | |
#' summarize_by_group(mixed_df, "Group") | |
#' | |
#' # Summarize column Numeric_Value by Group | |
#' summarize_by_group(mixed_df, "Group", "Numeric_Value") | |
#' | |
#' # Summarize two columns by Group | |
#' summarize_by_group(mixed_df, "Group", c("Numeric_Value", "Factor_Value")) | |
#' | |
#' @export | |
# Validate input | |
if (!is.data.frame(data)) { | |
stop("data must be a dataframe") | |
} | |
if (!is.character(group) || length(group) == 0) { | |
stop("group must be a non-empty character vector") | |
} | |
if (!is.null(columns) && (!is.character(columns) || length(columns) == 0)) { | |
stop("columns must be a non-empty character vector") | |
} | |
# Get the columns to summarize | |
if (is.null(columns)) { | |
columns <- setdiff(names(data), group) | |
} | |
# If no columns to summarize, return unique combinations of group columns | |
if (length(columns) == 0) { | |
df <- dplyr::distinct(data, dplyr::across(tidyselect::all_of(group))) | |
} | |
# If columns need to be summarized, summarize columns by group | |
df <- data %>% | |
dplyr::group_by(dplyr::across(tidyselect::all_of(group))) %>% | |
dplyr::summarize( | |
dplyr::across( | |
dplyr::all_of(columns), | |
~ { | |
if (is.numeric(.)) mean(., na.rm = TRUE) | |
else if (is.factor(.)) levels(.)[1] | |
else if (is.character(.)) unique(.)[which.max(table(.))] | |
else if (is.logical(.)) unique(.)[which.max(table(.))] | |
else if (inherits(., c("Date", "POSIXt"))) as.character(unique(.))[1] | |
else if (is.list(.)) NA | |
else if (is.complex(.)) NA | |
else NA | |
} | |
), | |
.groups = "drop_last" | |
) %>% | |
dplyr::ungroup() | |
return(df) | |
} | |
get_props <- function(data, id, cluster, supercluster = NULL, | |
add_supercluster_prop = NULL, add_other_cols = T) { | |
#' Compute Proportions | |
#' | |
#' This function computes proportions per id across cluster. It can group by | |
#' supercluster and compute proportions per group. It can also add | |
#' supercluster proportions across all its levels. Finally, it summarizes | |
#' other columns of data per id. | |
#' | |
#' @param data Data frame containing \code{id} and \code{cluster}. | |
#' @param id The column name representing the observations identifier. | |
#' @param cluster The column name representing the cluster identifier. | |
#' @param supercluster Optional. The column name representing the supercluster | |
#' identifier. | |
#' @param add_supercluster_prop Logical indicating whether to | |
#' include proportions per id across superclusters. | |
#' @param add_other_cols Logical indicating whether to summarize other columns of \code{data} by \code{id}. Numeric columns are summarized to their mean, factor to the reference level and character to the mode. | |
#' | |
#' @return A long data frame with computed proportions per group and summary | |
#' statistics. | |
#' - \code{"num_cluster"}: The number of occurrences of each group combination. | |
#' - \code{"props"}: The proportions of each group combination within its | |
#' supercluster. | |
#' - \code{"supercluster"}: The supercluster associated with each group. | |
#' - \code{"num_supercluster"}: The total number of occurrences of each | |
#' \code{supercluster}-\code{id} combination. | |
#' | |
#' @details This function calculates proportions per group based on the counts of another | |
#' column, and summarizes other columns by group, calculating means for numeric columns, | |
#' reference levels for factor columns, and the mode for character columns. | |
#' | |
#' @examples | |
#' df <- data.frame(id = rep(1:3, each = 4), | |
#' cluster = rep(c("a", "b"), each = 2, times = 3), | |
#' supercluster = "A", | |
#' value1 = runif(12), | |
#' value2 = as.factor(LETTERS[1:12]), | |
#' value3 = LETTERS[1:12], | |
#' value4 = NA) | |
#' get_props(df, "id", "cluster", "supercluster", add_supercluster_prop = TRUE) | |
#' get_props(df, "id", "cluster", "supercluster") | |
#' get_props(df, "id", "cluster") | |
#' | |
#' @import dplyr | |
#' @import tidyr | |
#' @importFrom magrittr %>% | |
#' | |
#' @export | |
#' | |
if (!is.data.frame(data)) { | |
stop("Input 'data' must be a data frame.") | |
} | |
# Input Validation | |
required_cols <- c(id, cluster) | |
if (!is.null(supercluster)) { | |
required_cols <- c(required_cols, supercluster) | |
} | |
invalid_args <- c( | |
"id" = !is.character(id), | |
"cluster" = !is.character(cluster), | |
"supercluster" = !is.null(supercluster) && !is.character(supercluster) | |
) | |
invalid_args <- names(invalid_args)[invalid_args] | |
if (length(invalid_args) > 0) { | |
stop(paste("Argument(s) ", paste(invalid_args, collapse = ", "), | |
" must be character vector(s).")) | |
} | |
# Proportion Calculation | |
if (!is.null(supercluster)) { | |
# Proportions by supercluster x cluster x id | |
data_groups <- split(data, data[[supercluster]]) | |
props <- lapply(data_groups, function(data_group) { | |
# Proportion per supercluster by cluster x id | |
nums_cluster <- table(data_group[[id]], data_group[[cluster]]) | |
prop <- as.data.frame(prop.table(nums_cluster, margin = 1)) | |
nums_cluster <- as.data.frame(nums_cluster) | |
# Count observations per supercluster x id | |
nums_supercluster <- data.frame(table(data_group[[id]], | |
data_group[[supercluster]])) | |
# Left join and rename | |
prop <- merge(nums_cluster, prop, by = c("Var1", "Var2")) | |
prop <- merge(prop, nums_supercluster, by = "Var1") | |
colnames(prop) <- c(id, cluster, paste0("num_", cluster), "props", | |
supercluster, paste0("num_", supercluster)) | |
prop | |
}) | |
props <- do.call(rbind, props) | |
rownames(props) <- NULL | |
# Add proportions per supercluster column if requested | |
if (!is.null(add_supercluster_prop)) { | |
# Proportion by supercluster x id | |
nums_supercluster <- table(data[[id]], data[[supercluster]]) | |
prop <- as.data.frame(prop.table(nums_supercluster, margin = 1)) | |
nums_supercluster <- as.data.frame(nums_supercluster) | |
# Count observations per id across superclusters | |
nums_id <- data.frame(table(data[[id]])) | |
nums_id$Var2 <- supercluster | |
nums_id <- nums_id[, c("Var1", "Var2", "Freq")] | |
# Left join and rename | |
prop <- merge(nums_supercluster, prop, by = c("Var1", "Var2")) | |
prop <- merge(prop, nums_id, by = "Var1") | |
colnames(prop) <- c(id, cluster, paste0("num_", cluster), "props", | |
supercluster, paste0("num_", supercluster)) | |
props <- rbind(props, prop) | |
} | |
} else { | |
# Proportion by cluster x id | |
nums_cluster <- table(data[[id]], data[[cluster]]) | |
prop <- as.data.frame(prop.table(nums_cluster, margin = 1)) | |
nums_cluster <- as.data.frame(nums_cluster) | |
# Left join and rename | |
props <- merge(nums_cluster, prop, by = c("Var1", "Var2")) | |
colnames(props) <- c(id, cluster, paste0("num_", cluster), "props") | |
props | |
} | |
# Ensure column types conform to data | |
props[[id]] <- as(props[[id]], class(data[[id]])) | |
props[[cluster]] <- as(props[[cluster]], class(data[[cluster]])) | |
if (!is.null(supercluster)) { | |
props[[supercluster]] <- as(props[[supercluster]], | |
class(data[[supercluster]])) | |
} | |
# Summarize other columns by id | |
other_columns <- setdiff(names(data), required_cols) | |
if (add_other_cols && length(other_columns) > 0) { | |
df <- summarize_by_group(data, id, other_columns) | |
props <- dplyr::left_join(props, df, by = id) | |
} | |
return(props) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment