Skip to content

Instantly share code, notes, and snippets.

@tluquez
Last active May 11, 2024 14:38
Show Gist options
  • Save tluquez/a8b1bf03b686a51693bf8db9dd4a8401 to your computer and use it in GitHub Desktop.
Save tluquez/a8b1bf03b686a51693bf8db9dd4a8401 to your computer and use it in GitHub Desktop.
get data frame of proportion of clusters per id and supercluster
summarize_by_group <- function(data, group, columns = NULL) {
#' Summarize data by group
#'
#' This function aggregates data by group and summarizes specified columns.
#'
#' @param data A data frame.
#' @param group A character vector specifying the grouping column(s).
#' @param columns Optional character vector specifying columns to summarize.
#' @return A data frame with summarized data.
#' @examples
#' mixed_df <- data.frame(
#' Group = rep(letters[1:3], each = 3),
#' Numeric_Value = c(1, 2, 3, 4, 5, 6, 7, 8, 9),
#' Factor_Value = factor(c("low", "medium", "high"),
#' levels = c("low", "medium", "high")),
#' Character_Value = c("apple", "banana", "apple", "banana", "apple",
#' "banana", "apple", "banana", "apple"),
#' Logical_Value = c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE,
#' TRUE),
#' Date_Value = as.Date("2022-01-01") + 0:8,
#' Complex_Value = as.complex(1:9)
#' )
#'
#' # Summarize all columns (default) by Group
#' summarize_by_group(mixed_df, "Group")
#'
#' # Summarize column Numeric_Value by Group
#' summarize_by_group(mixed_df, "Group", "Numeric_Value")
#'
#' # Summarize two columns by Group
#' summarize_by_group(mixed_df, "Group", c("Numeric_Value", "Factor_Value"))
#'
#' @export
# Validate input
if (!is.data.frame(data)) {
stop("data must be a dataframe")
}
if (!is.character(group) || length(group) == 0) {
stop("group must be a non-empty character vector")
}
if (!is.null(columns) && (!is.character(columns) || length(columns) == 0)) {
stop("columns must be a non-empty character vector")
}
# Get the columns to summarize
if (is.null(columns)) {
columns <- setdiff(names(data), group)
}
# If no columns to summarize, return unique combinations of group columns
if (length(columns) == 0) {
df <- dplyr::distinct(data, dplyr::across(tidyselect::all_of(group)))
}
# If columns need to be summarized, summarize columns by group
df <- data %>%
dplyr::group_by(dplyr::across(tidyselect::all_of(group))) %>%
dplyr::summarize(
dplyr::across(
dplyr::all_of(columns),
~ {
if (is.numeric(.)) mean(., na.rm = TRUE)
else if (is.factor(.)) levels(.)[1]
else if (is.character(.)) unique(.)[which.max(table(.))]
else if (is.logical(.)) unique(.)[which.max(table(.))]
else if (inherits(., c("Date", "POSIXt"))) as.character(unique(.))[1]
else if (is.list(.)) NA
else if (is.complex(.)) NA
else NA
}
),
.groups = "drop_last"
) %>%
dplyr::ungroup()
return(df)
}
get_props <- function(data, id, cluster, supercluster = NULL,
add_supercluster_prop = NULL, add_other_cols = T) {
#' Compute Proportions
#'
#' This function computes proportions per id across cluster. It can group by
#' supercluster and compute proportions per group. It can also add
#' supercluster proportions across all its levels. Finally, it summarizes
#' other columns of data per id.
#'
#' @param data Data frame containing \code{id} and \code{cluster}.
#' @param id The column name representing the observations identifier.
#' @param cluster The column name representing the cluster identifier.
#' @param supercluster Optional. The column name representing the supercluster
#' identifier.
#' @param add_supercluster_prop Logical indicating whether to
#' include proportions per id across superclusters.
#' @param add_other_cols Logical indicating whether to summarize other columns of \code{data} by \code{id}. Numeric columns are summarized to their mean, factor to the reference level and character to the mode.
#'
#' @return A long data frame with computed proportions per group and summary
#' statistics.
#' - \code{"num_cluster"}: The number of occurrences of each group combination.
#' - \code{"props"}: The proportions of each group combination within its
#' supercluster.
#' - \code{"supercluster"}: The supercluster associated with each group.
#' - \code{"num_supercluster"}: The total number of occurrences of each
#' \code{supercluster}-\code{id} combination.
#'
#' @details This function calculates proportions per group based on the counts of another
#' column, and summarizes other columns by group, calculating means for numeric columns,
#' reference levels for factor columns, and the mode for character columns.
#'
#' @examples
#' df <- data.frame(id = rep(1:3, each = 4),
#' cluster = rep(c("a", "b"), each = 2, times = 3),
#' supercluster = "A",
#' value1 = runif(12),
#' value2 = as.factor(LETTERS[1:12]),
#' value3 = LETTERS[1:12],
#' value4 = NA)
#' get_props(df, "id", "cluster", "supercluster", add_supercluster_prop = TRUE)
#' get_props(df, "id", "cluster", "supercluster")
#' get_props(df, "id", "cluster")
#'
#' @import dplyr
#' @import tidyr
#' @importFrom magrittr %>%
#'
#' @export
#'
if (!is.data.frame(data)) {
stop("Input 'data' must be a data frame.")
}
# Input Validation
required_cols <- c(id, cluster)
if (!is.null(supercluster)) {
required_cols <- c(required_cols, supercluster)
}
invalid_args <- c(
"id" = !is.character(id),
"cluster" = !is.character(cluster),
"supercluster" = !is.null(supercluster) && !is.character(supercluster)
)
invalid_args <- names(invalid_args)[invalid_args]
if (length(invalid_args) > 0) {
stop(paste("Argument(s) ", paste(invalid_args, collapse = ", "),
" must be character vector(s)."))
}
# Proportion Calculation
if (!is.null(supercluster)) {
# Proportions by supercluster x cluster x id
data_groups <- split(data, data[[supercluster]])
props <- lapply(data_groups, function(data_group) {
# Proportion per supercluster by cluster x id
nums_cluster <- table(data_group[[id]], data_group[[cluster]])
prop <- as.data.frame(prop.table(nums_cluster, margin = 1))
nums_cluster <- as.data.frame(nums_cluster)
# Count observations per supercluster x id
nums_supercluster <- data.frame(table(data_group[[id]],
data_group[[supercluster]]))
# Left join and rename
prop <- merge(nums_cluster, prop, by = c("Var1", "Var2"))
prop <- merge(prop, nums_supercluster, by = "Var1")
colnames(prop) <- c(id, cluster, paste0("num_", cluster), "props",
supercluster, paste0("num_", supercluster))
prop
})
props <- do.call(rbind, props)
rownames(props) <- NULL
# Add proportions per supercluster column if requested
if (!is.null(add_supercluster_prop)) {
# Proportion by supercluster x id
nums_supercluster <- table(data[[id]], data[[supercluster]])
prop <- as.data.frame(prop.table(nums_supercluster, margin = 1))
nums_supercluster <- as.data.frame(nums_supercluster)
# Count observations per id across superclusters
nums_id <- data.frame(table(data[[id]]))
nums_id$Var2 <- supercluster
nums_id <- nums_id[, c("Var1", "Var2", "Freq")]
# Left join and rename
prop <- merge(nums_supercluster, prop, by = c("Var1", "Var2"))
prop <- merge(prop, nums_id, by = "Var1")
colnames(prop) <- c(id, cluster, paste0("num_", cluster), "props",
supercluster, paste0("num_", supercluster))
props <- rbind(props, prop)
}
} else {
# Proportion by cluster x id
nums_cluster <- table(data[[id]], data[[cluster]])
prop <- as.data.frame(prop.table(nums_cluster, margin = 1))
nums_cluster <- as.data.frame(nums_cluster)
# Left join and rename
props <- merge(nums_cluster, prop, by = c("Var1", "Var2"))
colnames(props) <- c(id, cluster, paste0("num_", cluster), "props")
props
}
# Ensure column types conform to data
props[[id]] <- as(props[[id]], class(data[[id]]))
props[[cluster]] <- as(props[[cluster]], class(data[[cluster]]))
if (!is.null(supercluster)) {
props[[supercluster]] <- as(props[[supercluster]],
class(data[[supercluster]]))
}
# Summarize other columns by id
other_columns <- setdiff(names(data), required_cols)
if (add_other_cols && length(other_columns) > 0) {
df <- summarize_by_group(data, id, other_columns)
props <- dplyr::left_join(props, df, by = id)
}
return(props)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment