Skip to content

Instantly share code, notes, and snippets.

@sboysel
Last active Mar 4, 2016
Embed
What would you like to do?
Grouped Summary Statistics
#' Grouped summary statistics
#'
#' Wraps \code{\link{aggregate}} to generate common summary statistics over
#' subsets.
#'
#' @param data A data.frame to be summarized.
#' @param by Grouping variables specified by a formula as used in
#' \code{\link{aggregate}}.
#' @param stats A character vector of statistical functions. Default is c("n",
#' "mean", "sd", "min", "max"). More functions can be added by modifying the
#' summary_funs function.
#' @return A list of data.frames of length equal to the number of permutations
#' in the grouping variables. Permutations with no observations are silently
#' dropped. Each data.frame contains summary statistics over all variables in
#' data less the grouping variables. NA's are removed silently.
#'
#' @examples
#' sumstats(mtcars, . ~ cyl + vs)
#' sumstats(mtcars, cbind(mpg, disp, hp) ~ cyl + vs)
#'
#' ## Display only a subset of statistics
#' sumstats(mtcars, . ~ cyl + vs, stats = c("n", "mean", "sd"))
#'
#' @seealso \code{\link{aggregate}}
#'
#' @export
sumstats <- function(data, by,
stats = c("n", "mean", "sd", "min", "p25", "p50",
"p75", "max")) {
ss <- lapply(stats, function(x) summary_funs(data, fun = x, by))
names(ss) <- stats
parse_sumstats(ss, by = by)
}
is.formula <- function(x) {
identical(class(x), "formula")
}
n <- function(x) {
length(x[!is.na(x)])
}
summary_funs <- function(data, fun, by) {
if (is.formula(by)) {
switch(fun,
n = aggregate(formula = by, data = data, FUN = n,
na.action = na.pass),
mean = aggregate(formula = by, data = data, FUN = mean,
na.action = na.pass, na.rm = TRUE),
sd = aggregate(formula = by, data = data, FUN = sd,
na.action = na.pass, na.rm = TRUE),
min = aggregate(formula = by, data = data, FUN = min,
na.action = na.pass, na.rm = TRUE),
p25 = aggregate(formula = by, data = data, FUN = quantile,
na.action = na.pass, probs = 0.25, names = FALSE,
na.rm = TRUE),
p50 = aggregate(formula = by, data = data, FUN = quantile,
na.action = na.pass, probs = 0.5, names = FALSE,
na.rm = TRUE),
p75 = aggregate(formula = by, data = data, FUN = quantile,
na.action = na.pass, probs = 0.75, names = FALSE,
na.rm = TRUE),
max = aggregate(formula = by, data = data, FUN = max,
na.action = na.pass, na.rm = TRUE))
} else {
stop("'by' must be a formula.")
}
}
parse_sumstats <- function(ss, by) {
if (is.formula(by)) {
by <- attr(terms.formula(by), "term.labels")
}
stats <- names(ss)
parsed <- lapply(stats, function(x) {
chunk <- data.frame(t(ss[[x]]))
chunk.namerows <- chunk[rownames(chunk) %in% by, ]
chunk.data <- chunk[!rownames(chunk) %in% by, ]
names.temp <- apply(chunk.namerows, 2, function(x) paste(by, "=", x))
if (length(by) > 1) {
names(chunk.data) <- apply(names.temp, 2,
function(x) paste(x, collapse = ","))
} else {
names(chunk.data) <- names.temp
}
chunk.data
})
names(parsed) <- stats
parsed
final <- list()
for (stat in names(parsed)) {
frame <- parsed[[stat]]
for (group in names(frame)) {
col <- parsed[[stat]][group]
names(col) <- stat
final[[group]][[stat]] <- col
}
}
lapply(final, function(x) do.call(cbind, x))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment