Skip to content

Instantly share code, notes, and snippets.

@mrdwab
Created July 16, 2020 06:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mrdwab/f78f96ea337fbc3053a462c17963148a to your computer and use it in GitHub Desktop.
Save mrdwab/f78f96ea337fbc3053a462c17963148a to your computer and use it in GitHub Desktop.
#' Calculate the Mean of Already Grouped Data
#'
#' Calculates the mean of already grouped data given the interval ranges and
#' the frequencies of each group.
#'
#' @param frequencies A vector of frequencies.
#' @param intervals A 2-column `matrix` with the same number of rows as
#' the length of frequencies, with the first column being the lower class
#' boundary, and the second column being the upper class boundary.
#' Alternatively, `intervals` may be a character vector, and you may
#' specify `sep` (and possibly, `trim` function automatically create the
#' required `matrix`.
#' @param sep Optional character that separates lower and uppper class
#' boundaries if `intervals` is entered as a character vector.
#' @param trim Optional leading or trailing characters to trim from the
#' character vector being used for `intervals`. There is an in-built pattern
#' in the grouped functions to trim the breakpoint labels created by [base::cut()].
#' If you are using a `grouped_*` function on the output of `cut` (where, for some
#' reason, you no longer have access to the original data), you can use
#' `trim = "cut"`.
#' @return A single numeric value representing the grouped mean, median, or
#' mode, depending on which function was called.
#'
#' @export grouped_mean
grouped_mean <- function(frequencies, intervals, sep = NULL, trim = NULL) {
intervals <- if (is.character(intervals)) .grp_intervals(intervals, sep, trim) else intervals
sum(rowMeans(intervals) * frequencies) / sum(frequencies)
}
#' @export grouped_mode
grouped_mode <- function(frequencies, intervals, sep = NULL, trim = NULL) {
intervals <- if (is.character(intervals)) .grp_intervals(intervals, sep, trim) else intervals
ind <- which.max(frequencies)
if (length(ind) > 1L) stop("Only for use where there are no ties for highest frequencies across groups.")
lw <- .grp_lw(intervals, ind)
fm0 <- if (ind == 1) 0 else frequencies[(ind-1)]
fm1 <- frequencies[ind]
fm2 <- if (ind == length(frequencies)) 0 else frequencies[(ind+1)]
lw[[1]] + ((fm1 - fm0) / (2*fm1 - fm0 - fm2)) * lw[[2]]
}
grouped_median <- function(frequencies, intervals, sep = NULL, trim = NULL) {
intervals <- if (is.character(intervals)) .grp_intervals(intervals, sep, trim) else intervals
cf <- cumsum(frequencies)
ind <- findInterval(max(cf)/2, cf) + 1
lw <- .grp_lw(intervals, ind)
f <- frequencies[ind]
cf <- cf[(ind - 1)]
n <- sum(frequencies)
lw[[1]] + (n/2 - cf)/f * lw[[2]]
}
.grp_intervals <- function(intervals, sep, trim) {
if (!is.null(sep)) {
if (is.null(trim)) pattern <- ""
else if (trim == "cut") pattern <- "\\[|\\]|\\(|\\)"
else pattern <- trim
matrix(
as.numeric(unlist(strsplit(gsub(pattern, "", intervals), sep), use.names = FALSE)),
ncol = 2, byrow = TRUE)
}
}
.grp_lw <- function(intervals, ind) {
if (ind == 1) {
L <- intervals[ind, 1]
w <- abs(diff(intervals[ind, ]))
} else {
if (intervals[ind, 1] == intervals[(ind-1), 2]) {
L <- intervals[ind, 1]
w <- abs(diff(intervals[ind, ]))
} else {
L <- mean(c(intervals[ind, 1], intervals[(ind-1), 2]))
x <- abs(intervals[ind, 1] - L)
w <- abs((intervals[ind, 2] + x) - L)
}
}
list(L, w)
}
@mrdwab
Copy link
Author

mrdwab commented Jul 16, 2020

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment