mrdwab/grouped_functions.R

## grouped_functions.R
#' Calculate the Mean of Already Grouped Data
#'
#' Calculates the mean of already grouped data given the interval ranges and
#' the frequencies of each group.
#'
#' @param frequencies A vector of frequencies.
#' @param intervals A 2-column `matrix` with the same number of rows as
#' the length of frequencies, with the first column being the lower class
#' boundary, and the second column being the upper class boundary.
#' Alternatively, `intervals` may be a character vector, and you may
#' specify `sep` (and possibly, `trim` function automatically create the
#' required `matrix`.
#' @param sep Optional character that separates lower and uppper class
#' boundaries if `intervals` is entered as a character vector.
#' @param trim Optional leading or trailing characters to trim from the
#' character vector being used for `intervals`. There is an in-built pattern
#' in the grouped functions to trim the breakpoint labels created by [base::cut()].
#' If you are using a `grouped_*` function on the output of `cut` (where, for some
#' reason, you no longer have access to the original data), you can use
#' `trim = "cut"`.
#' @return A single numeric value representing the grouped mean, median, or
#' mode, depending on which function was called.
#'
#' @export grouped_mean
grouped_mean <- function(frequencies, intervals, sep = NULL, trim = NULL) {
  intervals <- if (is.character(intervals)) .grp_intervals(intervals, sep, trim) else intervals
  sum(rowMeans(intervals) * frequencies) / sum(frequencies)
}

#' @export grouped_mode
grouped_mode <- function(frequencies, intervals, sep = NULL, trim = NULL) {
  intervals <- if (is.character(intervals)) .grp_intervals(intervals, sep, trim) else intervals

  ind <- which.max(frequencies)
  if (length(ind) > 1L) stop("Only for use where there are no ties for highest frequencies across groups.")

  lw <- .grp_lw(intervals, ind)

  fm0 <- if (ind == 1) 0 else frequencies[(ind-1)]
  fm1 <- frequencies[ind]
  fm2 <- if (ind == length(frequencies)) 0 else frequencies[(ind+1)]

  lw[[1]] + ((fm1 - fm0) / (2*fm1 - fm0 - fm2)) * lw[[2]]
}

grouped_median <- function(frequencies, intervals, sep = NULL, trim = NULL) {
  intervals <- if (is.character(intervals)) .grp_intervals(intervals, sep, trim) else intervals

  cf <- cumsum(frequencies)
  ind <- findInterval(max(cf)/2, cf) + 1

  lw <- .grp_lw(intervals, ind)

  f <- frequencies[ind]
  cf <- cf[(ind - 1)]
  n <- sum(frequencies)

  lw[[1]] + (n/2 - cf)/f * lw[[2]]
}

.grp_intervals <- function(intervals, sep, trim) {
  if (!is.null(sep)) {
    if (is.null(trim)) pattern <- ""
    else if (trim == "cut") pattern <- "\\[|\\]|\\(|\\)"
    else pattern <- trim
    matrix(
      as.numeric(unlist(strsplit(gsub(pattern, "", intervals), sep), use.names = FALSE)),
      ncol = 2, byrow = TRUE)
  }
}

.grp_lw <- function(intervals, ind) {
  if (ind == 1) {
    L <- intervals[ind, 1]
    w <- abs(diff(intervals[ind, ]))
  } else {
    if (intervals[ind, 1] == intervals[(ind-1), 2]) {
      L <- intervals[ind, 1]
      w <- abs(diff(intervals[ind, ]))
    } else {
      L <- mean(c(intervals[ind, 1], intervals[(ind-1), 2]))
      x <- abs(intervals[ind, 1] - L)
      w <- abs((intervals[ind, 2] + x) - L)
    }
  }
  list(L, w)
}
	#' Calculate the Mean of Already Grouped Data
	#'
	#' Calculates the mean of already grouped data given the interval ranges and
	#' the frequencies of each group.
	#'
	#' @param frequencies A vector of frequencies.
	#' @param intervals A 2-column `matrix` with the same number of rows as
	#' the length of frequencies, with the first column being the lower class
	#' boundary, and the second column being the upper class boundary.
	#' Alternatively, `intervals` may be a character vector, and you may
	#' specify `sep` (and possibly, `trim` function automatically create the
	#' required `matrix`.
	#' @param sep Optional character that separates lower and uppper class
	#' boundaries if `intervals` is entered as a character vector.
	#' @param trim Optional leading or trailing characters to trim from the
	#' character vector being used for `intervals`. There is an in-built pattern
	#' in the grouped functions to trim the breakpoint labels created by [base::cut()].
	#' If you are using a `grouped_*` function on the output of `cut` (where, for some
	#' reason, you no longer have access to the original data), you can use
	#' `trim = "cut"`.
	#' @return A single numeric value representing the grouped mean, median, or
	#' mode, depending on which function was called.
	#'
	#' @export grouped_mean
	grouped_mean <- function(frequencies, intervals, sep = NULL, trim = NULL) {
	intervals <- if (is.character(intervals)) .grp_intervals(intervals, sep, trim) else intervals
	sum(rowMeans(intervals) * frequencies) / sum(frequencies)
	}

	#' @export grouped_mode
	grouped_mode <- function(frequencies, intervals, sep = NULL, trim = NULL) {
	intervals <- if (is.character(intervals)) .grp_intervals(intervals, sep, trim) else intervals

	ind <- which.max(frequencies)
	if (length(ind) > 1L) stop("Only for use where there are no ties for highest frequencies across groups.")

	lw <- .grp_lw(intervals, ind)

	fm0 <- if (ind == 1) 0 else frequencies[(ind-1)]
	fm1 <- frequencies[ind]
	fm2 <- if (ind == length(frequencies)) 0 else frequencies[(ind+1)]

	lw[[1]] + ((fm1 - fm0) / (2fm1 - fm0 - fm2)) lw[[2]]
	}

	grouped_median <- function(frequencies, intervals, sep = NULL, trim = NULL) {
	intervals <- if (is.character(intervals)) .grp_intervals(intervals, sep, trim) else intervals

	cf <- cumsum(frequencies)
	ind <- findInterval(max(cf)/2, cf) + 1

	lw <- .grp_lw(intervals, ind)

	f <- frequencies[ind]
	cf <- cf[(ind - 1)]
	n <- sum(frequencies)

	lw[[1]] + (n/2 - cf)/f * lw[[2]]
	}

	.grp_intervals <- function(intervals, sep, trim) {
	if (!is.null(sep)) {
	if (is.null(trim)) pattern <- ""
	else if (trim == "cut") pattern <- "\\[\|\\]\|\\(\|\\)"
	else pattern <- trim
	matrix(
	as.numeric(unlist(strsplit(gsub(pattern, "", intervals), sep), use.names = FALSE)),
	ncol = 2, byrow = TRUE)
	}
	}

	.grp_lw <- function(intervals, ind) {
	if (ind == 1) {
	L <- intervals[ind, 1]
	w <- abs(diff(intervals[ind, ]))
	} else {
	if (intervals[ind, 1] == intervals[(ind-1), 2]) {
	L <- intervals[ind, 1]
	w <- abs(diff(intervals[ind, ]))
	} else {
	L <- mean(c(intervals[ind, 1], intervals[(ind-1), 2]))
	x <- abs(intervals[ind, 1] - L)
	w <- abs((intervals[ind, 2] + x) - L)
	}
	}
	list(L, w)
	}