gregorgorjanc/runif_from_nonunif.R

## runif_from_nonunif.R
#' @rdname runif_from_nonunif
#' @title Sub-sampling a uniform distribution from a non-uniform distribution
#'
#' @description Sub-sampling a uniform distribution from a non-uniform distribution
#'   by binning the input and sampling the input with weights inverse proportional to
#'   bin sizes.
#'
#' @param x data.frame, with \code{id} and \code{value} columns whose rows will
#'   be sub-sampled
#' @param n integer, number of samples
#' @param n_bins integer, number of bins
#'
#' @return Sub-sampled data.frame \code{x} with \code{id} and \code{value} columns
#'
#' @details
#'
#' @examples
#' # Exponential distribution - this will be hard to sample "uniformly"
#' n <- 10000
#' exp_samples <- data.frame(id = 1:n,
#'                           value = rexp(n = n))
#' unif_samples <- runif_from_nonunif(x = exp_samples, n = n / 10)
#' par(mfrow = c(2, 1))
#' tmp <- hist(exp_samples$value)
#' hist(unif_samples$value, breaks = tmp$breaks)
#'
#' # Beta(1, 2) - this should be doable
#' beta_samples <- data.frame(id = 1:n,
#'                            value = rbeta(n = n, shape1 = 1, shape2 = 2))
#' unif_samples <- runif_from_nonunif(x = beta_samples, n = n / 10)
#' par(mfrow = c(2, 1))
#' tmp <- hist(beta_samples$value)
#' hist(unif_samples$value, breaks = tmp$breaks)
#'
#' @export
runif_from_nonunif <- function(x, n, n_bins = 100) {
  samples_min <- min(x$value)
  samples_max <- max(x$value)
  bin_size <- (samples_max - samples_min) / n_bins
  bin_seq <- seq(from = samples_min, to = samples_max, by = bin_size)
  x$bin <- cut(x = x$value, breaks = bin_seq)
  bin_freq <- as.data.frame(table(x$bin))
  colnames(bin_freq) <- c("bin", "freq")
  x <- merge(x = x, y = bin_freq)
  # Sample without replacement and up-weight low frequency values so that
  # once we sample these out, we can then move to more common & high frequency
  # values
  # TODO: maybe this should be done differently/better by sampling bins at
  # random and then randomly within a bin?
  sel <- sample.int(n = nrow(x), size = n, prob = 1 / x$freq, replace = FALSE)
  return(x[sel, c("id", "value")])
}
	#' @rdname runif_from_nonunif
	#' @title Sub-sampling a uniform distribution from a non-uniform distribution
	#'
	#' @description Sub-sampling a uniform distribution from a non-uniform distribution
	#' by binning the input and sampling the input with weights inverse proportional to
	#' bin sizes.
	#'
	#' @param x data.frame, with \code{id} and \code{value} columns whose rows will
	#' be sub-sampled
	#' @param n integer, number of samples
	#' @param n_bins integer, number of bins
	#'
	#' @return Sub-sampled data.frame \code{x} with \code{id} and \code{value} columns
	#'
	#' @details
	#'
	#' @examples
	#' # Exponential distribution - this will be hard to sample "uniformly"
	#' n <- 10000
	#' exp_samples <- data.frame(id = 1:n,
	#' value = rexp(n = n))
	#' unif_samples <- runif_from_nonunif(x = exp_samples, n = n / 10)
	#' par(mfrow = c(2, 1))
	#' tmp <- hist(exp_samples$value)
	#' hist(unif_samples$value, breaks = tmp$breaks)
	#'
	#' # Beta(1, 2) - this should be doable
	#' beta_samples <- data.frame(id = 1:n,
	#' value = rbeta(n = n, shape1 = 1, shape2 = 2))
	#' unif_samples <- runif_from_nonunif(x = beta_samples, n = n / 10)
	#' par(mfrow = c(2, 1))
	#' tmp <- hist(beta_samples$value)
	#' hist(unif_samples$value, breaks = tmp$breaks)
	#'
	#' @export
	runif_from_nonunif <- function(x, n, n_bins = 100) {
	samples_min <- min(x$value)
	samples_max <- max(x$value)
	bin_size <- (samples_max - samples_min) / n_bins
	bin_seq <- seq(from = samples_min, to = samples_max, by = bin_size)
	x$bin <- cut(x = x$value, breaks = bin_seq)
	bin_freq <- as.data.frame(table(x$bin))
	colnames(bin_freq) <- c("bin", "freq")
	x <- merge(x = x, y = bin_freq)
	# Sample without replacement and up-weight low frequency values so that
	# once we sample these out, we can then move to more common & high frequency
	# values
	# TODO: maybe this should be done differently/better by sampling bins at
	# random and then randomly within a bin?
	sel <- sample.int(n = nrow(x), size = n, prob = 1 / x$freq, replace = FALSE)
	return(x[sel, c("id", "value")])
	}