Skip to content

Instantly share code, notes, and snippets.

@gregorgorjanc
Last active October 28, 2023 20:21
Show Gist options
  • Save gregorgorjanc/a1f7bd19bfc021a090f0054ab77e8f37 to your computer and use it in GitHub Desktop.
Save gregorgorjanc/a1f7bd19bfc021a090f0054ab77e8f37 to your computer and use it in GitHub Desktop.
Sampling a uniform distribution from a non-uniform distribution
#' @rdname runif_from_nonunif
#' @title Sub-sampling a uniform distribution from a non-uniform distribution
#'
#' @description Sub-sampling a uniform distribution from a non-uniform distribution
#' by binning the input and sampling the input with weights inverse proportional to
#' bin sizes.
#'
#' @param x data.frame, with \code{id} and \code{value} columns whose rows will
#' be sub-sampled
#' @param n integer, number of samples
#' @param n_bins integer, number of bins
#'
#' @return Sub-sampled data.frame \code{x} with \code{id} and \code{value} columns
#'
#' @details
#'
#' @examples
#' # Exponential distribution - this will be hard to sample "uniformly"
#' n <- 10000
#' exp_samples <- data.frame(id = 1:n,
#' value = rexp(n = n))
#' unif_samples <- runif_from_nonunif(x = exp_samples, n = n / 10)
#' par(mfrow = c(2, 1))
#' tmp <- hist(exp_samples$value)
#' hist(unif_samples$value, breaks = tmp$breaks)
#'
#' # Beta(1, 2) - this should be doable
#' beta_samples <- data.frame(id = 1:n,
#' value = rbeta(n = n, shape1 = 1, shape2 = 2))
#' unif_samples <- runif_from_nonunif(x = beta_samples, n = n / 10)
#' par(mfrow = c(2, 1))
#' tmp <- hist(beta_samples$value)
#' hist(unif_samples$value, breaks = tmp$breaks)
#'
#' @export
runif_from_nonunif <- function(x, n, n_bins = 100) {
samples_min <- min(x$value)
samples_max <- max(x$value)
bin_size <- (samples_max - samples_min) / n_bins
bin_seq <- seq(from = samples_min, to = samples_max, by = bin_size)
x$bin <- cut(x = x$value, breaks = bin_seq)
bin_freq <- as.data.frame(table(x$bin))
colnames(bin_freq) <- c("bin", "freq")
x <- merge(x = x, y = bin_freq)
# Sample without replacement and up-weight low frequency values so that
# once we sample these out, we can then move to more common & high frequency
# values
# TODO: maybe this should be done differently/better by sampling bins at
# random and then randomly within a bin?
sel <- sample.int(n = nrow(x), size = n, prob = 1 / x$freq, replace = FALSE)
return(x[sel, c("id", "value")])
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment