Last active
October 28, 2023 20:21
-
-
Save gregorgorjanc/a1f7bd19bfc021a090f0054ab77e8f37 to your computer and use it in GitHub Desktop.
Sampling a uniform distribution from a non-uniform distribution
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' @rdname runif_from_nonunif | |
#' @title Sub-sampling a uniform distribution from a non-uniform distribution | |
#' | |
#' @description Sub-sampling a uniform distribution from a non-uniform distribution | |
#' by binning the input and sampling the input with weights inverse proportional to | |
#' bin sizes. | |
#' | |
#' @param x data.frame, with \code{id} and \code{value} columns whose rows will | |
#' be sub-sampled | |
#' @param n integer, number of samples | |
#' @param n_bins integer, number of bins | |
#' | |
#' @return Sub-sampled data.frame \code{x} with \code{id} and \code{value} columns | |
#' | |
#' @details | |
#' | |
#' @examples | |
#' # Exponential distribution - this will be hard to sample "uniformly" | |
#' n <- 10000 | |
#' exp_samples <- data.frame(id = 1:n, | |
#' value = rexp(n = n)) | |
#' unif_samples <- runif_from_nonunif(x = exp_samples, n = n / 10) | |
#' par(mfrow = c(2, 1)) | |
#' tmp <- hist(exp_samples$value) | |
#' hist(unif_samples$value, breaks = tmp$breaks) | |
#' | |
#' # Beta(1, 2) - this should be doable | |
#' beta_samples <- data.frame(id = 1:n, | |
#' value = rbeta(n = n, shape1 = 1, shape2 = 2)) | |
#' unif_samples <- runif_from_nonunif(x = beta_samples, n = n / 10) | |
#' par(mfrow = c(2, 1)) | |
#' tmp <- hist(beta_samples$value) | |
#' hist(unif_samples$value, breaks = tmp$breaks) | |
#' | |
#' @export | |
runif_from_nonunif <- function(x, n, n_bins = 100) { | |
samples_min <- min(x$value) | |
samples_max <- max(x$value) | |
bin_size <- (samples_max - samples_min) / n_bins | |
bin_seq <- seq(from = samples_min, to = samples_max, by = bin_size) | |
x$bin <- cut(x = x$value, breaks = bin_seq) | |
bin_freq <- as.data.frame(table(x$bin)) | |
colnames(bin_freq) <- c("bin", "freq") | |
x <- merge(x = x, y = bin_freq) | |
# Sample without replacement and up-weight low frequency values so that | |
# once we sample these out, we can then move to more common & high frequency | |
# values | |
# TODO: maybe this should be done differently/better by sampling bins at | |
# random and then randomly within a bin? | |
sel <- sample.int(n = nrow(x), size = n, prob = 1 / x$freq, replace = FALSE) | |
return(x[sel, c("id", "value")]) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment