Skip to content

Instantly share code, notes, and snippets.

@tslumley
Last active August 31, 2023 10:28
Show Gist options
  • Save tslumley/16bdc33a5ccfc05d0b8e5d690cb397fd to your computer and use it in GitHub Desktop.
Save tslumley/16bdc33a5ccfc05d0b8e5d690cb397fd to your computer and use it in GitHub Desktop.
Modify sample.int with new options for PPS with replacement sampling that have the specified inclusion probabilities
## New sample.int
sample_int<-function(n, size=NULL, replace=FALSE, prob=NULL,
useHash=(n > 1e+07 && !replace && is.null(prob) && (!is.null(size)) && size <= n/2),
method=c("sequential","marginal","poisson")){
if (replace || is.null(prob)){
if (is.null(size)){
size<-n
}
} else{
if (is.null(size))
size<-sum(prob)
}
stopifnot(length(n) == 1L)
if (useHash) {
stopifnot(is.null(prob), !replace)
.Internal(sample2(n, size))
}
else if (!is.null(prob) && !replace){
if (length(prob)!=n) stop("incorrect number of probabilities")
method<-match.arg(method)
switch(method,
sequential=sample.int(n, size=size, replace=replace,prob=prob, useHash=FALSE),
marginal=sample_pps(n, size, prob),
poisson=sample(seq.int(1,n)[runif(n)<=prob*size/sum(prob)]))
} else {
## Will be the .Internal from sample.int
sample.int(n, size=size ,replace=replace, prob=prob, useHash=FALSE)
}
}
sample_pps<-function(n, size, prob){
eps<-1e-5
s<-sum(prob)
sums_to_one <- abs((s-1)/(s+eps))<eps
sums_to_int <- abs((s-round(s))/(s+eps))<eps
if (is.null(size)){
if(!sums_to_int) stop("sum(prob) must be an integer")
size<-round(s)
} else{
size_is_sum = abs((size-sum(prob))/(size+eps))<eps
size_is_int = abs((size-round(size))/(size+eps))<eps
if (!size_is_int) stop("size must be NULL or an integer")
if (sums_to_one && !size_is_sum){
warning("rescaling prob, which changes inclusion probabilities")
prob<-inclusion_probabilities(prob*size, size)
} else if (sums_to_int && !size_is_sum){
warning("sum(prob) is not equal to size or 1, rescaling")
prob<-inclusion_probabilities(prob*size/s,size)
}
rval<-seq_len(n)[sampling::UPtille(prob)!=0] ##sample_brewer(prob)
if (size>1)
sample(rval)
else
rval
}
}
## taken from sampling::inclusionprobabilities by Alina Matei and Yves Tille (GPL>2)
inclusion_probabilities<-function (a, n)
{
if (!is.vector(a))
a = as.vector(a)
nnull = length(a[a == 0])
nneg = length(a[a < 0])
if (nnull > 0)
warning("there are zero values in the initial vector a\n")
if (nneg > 0) {
warning("there are ", nneg, " negative value(s) shifted to zero\n")
a[(a < 0)] = 0
}
if (identical(a, rep(0, length(a))))
pik1 = a
else {
pik1 = n * a/sum(a)
pik = pik1[pik1 > 0]
list1 = pik1 > 0
list = pik >= 1
l = length(list[list == TRUE])
if (l > 0) {
l1 = 0
while (l != l1) {
x = pik[!list]
x = x/sum(x)
pik[!list] = (n - l) * x
pik[list] = 1
l1 = l
list = (pik >= 1)
l = length(list[list == TRUE])
}
pik1[list1] = pik
}
}
pik1
}
% File src/library/base/man/sample.Rd
% Part of the R package, https://www.R-project.org
% Copyright 1995-2023 R Core Team
% Distributed under GPL 2 or later
\name{sample}
\alias{sample}
\alias{sample.int}
\title{Random Samples and Permutations}
\description{
\code{sample} takes a sample of the specified size from the elements
of \code{x} using either with or without replacement.
}
\usage{
sample(x, size, replace = FALSE, prob = NULL)
sample.int(n, size = NULL, replace = FALSE, prob = NULL,
useHash = (n > 1e+07 && !replace && is.null(prob) && size <= n/2))
}
\arguments{
\item{x}{either a vector of one or more elements from which to choose,
or a positive integer. See \sQuote{Details.}}
\item{n}{a positive number, the number of items to choose from. See
\sQuote{Details.}}
\item{size}{a non-negative integer giving the number of items to
choose. The default is \code{n} except for
unequal-probability sampling without replacement, where it is
\code{sum(prob)}}
\item{replace}{should sampling be with replacement?}
\item{prob}{a vector of probability weights for obtaining the elements
of the vector being sampled.}
\item{useHash}{\code{\link{logical}} indicating if the hash-version of
the algorithm should be used. Can only be used for \code{replace =
FALSE}, \code{prob = NULL}, and \code{size <= n/2}, and really
should be used for large \code{n}, as \code{useHash=FALSE} will use
memory proportional to \code{n}.}
\item{method}{
When \code{replace=FALSE} and \code{prob} is not \code{NULL}, the
method to use for sampling, see Details below.
}
}
\details{
If \code{x} has length 1, is numeric (in the sense of
\code{\link{is.numeric}}) and \code{x >= 1}, sampling \emph{via}
\code{sample} takes place from \code{1:x}. \emph{Note} that this
convenience feature may lead to undesired behaviour when \code{x} is
of varying length in calls such as \code{sample(x)}. See the examples.
Otherwise \code{x} can be any \R object for which \code{length} and
subsetting by integers make sense: S3 or S4 methods for these
operations will be dispatched as appropriate.
For \code{sample} the default for \code{size} is the number of items
inferred from the first argument, so that \code{sample(x)} generates a
random permutation of the elements of \code{x} (or \code{1:x}).
It is allowed to ask for \code{size = 0} samples with \code{n = 0} or
a length-zero \code{x}, but otherwise \code{n > 0} or positive
\code{length(x)} is required.
Non-integer positive numerical values of \code{n} or \code{x} will be
truncated to the next smallest integer, which has to be no larger than
\code{\link{.Machine}$integer.max}.
The optional \code{prob} argument can be used to give a vector of
weights for obtaining the elements of the vector being sampled. They
need not sum to one, but they should be non-negative and not all zero.
If \code{replace} is true, Walker's alias method (Ripley, 1987) is
used when there are more than 200 reasonably probable values: this
gives results incompatible with those from \R < 2.2.0.
If \code{replace} is false and \code{prob} is supplied there are three
options, controlled by \code{method}. The number of nonzero weights
must be at least \code{size} in this case and the weights should sum
to the desired sample size.
All three of the methods have disadvantages. The default,
compatible with \R< 4.4.0 is sequential sampling, that
is, the probability of choosing the next item is proportional to the
weights amongst the remaining items. Sequential sampling is fast,
\emph{but the probability of being sampled is not equal or
proportional to \code{prob}}. Using \code{"marginal"}
draws a sample so that the inclusion probabilities are the values of
\code{prob} and by default \code{size=sum(prob)}. It uses Algorithm
6.11 from Tille (2006). This is much slower for large \code{n}
than the other methods. Finally, \code{"poisson"} does Poisson
sampling, where the inclusion probabilities are the values of
\code{prob} but the \emph{sample size is random} with mean \code{size}
rather than being fixed at \code{size}; it is most useful when
\code{size} is large and the variability is thus relatively small.
\code{sample.int} is a bare interface in which both \code{n} and
\code{size} must be supplied as integers.
Argument \code{n} can be larger than the largest integer of
type \code{integer}, up to the largest representable integer in type
\code{double}. Only uniform sampling is supported. Two
random numbers are used to ensure uniform sampling of large integers.
}
\value{
For \code{sample} a vector of length \code{size} with elements
drawn from either \code{x} or from the integers \code{1:x}.
For \code{sample.int}, an integer vector of length \code{size} with
elements from \code{1:n}, or a double vector if
\eqn{n \ge 2^{31}}{n >= 2^31}.
}
\references{
Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988)
\emph{The New S Language}.
Wadsworth & Brooks/Cole.
Ripley, B. D. (1987) \emph{Stochastic Simulation}. Wiley.
Tille, Y (2006) \emph{Sampling Algorithms} Springer
}
\seealso{
\code{\link{RNGkind}(sample.kind = ..)} about random number generation,
notably the change of \code{sample()} results with \R version 3.6.0.
CRAN package \CRANpkg{sampling} for other methods of weighted sampling
without replacement.
}
\examples{
x <- 1:12
# a random permutation
sample(x)
# bootstrap resampling -- only if length(x) > 1 !
sample(x, replace = TRUE)
# 100 Bernoulli trials
sample(c(0,1), 100, replace = TRUE)
## More careful bootstrapping -- Consider this when using sample()
## programmatically (i.e., in your function or simulation)!
# sample()'s surprise -- example
x <- 1:10
sample(x[x > 8]) # length 2
sample(x[x > 9]) # oops -- length 10!
sample(x[x > 10]) # length 0
## safer version:
resample <- function(x, ...) x[sample.int(length(x), ...)]
resample(x[x > 8]) # length 2
resample(x[x > 9]) # length 1
resample(x[x > 10]) # length 0
## R 3.0.0 and later
sample.int(1e10, 12, replace = TRUE)
sample.int(1e10, 12) # not that there is much chance of duplicates
## R 4.4.0 and later
### Sequential sampling does not give the specified inclusion probability
mean(replicate(100, 1 \%in\% sample_int(6, size=3, prob=c(1,1/2,1/2,1/3,1/3,1/3),
replace=FALSE, method="sequential")))
### marginal sampling does give the specified inclusion probability
mean(replicate(100, 1 \%in\% sample_int(6, size=3, prob=c(1,1/2,1/2,1/3,1/3,1/3),
replace=FALSE, method="marginal")))
}
\keyword{distribution}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment