tslumley/new_sample_int.R

## new_sample_int.R
## New sample.int

sample_int<-function(n, size=NULL, replace=FALSE, prob=NULL,
                     useHash=(n >  1e+07 && !replace && is.null(prob) && (!is.null(size)) && size <= n/2),
                     method=c("sequential","marginal","poisson")){
    if (replace || is.null(prob)){
        if (is.null(size)){
            size<-n
        }
    } else{
        if (is.null(size))
            size<-sum(prob)
    }
    stopifnot(length(n) == 1L)
    if (useHash) {
        stopifnot(is.null(prob), !replace)
        .Internal(sample2(n, size))
    }
    else if (!is.null(prob) && !replace){
        if (length(prob)!=n) stop("incorrect number of probabilities")
        method<-match.arg(method)
        switch(method,
               sequential=sample.int(n, size=size, replace=replace,prob=prob, useHash=FALSE),
               marginal=sample_pps(n, size, prob),
               poisson=sample(seq.int(1,n)[runif(n)<=prob*size/sum(prob)]))
    } else {
        ## Will be the .Internal from sample.int
        sample.int(n, size=size ,replace=replace, prob=prob, useHash=FALSE)
    }
}


sample_pps<-function(n, size, prob){
    eps<-1e-5
    s<-sum(prob)

    sums_to_one <- abs((s-1)/(s+eps))<eps
    sums_to_int <- abs((s-round(s))/(s+eps))<eps

    if (is.null(size)){
        if(!sums_to_int) stop("sum(prob) must be an integer")
        size<-round(s)
    } else{
        size_is_sum = abs((size-sum(prob))/(size+eps))<eps
        size_is_int = abs((size-round(size))/(size+eps))<eps
        if (!size_is_int) stop("size must be NULL or an integer")

        if (sums_to_one && !size_is_sum){
            warning("rescaling prob, which changes inclusion probabilities")
            prob<-inclusion_probabilities(prob*size, size)
        } else if (sums_to_int && !size_is_sum){
            warning("sum(prob) is not equal to size or 1, rescaling")
            prob<-inclusion_probabilities(prob*size/s,size)
        }

        rval<-seq_len(n)[sampling::UPtille(prob)!=0] ##sample_brewer(prob)
        if (size>1)
            sample(rval)
        else
            rval
    }

}

## taken from sampling::inclusionprobabilities by Alina Matei and Yves Tille (GPL>2)
inclusion_probabilities<-function (a, n)
{
    if (!is.vector(a))
        a = as.vector(a)
    nnull = length(a[a == 0])
    nneg = length(a[a < 0])
    if (nnull > 0)
        warning("there are zero values in the initial vector a\n")
    if (nneg > 0) {
        warning("there are ", nneg, " negative value(s) shifted to zero\n")
        a[(a < 0)] = 0
    }
    if (identical(a, rep(0, length(a))))
        pik1 = a
    else {
        pik1 = n * a/sum(a)
        pik = pik1[pik1 > 0]
        list1 = pik1 > 0
        list = pik >= 1
        l = length(list[list == TRUE])
        if (l > 0) {
            l1 = 0
            while (l != l1) {
                x = pik[!list]
                x = x/sum(x)
                pik[!list] = (n - l) * x
                pik[list] = 1
                l1 = l
                list = (pik >= 1)
                l = length(list[list == TRUE])
            }
            pik1[list1] = pik
        }
    }
    pik1
}

## sample.Rd
% File src/library/base/man/sample.Rd
% Part of the R package, https://www.R-project.org
% Copyright 1995-2023 R Core Team
% Distributed under GPL 2 or later

\name{sample}
\alias{sample}
\alias{sample.int}
\title{Random Samples and Permutations}
\description{
  \code{sample} takes a sample of the specified size from the elements
  of \code{x} using either with or without replacement.
}
\usage{
sample(x, size, replace = FALSE, prob = NULL)

sample.int(n, size = NULL, replace = FALSE, prob = NULL,
           useHash = (n > 1e+07 && !replace && is.null(prob) && size <= n/2))
}
\arguments{
  \item{x}{either a vector of one or more elements from which to choose,
    or a positive integer.  See \sQuote{Details.}}
  \item{n}{a positive number, the number of items to choose from.  See
    \sQuote{Details.}}
  \item{size}{a non-negative integer giving the number of items to
           choose. The default is \code{n} except for
           unequal-probability sampling without replacement, where it is
           \code{sum(prob)}}
  \item{replace}{should sampling be with replacement?}
  \item{prob}{a vector of probability weights for obtaining the elements
    of the vector being sampled.}
  \item{useHash}{\code{\link{logical}} indicating if the hash-version of
    the algorithm should be used.  Can only be used for \code{replace =
      FALSE}, \code{prob = NULL}, and \code{size <= n/2}, and really
    should be used for large \code{n}, as \code{useHash=FALSE} will use
    memory proportional to \code{n}.}
  \item{method}{
    When \code{replace=FALSE} and \code{prob} is not \code{NULL}, the
           method to use for sampling, see Details below.
    }
}
\details{
  If \code{x} has length 1, is numeric (in the sense of
  \code{\link{is.numeric}}) and \code{x >= 1}, sampling \emph{via}
  \code{sample} takes place from \code{1:x}.  \emph{Note} that this
  convenience feature may lead to undesired behaviour when \code{x} is
  of varying length in calls such as \code{sample(x)}.  See the examples.

  Otherwise \code{x} can be any \R object for which \code{length} and
  subsetting by integers make sense: S3 or S4 methods for these
  operations will be dispatched as appropriate.

  For \code{sample} the default for \code{size} is the number of items
  inferred from the first argument, so that \code{sample(x)} generates a
  random permutation of the elements of \code{x} (or \code{1:x}).

  It is allowed to ask for \code{size = 0} samples with \code{n = 0} or
  a length-zero \code{x}, but otherwise \code{n > 0} or positive
  \code{length(x)} is required.

  Non-integer positive numerical values of \code{n} or \code{x} will be
  truncated to the next smallest integer, which has to be no larger than
  \code{\link{.Machine}$integer.max}.

  The optional \code{prob} argument can be used to give a vector of
  weights for obtaining the elements of the vector being sampled.  They
  need not sum to one, but they should be non-negative and not all zero.
  If \code{replace} is true, Walker's alias method (Ripley, 1987) is
  used when there are more than 200 reasonably probable values: this
  gives results incompatible with those from \R < 2.2.0.

  If \code{replace} is false and \code{prob} is supplied there are three
  options, controlled by \code{method}.  The number of nonzero weights
  must be at least \code{size} in this case and the weights should sum
  to the desired sample size.

  All three of the methods have disadvantages.  The default,
  compatible with \R< 4.4.0 is sequential sampling, that
  is,  the probability of choosing the next item is proportional to the
  weights amongst the remaining items.  Sequential sampling is fast,
  \emph{but the probability of being sampled is not equal or
    proportional to \code{prob}}. Using \code{"marginal"}
  draws a sample so that the inclusion probabilities are the values of
  \code{prob} and by default \code{size=sum(prob)}. It uses Algorithm
  6.11 from Tille (2006). This is much slower for large \code{n}
  than the other methods.  Finally, \code{"poisson"} does Poisson
  sampling, where the inclusion probabilities are the values of
  \code{prob} but the \emph{sample size is random} with mean \code{size}
  rather than being fixed at \code{size}; it is most useful when
  \code{size} is large and the variability is thus relatively small.

  \code{sample.int} is a bare interface in which both \code{n} and
  \code{size} must be supplied as integers.

  Argument \code{n} can be larger than the largest integer of
  type \code{integer}, up to the largest representable integer in type
  \code{double}.  Only uniform sampling is supported.  Two
  random numbers are used to ensure uniform sampling of large integers.
}

\value{
  For \code{sample} a vector of length \code{size} with elements
  drawn from either \code{x} or from the integers \code{1:x}.

  For \code{sample.int}, an integer vector of length \code{size} with
  elements from \code{1:n}, or a double vector if
  \eqn{n \ge 2^{31}}{n >= 2^31}.
}

\references{
  Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988)
  \emph{The New S Language}.
  Wadsworth & Brooks/Cole.

  Ripley, B. D. (1987) \emph{Stochastic Simulation}. Wiley.

  Tille, Y (2006) \emph{Sampling Algorithms} Springer
}
\seealso{
  \code{\link{RNGkind}(sample.kind = ..)} about random number generation,
  notably the change of \code{sample()} results with \R version 3.6.0.

  CRAN package \CRANpkg{sampling} for other methods of weighted sampling
  without replacement.
}
\examples{
x <- 1:12
# a random permutation
sample(x)
# bootstrap resampling -- only if length(x) > 1 !
sample(x, replace = TRUE)

# 100 Bernoulli trials
sample(c(0,1), 100, replace = TRUE)

## More careful bootstrapping --  Consider this when using sample()
## programmatically (i.e., in your function or simulation)!

# sample()'s surprise -- example
x <- 1:10
    sample(x[x >  8]) # length 2
    sample(x[x >  9]) # oops -- length 10!
    sample(x[x > 10]) # length 0

## safer version:
resample <- function(x, ...) x[sample.int(length(x), ...)]
resample(x[x >  8]) # length 2
resample(x[x >  9]) # length 1
resample(x[x > 10]) # length 0

## R 3.0.0 and later
sample.int(1e10, 12, replace = TRUE)
sample.int(1e10, 12) # not that there is much chance of duplicates

## R 4.4.0 and later
### Sequential sampling does not give the specified inclusion probability
mean(replicate(100, 1 \%in\% sample_int(6, size=3, prob=c(1,1/2,1/2,1/3,1/3,1/3),
replace=FALSE, method="sequential")))
### marginal sampling does give the specified inclusion probability
mean(replicate(100, 1 \%in\% sample_int(6, size=3, prob=c(1,1/2,1/2,1/3,1/3,1/3),
replace=FALSE, method="marginal")))
}
\keyword{distribution}
	## New sample.int

	sample_int<-function(n, size=NULL, replace=FALSE, prob=NULL,
	useHash=(n > 1e+07 && !replace && is.null(prob) && (!is.null(size)) && size <= n/2),
	method=c("sequential","marginal","poisson")){
	if (replace \|\| is.null(prob)){
	if (is.null(size)){
	size<-n
	}
	} else{
	if (is.null(size))
	size<-sum(prob)
	}
	stopifnot(length(n) == 1L)
	if (useHash) {
	stopifnot(is.null(prob), !replace)
	.Internal(sample2(n, size))
	}
	else if (!is.null(prob) && !replace){
	if (length(prob)!=n) stop("incorrect number of probabilities")
	method<-match.arg(method)
	switch(method,
	sequential=sample.int(n, size=size, replace=replace,prob=prob, useHash=FALSE),
	marginal=sample_pps(n, size, prob),
	poisson=sample(seq.int(1,n)[runif(n)<=prob*size/sum(prob)]))
	} else {
	## Will be the .Internal from sample.int
	sample.int(n, size=size ,replace=replace, prob=prob, useHash=FALSE)
	}
	}


	sample_pps<-function(n, size, prob){
	eps<-1e-5
	s<-sum(prob)

	sums_to_one <- abs((s-1)/(s+eps))<eps
	sums_to_int <- abs((s-round(s))/(s+eps))<eps

	if (is.null(size)){
	if(!sums_to_int) stop("sum(prob) must be an integer")
	size<-round(s)
	} else{
	size_is_sum = abs((size-sum(prob))/(size+eps))<eps
	size_is_int = abs((size-round(size))/(size+eps))<eps
	if (!size_is_int) stop("size must be NULL or an integer")

	if (sums_to_one && !size_is_sum){
	warning("rescaling prob, which changes inclusion probabilities")
	prob<-inclusion_probabilities(prob*size, size)
	} else if (sums_to_int && !size_is_sum){
	warning("sum(prob) is not equal to size or 1, rescaling")
	prob<-inclusion_probabilities(prob*size/s,size)
	}

	rval<-seq_len(n)[sampling::UPtille(prob)!=0] ##sample_brewer(prob)
	if (size>1)
	sample(rval)
	else
	rval
	}

	}

	## taken from sampling::inclusionprobabilities by Alina Matei and Yves Tille (GPL>2)
	inclusion_probabilities<-function (a, n)
	{
	if (!is.vector(a))
	a = as.vector(a)
	nnull = length(a[a == 0])
	nneg = length(a[a < 0])
	if (nnull > 0)
	warning("there are zero values in the initial vector a\n")
	if (nneg > 0) {
	warning("there are ", nneg, " negative value(s) shifted to zero\n")
	a[(a < 0)] = 0
	}
	if (identical(a, rep(0, length(a))))
	pik1 = a
	else {
	pik1 = n * a/sum(a)
	pik = pik1[pik1 > 0]
	list1 = pik1 > 0
	list = pik >= 1
	l = length(list[list == TRUE])
	if (l > 0) {
	l1 = 0
	while (l != l1) {
	x = pik[!list]
	x = x/sum(x)
	pik[!list] = (n - l) * x
	pik[list] = 1
	l1 = l
	list = (pik >= 1)
	l = length(list[list == TRUE])
	}
	pik1[list1] = pik
	}
	}
	pik1
	}
	% File src/library/base/man/sample.Rd
	% Part of the R package, https://www.R-project.org
	% Copyright 1995-2023 R Core Team
	% Distributed under GPL 2 or later

	\name{sample}
	\alias{sample}
	\alias{sample.int}
	\title{Random Samples and Permutations}
	\description{
	\code{sample} takes a sample of the specified size from the elements
	of \code{x} using either with or without replacement.
	}
	\usage{
	sample(x, size, replace = FALSE, prob = NULL)

	sample.int(n, size = NULL, replace = FALSE, prob = NULL,
	useHash = (n > 1e+07 && !replace && is.null(prob) && size <= n/2))
	}
	\arguments{
	\item{x}{either a vector of one or more elements from which to choose,
	or a positive integer. See \sQuote{Details.}}
	\item{n}{a positive number, the number of items to choose from. See
	\sQuote{Details.}}
	\item{size}{a non-negative integer giving the number of items to
	choose. The default is \code{n} except for
	unequal-probability sampling without replacement, where it is
	\code{sum(prob)}}
	\item{replace}{should sampling be with replacement?}
	\item{prob}{a vector of probability weights for obtaining the elements
	of the vector being sampled.}
	\item{useHash}{\code{\link{logical}} indicating if the hash-version of
	the algorithm should be used. Can only be used for \code{replace =
	FALSE}, \code{prob = NULL}, and \code{size <= n/2}, and really
	should be used for large \code{n}, as \code{useHash=FALSE} will use
	memory proportional to \code{n}.}
	\item{method}{
	When \code{replace=FALSE} and \code{prob} is not \code{NULL}, the
	method to use for sampling, see Details below.
	}
	}
	\details{
	If \code{x} has length 1, is numeric (in the sense of
	\code{\link{is.numeric}}) and \code{x >= 1}, sampling \emph{via}
	\code{sample} takes place from \code{1:x}. \emph{Note} that this
	convenience feature may lead to undesired behaviour when \code{x} is
	of varying length in calls such as \code{sample(x)}. See the examples.

	Otherwise \code{x} can be any \R object for which \code{length} and
	subsetting by integers make sense: S3 or S4 methods for these
	operations will be dispatched as appropriate.

	For \code{sample} the default for \code{size} is the number of items
	inferred from the first argument, so that \code{sample(x)} generates a
	random permutation of the elements of \code{x} (or \code{1:x}).

	It is allowed to ask for \code{size = 0} samples with \code{n = 0} or
	a length-zero \code{x}, but otherwise \code{n > 0} or positive
	\code{length(x)} is required.

	Non-integer positive numerical values of \code{n} or \code{x} will be
	truncated to the next smallest integer, which has to be no larger than
	\code{\link{.Machine}$integer.max}.

	The optional \code{prob} argument can be used to give a vector of
	weights for obtaining the elements of the vector being sampled. They
	need not sum to one, but they should be non-negative and not all zero.
	If \code{replace} is true, Walker's alias method (Ripley, 1987) is
	used when there are more than 200 reasonably probable values: this
	gives results incompatible with those from \R < 2.2.0.

	If \code{replace} is false and \code{prob} is supplied there are three
	options, controlled by \code{method}. The number of nonzero weights
	must be at least \code{size} in this case and the weights should sum
	to the desired sample size.

	All three of the methods have disadvantages. The default,
	compatible with \R< 4.4.0 is sequential sampling, that
	is, the probability of choosing the next item is proportional to the
	weights amongst the remaining items. Sequential sampling is fast,
	\emph{but the probability of being sampled is not equal or
	proportional to \code{prob}}. Using \code{"marginal"}
	draws a sample so that the inclusion probabilities are the values of
	\code{prob} and by default \code{size=sum(prob)}. It uses Algorithm
	6.11 from Tille (2006). This is much slower for large \code{n}
	than the other methods. Finally, \code{"poisson"} does Poisson
	sampling, where the inclusion probabilities are the values of
	\code{prob} but the \emph{sample size is random} with mean \code{size}
	rather than being fixed at \code{size}; it is most useful when
	\code{size} is large and the variability is thus relatively small.

	\code{sample.int} is a bare interface in which both \code{n} and
	\code{size} must be supplied as integers.

	Argument \code{n} can be larger than the largest integer of
	type \code{integer}, up to the largest representable integer in type
	\code{double}. Only uniform sampling is supported. Two
	random numbers are used to ensure uniform sampling of large integers.
	}

	\value{
	For \code{sample} a vector of length \code{size} with elements
	drawn from either \code{x} or from the integers \code{1:x}.

	For \code{sample.int}, an integer vector of length \code{size} with
	elements from \code{1:n}, or a double vector if
	\eqn{n \ge 2^{31}}{n >= 2^31}.
	}

	\references{
	Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988)
	\emph{The New S Language}.
	Wadsworth & Brooks/Cole.

	Ripley, B. D. (1987) \emph{Stochastic Simulation}. Wiley.

	Tille, Y (2006) \emph{Sampling Algorithms} Springer
	}
	\seealso{
	\code{\link{RNGkind}(sample.kind = ..)} about random number generation,
	notably the change of \code{sample()} results with \R version 3.6.0.

	CRAN package \CRANpkg{sampling} for other methods of weighted sampling
	without replacement.
	}
	\examples{
	x <- 1:12
	# a random permutation
	sample(x)
	# bootstrap resampling -- only if length(x) > 1 !
	sample(x, replace = TRUE)

	# 100 Bernoulli trials
	sample(c(0,1), 100, replace = TRUE)

	## More careful bootstrapping -- Consider this when using sample()
	## programmatically (i.e., in your function or simulation)!

	# sample()'s surprise -- example
	x <- 1:10
	sample(x[x > 8]) # length 2
	sample(x[x > 9]) # oops -- length 10!
	sample(x[x > 10]) # length 0

	## safer version:
	resample <- function(x, ...) x[sample.int(length(x), ...)]
	resample(x[x > 8]) # length 2
	resample(x[x > 9]) # length 1
	resample(x[x > 10]) # length 0

	## R 3.0.0 and later
	sample.int(1e10, 12, replace = TRUE)
	sample.int(1e10, 12) # not that there is much chance of duplicates

	## R 4.4.0 and later
	### Sequential sampling does not give the specified inclusion probability
	mean(replicate(100, 1 \%in\% sample_int(6, size=3, prob=c(1,1/2,1/2,1/3,1/3,1/3),
	replace=FALSE, method="sequential")))
	### marginal sampling does give the specified inclusion probability
	mean(replicate(100, 1 \%in\% sample_int(6, size=3, prob=c(1,1/2,1/2,1/3,1/3,1/3),
	replace=FALSE, method="marginal")))
	}
	\keyword{distribution}