brodieG/sample01101.R

## sample01101.R
sample01101 <- function(size, n, complete.only=FALSE) {
  size <- as.integer(size)
  n <- as.integer(n)

  if(size > 25 || size < 3L) stop(
    "Currently size min is 3 and max is 25, though should be possible to allow ",
    "smaller and larger with some changes"
  )
  # Generate integer pool and weights

  size0 <- size - 1L
  pool.raw <- seq.int(2L ^ size) - 1L
  pool.raw.len <- valid.unique <- length(pool.raw)

  # weights are a function of how many trailing zeroes each number has, for
  # example `1000` has three trailing zeroes and represnts `1000`, `100`,
  # `10`, and `1`, so it should be weighed 4x

  weights <- rep(1L, pool.raw.len)
  for(i in seq.int(size0))
    weights[seq.int(from=1L, to=pool.raw.len, by=2 ^ i)] <- i + 1L

  # Create indices to map from the "weighted" vectors to the original
  # vectors

  pool.vals <- rep(pool.raw, weights)
  pool.len <- length(pool.vals)

  # For each repeated value, what count of trailing zeros does it correspond
  # to (equivalent to: `unlist(lapply(weights, seq.int))`, but faster)

  z <- integer(pool.len)
  z[c(1L, cumsum(head(weights, -1L)) + 1L)] <- 1L
  w <- cumsum(!z)
  t <- cummax(z * w)
  zeros.imp <- w - t + 1L

  # Generate our encoded vectors by right padding with enough zeros and then
  # adding as a value the number of zeros to the padded area

  zero.pad <- as.integer(2L ^ ceiling(log2(size)))
  vals.enc <- pool.vals * zero.pad + zeros.imp - 1L

  # Results tracking

  res <- matrix(0L, nrow=n, ncol=2L)
  res[, 2L] <- size      # leads to "" if not changed
  max.allowed <- size0   # how padded a number to pick can be

  # Pre compute frequently used sequences and number patterns

  zero.mx <- as.integer(2 ^ (size - seq(size))) *
    !lower.tri(matrix(ncol=size, nrow=size))
  seqs <- lapply(1L:size, seq.int)
  seqs0 <- lapply(seqs, `-`, 1L)
  seq.rev <- rev(seq.int(size))
  seq.rev0 <- seq.rev - 1L
  ones <- rep(1L, size)

  # Loop through the `n` requested samples

  for(i in seq.int(n)) {

    # Check for completeness, and remove values that would lead to incomplete
    # pools.

    if(complete.only) {
      if(max.allowed) {
        rem.pow <- which(n - i >= valid.unique - 2L ^ seqs[[max.allowed]])
        for(j in rev(rem.pow)) {
          to.rem <- which(
            weights[vals.enc %/% zero.pad + 1L] - vals.enc %% zero.pad - 1L ==
              max.allowed
          )

          vector_reduce(vals.enc, to.rem, to.rem)
          max.allowed <- max.allowed - 1L
        }
        if(!max.allowed && n - i >= length(vals.enc))
          stop(
            "Logic Error: pool is not large enough to support complete samples"
    ) } }
    if(!(pool.len.left <- length(vals.enc))) break
    val.enc <- if(pool.len.left > 1L) sample(vals.enc, 1L) else vals.enc

    # Figure out how many trailing zeros our number has (recall, this is
    # encoded in the least significant bits of our number); note, zeros is a bit
    # misleading, it means: "how many digits after initial digit are explicilty
    # specied".  The name `zeros` comes from numbers like `1` that would need to
    # add zeros to be specified (e.g. `1000`, which has three zeros)

    val <- val.enc %/% zero.pad
    enc <- val.enc %% zero.pad
    weight <- weights[[val + 1L]]
    zeros <- size - weight + enc
    pad <- size0 - zeros
    res[i, ] <- c(val, pad)

    # Based on number of zeros, we can figure out up to what value we need
    # to disqualify

    disq.hi.enc <- if(pad) {
      val.hi <- as.integer((val + 2L ^ pad - 1L))
      val.hi * zero.pad + weights[[val.hi + 1L]] - 1L
    } else val.enc

    # Incremental disqualification of smaller patterns by computing the
    # decimal value from a sequantially truncated bit matrix

    disq.val.enc <- if(zeros) {
      seq.z <- seqs[[zeros]]
      disqual.more.tmp <- as.integer(
        ones[seq.z] %*% (
          as.integer(intToBits(val)[seq.rev])[seq.z] *
          zero.mx[seq.z, seq.z, drop=F]
      ) )
      offset <- seqs0[[zeros]] + weights[disqual.more.tmp + 1L] - size
      disq.ind <- disqual.more.tmp * zero.pad + offset
      unique.default(disq.ind[which(disq.ind < val.enc)])
    } else integer()
    # Find values to remove

    disq.val.ind <- bin_find(vals.enc, c(val.enc, disq.hi.enc), TRUE)
    disq.val.extra <- bin_find(vals.enc, disq.val.enc, FALSE)
    disq.val.extra <- disq.val.extra[which(disq.val.extra > 0L)]

    lo <- disq.val.ind[[1L]]
    if(!lo) if(val.enc < vals.enc[[1L]]) lo <- 1L else
      stop("Logic Error, inconsistent indices")
    hi <- disq.val.ind[[2L]]
    if(!hi) if(disq.hi.enc > vals.enc[[pool.len.left]]) hi <- pool.len.left else
      stop("Logic Error, inconsistent indices")

    # Now remove values from our `val.enc` vector.  This is done by reference
    # with a C function.  Note `disq.val.extra` should be always have smaller
    # values than `lo` or `hi`.

    starts <- c(disq.val.extra, lo)
    ends <- c(disq.val.extra, hi)
    vector_reduce(vals.enc, start=starts,  end=ends) # modifies `val.enc.` by reference
    valid.unique <- valid.unique - length(disq.val.extra) - 2 ^ pad
  }
  # Now convert to binary representation; note we assume ints are 32 bits

  res.raw <- matrix(as.integer(intToBits(res[, 1L])), nrow=32L)[seq.rev, ]
  substr(do.call(paste0, split(res.raw, row(res.raw))), 0L, size - res[, 2L])
}
#' Use binary search to find indices in a vector by matching values

bin_find <- inline::cfunction(
  signature(vec="integer", values="integer", ifnf="logical"), "
  if(
    TYPEOF(vec) != INTSXP || TYPEOF(values) != INTSXP || TYPEOF(ifnf) != LGLSXP
  )
    error(\"Invalid inputs\");
  SEXP res;
  int *vals = INTEGER(values), *v = INTEGER(vec), i, vallen, min, max, mid,
    vlen, *resvals;
  vallen = LENGTH(values);
  vlen = LENGTH(vec);

  PROTECT(res = allocVector(INTSXP, LENGTH(values)));
  resvals = INTEGER(res);

  // can probably optimize this further by using info from each iterative binary
  // search to reduce subsequent search, or using non early termination algo
  // adapted from https://en.wikipedia.org/wiki/Binary_search_algorithm

  for(i = 0; i < vallen; i++) {
    min = 0;
    max = vlen - 1;
    int foundval = 0;

    while (max >= min) {
      mid = (max - min) / 2 + min;
      //if(k++ > 20) error(\"infinite loop\");
      //Rprintf(\"mid %d min %d max %d val %d vmax %d vmin %d\\n\", mid, min, max, vals[i], v[max], v[min]);
      if(v[mid] == vals[i]) {
        resvals[i] = mid + 1; // convert to base 1 index
        foundval = 1;
        break;
      }
      else if (v[mid] < vals[i])
        min = mid + 1;
      else
        max = mid - 1;
    }
    if(foundval) continue;
    if(asLogical(ifnf)) {    // Not found
      resvals[i] = min;
    } else resvals[i] = 0;
  }
  UNPROTECT(1);
  return(res);
  "
)
#' Reduce a vector by reference by removing values between corresponding entries
#' in starts and ends

vector_reduce <- inline::cfunction(
  signature(vec="integer", starts="integer", ends="integer"),
  "
  if(
    TYPEOF(starts) != INTSXP || TYPEOF(ends) != INTSXP || TYPEOF(vec) != INTSXP
  )
    error(\"Invalid inputs\");

  int remlen = LENGTH(starts), veclen = LENGTH(vec), l, offset=0, i, remind=0;
  int *vecvals = INTEGER(vec), *startvals=INTEGER(starts),
    *endvals=INTEGER(ends);

  if(!remlen) return ScalarInteger(0);
  if(LENGTH(ends) != remlen) error(\"`starts` and `ends` must be same length\");
  for(i = 0; i < remlen; i++) {
    int end_max=0;
    if(
      startvals[i] > endvals[i] || endvals[i] < startvals[i] ||
      (i > 0 && startvals[i] <= startvals[i - 1]) ||
      startvals[i] < 1
    )
      error(
        \"Inconsistent starts and/or ends %d %d at index %d\",
        startvals[i], endvals[i], i+1
      );
    if(i > 0 && startvals[i] < end_max)
      error(\"Nested Intervals\");
    if(endvals[i] > end_max) end_max = endvals[i];
  }
  // Should figure out which end to do this from, as in some case will be much
  // faster to shift back instead of forward, but won't bother for now

  l = startvals[remind] - 1;    // remember these are 1 indexed
  offset = endvals[remind] - l;
  remind++;

  while(1) {
    if(remind < remlen)
      while(l + offset >= startvals[remind] - 1 && remind < remlen) {
        offset += endvals[remind] - startvals[remind] + 1L;
        remind++;
      }
    if(l + offset >= veclen) break;
    vecvals[l] = vecvals[l + offset];
    l++;
  }
  SETLENGTH(vec, veclen - offset);
  return ScalarInteger(veclen - offset);
  "
)
	sample01101 <- function(size, n, complete.only=FALSE) {
	size <- as.integer(size)
	n <- as.integer(n)

	if(size > 25 \|\| size < 3L) stop(
	"Currently size min is 3 and max is 25, though should be possible to allow ",
	"smaller and larger with some changes"
	)
	# Generate integer pool and weights

	size0 <- size - 1L
	pool.raw <- seq.int(2L ^ size) - 1L
	pool.raw.len <- valid.unique <- length(pool.raw)

	# weights are a function of how many trailing zeroes each number has, for
	# example `1000` has three trailing zeroes and represnts `1000`, `100`,
	# `10`, and `1`, so it should be weighed 4x

	weights <- rep(1L, pool.raw.len)
	for(i in seq.int(size0))
	weights[seq.int(from=1L, to=pool.raw.len, by=2 ^ i)] <- i + 1L

	# Create indices to map from the "weighted" vectors to the original
	# vectors

	pool.vals <- rep(pool.raw, weights)
	pool.len <- length(pool.vals)

	# For each repeated value, what count of trailing zeros does it correspond
	# to (equivalent to: `unlist(lapply(weights, seq.int))`, but faster)

	z <- integer(pool.len)
	z[c(1L, cumsum(head(weights, -1L)) + 1L)] <- 1L
	w <- cumsum(!z)
	t <- cummax(z * w)
	zeros.imp <- w - t + 1L

	# Generate our encoded vectors by right padding with enough zeros and then
	# adding as a value the number of zeros to the padded area

	zero.pad <- as.integer(2L ^ ceiling(log2(size)))
	vals.enc <- pool.vals * zero.pad + zeros.imp - 1L

	# Results tracking

	res <- matrix(0L, nrow=n, ncol=2L)
	res[, 2L] <- size # leads to "" if not changed
	max.allowed <- size0 # how padded a number to pick can be

	# Pre compute frequently used sequences and number patterns

	zero.mx <- as.integer(2 ^ (size - seq(size))) *
	!lower.tri(matrix(ncol=size, nrow=size))
	seqs <- lapply(1L:size, seq.int)
	seqs0 <- lapply(seqs, `-`, 1L)
	seq.rev <- rev(seq.int(size))
	seq.rev0 <- seq.rev - 1L
	ones <- rep(1L, size)

	# Loop through the `n` requested samples

	for(i in seq.int(n)) {

	# Check for completeness, and remove values that would lead to incomplete
	# pools.

	if(complete.only) {
	if(max.allowed) {
	rem.pow <- which(n - i >= valid.unique - 2L ^ seqs[[max.allowed]])
	for(j in rev(rem.pow)) {
	to.rem <- which(
	weights[vals.enc %/% zero.pad + 1L] - vals.enc %% zero.pad - 1L ==
	max.allowed
	)

	vector_reduce(vals.enc, to.rem, to.rem)
	max.allowed <- max.allowed - 1L
	}
	if(!max.allowed && n - i >= length(vals.enc))
	stop(
	"Logic Error: pool is not large enough to support complete samples"
	) } }
	if(!(pool.len.left <- length(vals.enc))) break
	val.enc <- if(pool.len.left > 1L) sample(vals.enc, 1L) else vals.enc

	# Figure out how many trailing zeros our number has (recall, this is
	# encoded in the least significant bits of our number); note, zeros is a bit
	# misleading, it means: "how many digits after initial digit are explicilty
	# specied". The name `zeros` comes from numbers like `1` that would need to
	# add zeros to be specified (e.g. `1000`, which has three zeros)

	val <- val.enc %/% zero.pad
	enc <- val.enc %% zero.pad
	weight <- weights[[val + 1L]]
	zeros <- size - weight + enc
	pad <- size0 - zeros
	res[i, ] <- c(val, pad)

	# Based on number of zeros, we can figure out up to what value we need
	# to disqualify

	disq.hi.enc <- if(pad) {
	val.hi <- as.integer((val + 2L ^ pad - 1L))
	val.hi * zero.pad + weights[[val.hi + 1L]] - 1L
	} else val.enc

	# Incremental disqualification of smaller patterns by computing the
	# decimal value from a sequantially truncated bit matrix

	disq.val.enc <- if(zeros) {
	seq.z <- seqs[[zeros]]
	disqual.more.tmp <- as.integer(
	ones[seq.z] %*% (
	as.integer(intToBits(val)[seq.rev])[seq.z] *
	zero.mx[seq.z, seq.z, drop=F]
	) )
	offset <- seqs0[[zeros]] + weights[disqual.more.tmp + 1L] - size
	disq.ind <- disqual.more.tmp * zero.pad + offset
	unique.default(disq.ind[which(disq.ind < val.enc)])
	} else integer()
	# Find values to remove

	disq.val.ind <- bin_find(vals.enc, c(val.enc, disq.hi.enc), TRUE)
	disq.val.extra <- bin_find(vals.enc, disq.val.enc, FALSE)
	disq.val.extra <- disq.val.extra[which(disq.val.extra > 0L)]

	lo <- disq.val.ind[[1L]]
	if(!lo) if(val.enc < vals.enc[[1L]]) lo <- 1L else
	stop("Logic Error, inconsistent indices")
	hi <- disq.val.ind[[2L]]
	if(!hi) if(disq.hi.enc > vals.enc[[pool.len.left]]) hi <- pool.len.left else
	stop("Logic Error, inconsistent indices")

	# Now remove values from our `val.enc` vector. This is done by reference
	# with a C function. Note `disq.val.extra` should be always have smaller
	# values than `lo` or `hi`.

	starts <- c(disq.val.extra, lo)
	ends <- c(disq.val.extra, hi)
	vector_reduce(vals.enc, start=starts, end=ends) # modifies `val.enc.` by reference
	valid.unique <- valid.unique - length(disq.val.extra) - 2 ^ pad
	}
	# Now convert to binary representation; note we assume ints are 32 bits

	res.raw <- matrix(as.integer(intToBits(res[, 1L])), nrow=32L)[seq.rev, ]
	substr(do.call(paste0, split(res.raw, row(res.raw))), 0L, size - res[, 2L])
	}
	#' Use binary search to find indices in a vector by matching values

	bin_find <- inline::cfunction(
	signature(vec="integer", values="integer", ifnf="logical"), "
	if(
	TYPEOF(vec) != INTSXP \|\| TYPEOF(values) != INTSXP \|\| TYPEOF(ifnf) != LGLSXP
	)
	error(\"Invalid inputs\");
	SEXP res;
	int vals = INTEGER(values), v = INTEGER(vec), i, vallen, min, max, mid,
	vlen, *resvals;
	vallen = LENGTH(values);
	vlen = LENGTH(vec);

	PROTECT(res = allocVector(INTSXP, LENGTH(values)));
	resvals = INTEGER(res);

	// can probably optimize this further by using info from each iterative binary
	// search to reduce subsequent search, or using non early termination algo
	// adapted from https://en.wikipedia.org/wiki/Binary_search_algorithm

	for(i = 0; i < vallen; i++) {
	min = 0;
	max = vlen - 1;
	int foundval = 0;

	while (max >= min) {
	mid = (max - min) / 2 + min;
	//if(k++ > 20) error(\"infinite loop\");
	//Rprintf(\"mid %d min %d max %d val %d vmax %d vmin %d\\n\", mid, min, max, vals[i], v[max], v[min]);
	if(v[mid] == vals[i]) {
	resvals[i] = mid + 1; // convert to base 1 index
	foundval = 1;
	break;
	}
	else if (v[mid] < vals[i])
	min = mid + 1;
	else
	max = mid - 1;
	}
	if(foundval) continue;
	if(asLogical(ifnf)) { // Not found
	resvals[i] = min;
	} else resvals[i] = 0;
	}
	UNPROTECT(1);
	return(res);
	"
	)
	#' Reduce a vector by reference by removing values between corresponding entries
	#' in starts and ends

	vector_reduce <- inline::cfunction(
	signature(vec="integer", starts="integer", ends="integer"),
	"
	if(
	TYPEOF(starts) != INTSXP \|\| TYPEOF(ends) != INTSXP \|\| TYPEOF(vec) != INTSXP
	)
	error(\"Invalid inputs\");

	int remlen = LENGTH(starts), veclen = LENGTH(vec), l, offset=0, i, remind=0;
	int vecvals = INTEGER(vec), startvals=INTEGER(starts),
	*endvals=INTEGER(ends);

	if(!remlen) return ScalarInteger(0);
	if(LENGTH(ends) != remlen) error(\"`starts` and `ends` must be same length\");
	for(i = 0; i < remlen; i++) {
	int end_max=0;
	if(
	startvals[i] > endvals[i] \|\| endvals[i] < startvals[i] \|\|
	(i > 0 && startvals[i] <= startvals[i - 1]) \|\|
	startvals[i] < 1
	)
	error(
	\"Inconsistent starts and/or ends %d %d at index %d\",
	startvals[i], endvals[i], i+1
	);
	if(i > 0 && startvals[i] < end_max)
	error(\"Nested Intervals\");
	if(endvals[i] > end_max) end_max = endvals[i];
	}
	// Should figure out which end to do this from, as in some case will be much
	// faster to shift back instead of forward, but won't bother for now

	l = startvals[remind] - 1; // remember these are 1 indexed
	offset = endvals[remind] - l;
	remind++;

	while(1) {
	if(remind < remlen)
	while(l + offset >= startvals[remind] - 1 && remind < remlen) {
	offset += endvals[remind] - startvals[remind] + 1L;
	remind++;
	}
	if(l + offset >= veclen) break;
	vecvals[l] = vecvals[l + offset];
	l++;
	}
	SETLENGTH(vec, veclen - offset);
	return ScalarInteger(veclen - offset);
	"
	)