Skip to content

Instantly share code, notes, and snippets.

@brodieG
Last active August 29, 2015 14:23
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save brodieG/dad62cb00f8337635874 to your computer and use it in GitHub Desktop.
Save brodieG/dad62cb00f8337635874 to your computer and use it in GitHub Desktop.
Code for SO Q: Random sample of character vector, without elements prefixing one another
sample0110 <- function(size, n, complete.only=FALSE) {
size <- as.integer(size)
n <- as.integer(n)
if(size > 25 || size < 3L) stop(
"Currently size min is 3 and max is 25, though should be possible to allow ",
"smaller and larger with some changes"
)
# Generate integer pool and weights
size0 <- size - 1L
pool.raw <- seq.int(2L ^ size) - 1L
pool.raw.len <- valid.unique <- length(pool.raw)
# weights are a function of how many trailing zeroes each number has, for
# example `1000` has three trailing zeroes and represnts `1000`, `100`,
# `10`, and `1`, so it should be weighed 4x
weights <- rep(1L, pool.raw.len)
for(i in seq.int(size0))
weights[seq.int(from=1L, to=pool.raw.len, by=2 ^ i)] <- i + 1L
# Create indices to map from the "weighted" vectors to the original
# vectors
pool.vals <- rep(pool.raw, weights)
pool.len <- length(pool.vals)
# For each repeated value, what count of trailing zeros does it correspond
# to (equivalent to: `unlist(lapply(weights, seq.int))`, but faster)
z <- integer(pool.len)
z[c(1L, cumsum(head(weights, -1L)) + 1L)] <- 1L
w <- cumsum(!z)
t <- cummax(z * w)
zeros.imp <- w - t + 1L
pad.imp <- weights[pool.vals + 1L] - zeros.imp
# Generate our encoded vectors by right padding with enough zeros and then
# adding as a value the number of zeros to the padded area
zero.pad <- as.integer(2L ^ ceiling(log2(size)))
vals.enc <- vals.enc.init <- pool.vals * zero.pad + zeros.imp - 1L
# Results tracking
res <- matrix(0L, nrow=n, ncol=2L)
res[, 2L] <- size # leads to "" if not changed
max.allowed <- size0 # how padded a number to pick can be
free <- rep(TRUE, pool.len)
# Pre compute frequently used sequences and number patterns
zero.mx <- as.integer(2 ^ (size - seq(size))) *
!lower.tri(matrix(ncol=size, nrow=size))
seqs <- lapply(1L:size, seq.int)
seqs0 <- lapply(seqs, `-`, 1L)
seq.rev <- rev(seq.int(size))
seq.rev0 <- seq.rev - 1L
ones <- rep(1L, size)
weights.cs <- cumsum(weights)
pool.lu <- c(1L, head(weights.cs, -1L) + 1L)
# Loop through the `n` requested samples
for(i in seq.int(n)) {
# Check for completeness, and remove values that would lead to incomplete
# pools. We only remove padded values so `valid.unique` is unchanged
if(complete.only) {
if(max.allowed) {
rem.pow <- which(n - i >= valid.unique - 2L ^ seqs[[max.allowed]])
for(j in rev(rem.pow)) {
to.rem <- which(pad.imp == max.allowed)
free[to.rem] <- FALSE
max.allowed <- max.allowed - 1L
}
if(!max.allowed && n - i >= valid.unique)
stop(
"Logic Error: pool is not large enough to support complete samples"
) } }
vals.enc <- vals.enc.init[which(free)]
if(!(pool.len.left <- length(vals.enc))) break
val.enc <- if(pool.len.left > 1L) sample(vals.enc, 1L) else vals.enc
# Figure out how many trailing zeros our number has (recall, this is
# encoded in the least significant bits of our number); note, zeros is a bit
# misleading, it means: "how many digits after initial digit are explicilty
# specied". The name `zeros` comes from numbers like `1` that would need to
# add zeros to be specified (e.g. `1000`, which has three zeros)
val <- val.enc %/% zero.pad
enc <- val.enc %% zero.pad
weight <- weights[[val + 1L]]
zeros <- size - weight + enc
pad <- size0 - zeros
res[i, ] <- c(val, pad)
# Based on number of zeros, we can figure out up to what value we need
# to disqualify (NOTE: different than withbin, here we get the next value
# greater than our range because `free` is always same size)
disq.hi.enc <- as.integer((val + 2L ^ pad)) * zero.pad
# Incremental disqualification of smaller patterns by computing the
# decimal value from a sequantially truncated bit matrix
disq.loc.extra <- if(zeros) {
seq.z <- seqs[[zeros]]
disqual.more.tmp <- as.integer(
ones[seq.z] %*% (
as.integer(intToBits(val)[seq.rev])[seq.z] *
zero.mx[seq.z, seq.z, drop=F]
) )
ws <- weights[disqual.more.tmp + 1L]
offset <- seqs0[[zeros]] + ws - size
disq.loc <- pool.lu[disqual.more.tmp + 1L] + offset
disqualifiable <- which(disqual.more.tmp < val)
valid.unique <- valid.unique - sum(!(ws - offset - 1L)[disqualifiable])
unique.default(disq.loc[disqualifiable])
} else integer()
# Find values to remove, first with the range of values disqualified by our
# pick
lo <- val.enc %/% zero.pad + 1L
hi <- disq.hi.enc %/% zero.pad + 1L
free[
seq.int(
from=pool.lu[[lo]],
to=max(pool.lu[[lo]], pool.lu[[hi - 1L]] + weights[[hi - 1L]] - 1L)
) ] <- FALSE
# Now remove any parent values
free[disq.loc.extra] <- FALSE
valid.unique <- valid.unique - 2 ^ pad # Not certain this works cleanly in this version
}
# Now convert to binary representation; note we assume ints are 32 bits
res.raw <- matrix(as.integer(intToBits(res[, 1L])), nrow=32L)[seq.rev, ]
substr(do.call(paste0, split(res.raw, row(res.raw))), 0L, size - res[, 2L])
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment