Skip to content

Instantly share code, notes, and snippets.

@brodieG
Last active August 29, 2015 14:23
Show Gist options
  • Save brodieG/6e3deafb5bca82a46bf9 to your computer and use it in GitHub Desktop.
Save brodieG/6e3deafb5bca82a46bf9 to your computer and use it in GitHub Desktop.
Random sampling from 01010 with some c implementations
sample01101 <- function(size, n, complete.only=FALSE) {
size <- as.integer(size)
n <- as.integer(n)
if(size > 25 || size < 3L) stop(
"Currently size min is 3 and max is 25, though should be possible to allow ",
"smaller and larger with some changes"
)
# Generate integer pool and weights
size0 <- size - 1L
pool.raw <- seq.int(2L ^ size) - 1L
pool.raw.len <- valid.unique <- length(pool.raw)
# weights are a function of how many trailing zeroes each number has, for
# example `1000` has three trailing zeroes and represnts `1000`, `100`,
# `10`, and `1`, so it should be weighed 4x
weights <- rep(1L, pool.raw.len)
for(i in seq.int(size0))
weights[seq.int(from=1L, to=pool.raw.len, by=2 ^ i)] <- i + 1L
# Create indices to map from the "weighted" vectors to the original
# vectors
pool.vals <- rep(pool.raw, weights)
pool.len <- length(pool.vals)
# For each repeated value, what count of trailing zeros does it correspond
# to (equivalent to: `unlist(lapply(weights, seq.int))`, but faster)
z <- integer(pool.len)
z[c(1L, cumsum(head(weights, -1L)) + 1L)] <- 1L
w <- cumsum(!z)
t <- cummax(z * w)
zeros.imp <- w - t + 1L
# Generate our encoded vectors by right padding with enough zeros and then
# adding as a value the number of zeros to the padded area
zero.pad <- as.integer(2L ^ ceiling(log2(size)))
vals.enc <- pool.vals * zero.pad + zeros.imp - 1L
# Results tracking
res <- matrix(0L, nrow=n, ncol=2L)
res[, 2L] <- size # leads to "" if not changed
max.allowed <- size0 # how padded a number to pick can be
# Pre compute frequently used sequences and number patterns
zero.mx <- as.integer(2 ^ (size - seq(size))) *
!lower.tri(matrix(ncol=size, nrow=size))
seqs <- lapply(1L:size, seq.int)
seqs0 <- lapply(seqs, `-`, 1L)
seq.rev <- rev(seq.int(size))
seq.rev0 <- seq.rev - 1L
ones <- rep(1L, size)
# Loop through the `n` requested samples
for(i in seq.int(n)) {
# Check for completeness, and remove values that would lead to incomplete
# pools.
if(complete.only) {
if(max.allowed) {
rem.pow <- which(n - i >= valid.unique - 2L ^ seqs[[max.allowed]])
for(j in rev(rem.pow)) {
to.rem <- which(
weights[vals.enc %/% zero.pad + 1L] - vals.enc %% zero.pad - 1L ==
max.allowed
)
vector_reduce(vals.enc, to.rem, to.rem)
max.allowed <- max.allowed - 1L
}
if(!max.allowed && n - i >= length(vals.enc))
stop(
"Logic Error: pool is not large enough to support complete samples"
) } }
if(!(pool.len.left <- length(vals.enc))) break
val.enc <- if(pool.len.left > 1L) sample(vals.enc, 1L) else vals.enc
# Figure out how many trailing zeros our number has (recall, this is
# encoded in the least significant bits of our number); note, zeros is a bit
# misleading, it means: "how many digits after initial digit are explicilty
# specied". The name `zeros` comes from numbers like `1` that would need to
# add zeros to be specified (e.g. `1000`, which has three zeros)
val <- val.enc %/% zero.pad
enc <- val.enc %% zero.pad
weight <- weights[[val + 1L]]
zeros <- size - weight + enc
pad <- size0 - zeros
res[i, ] <- c(val, pad)
# Based on number of zeros, we can figure out up to what value we need
# to disqualify
disq.hi.enc <- if(pad) {
val.hi <- as.integer((val + 2L ^ pad - 1L))
val.hi * zero.pad + weights[[val.hi + 1L]] - 1L
} else val.enc
# Incremental disqualification of smaller patterns by computing the
# decimal value from a sequantially truncated bit matrix
disq.val.enc <- if(zeros) {
seq.z <- seqs[[zeros]]
disqual.more.tmp <- as.integer(
ones[seq.z] %*% (
as.integer(intToBits(val)[seq.rev])[seq.z] *
zero.mx[seq.z, seq.z, drop=F]
) )
offset <- seqs0[[zeros]] + weights[disqual.more.tmp + 1L] - size
disq.ind <- disqual.more.tmp * zero.pad + offset
unique.default(disq.ind[which(disq.ind < val.enc)])
} else integer()
# Find values to remove
disq.val.ind <- bin_find(vals.enc, c(val.enc, disq.hi.enc), TRUE)
disq.val.extra <- bin_find(vals.enc, disq.val.enc, FALSE)
disq.val.extra <- disq.val.extra[which(disq.val.extra > 0L)]
lo <- disq.val.ind[[1L]]
if(!lo) if(val.enc < vals.enc[[1L]]) lo <- 1L else
stop("Logic Error, inconsistent indices")
hi <- disq.val.ind[[2L]]
if(!hi) if(disq.hi.enc > vals.enc[[pool.len.left]]) hi <- pool.len.left else
stop("Logic Error, inconsistent indices")
# Now remove values from our `val.enc` vector. This is done by reference
# with a C function. Note `disq.val.extra` should be always have smaller
# values than `lo` or `hi`.
starts <- c(disq.val.extra, lo)
ends <- c(disq.val.extra, hi)
vector_reduce(vals.enc, start=starts, end=ends) # modifies `val.enc.` by reference
valid.unique <- valid.unique - length(disq.val.extra) - 2 ^ pad
}
# Now convert to binary representation; note we assume ints are 32 bits
res.raw <- matrix(as.integer(intToBits(res[, 1L])), nrow=32L)[seq.rev, ]
substr(do.call(paste0, split(res.raw, row(res.raw))), 0L, size - res[, 2L])
}
#' Use binary search to find indices in a vector by matching values
bin_find <- inline::cfunction(
signature(vec="integer", values="integer", ifnf="logical"), "
if(
TYPEOF(vec) != INTSXP || TYPEOF(values) != INTSXP || TYPEOF(ifnf) != LGLSXP
)
error(\"Invalid inputs\");
SEXP res;
int *vals = INTEGER(values), *v = INTEGER(vec), i, vallen, min, max, mid,
vlen, *resvals;
vallen = LENGTH(values);
vlen = LENGTH(vec);
PROTECT(res = allocVector(INTSXP, LENGTH(values)));
resvals = INTEGER(res);
// can probably optimize this further by using info from each iterative binary
// search to reduce subsequent search, or using non early termination algo
// adapted from https://en.wikipedia.org/wiki/Binary_search_algorithm
for(i = 0; i < vallen; i++) {
min = 0;
max = vlen - 1;
int foundval = 0;
while (max >= min) {
mid = (max - min) / 2 + min;
//if(k++ > 20) error(\"infinite loop\");
//Rprintf(\"mid %d min %d max %d val %d vmax %d vmin %d\\n\", mid, min, max, vals[i], v[max], v[min]);
if(v[mid] == vals[i]) {
resvals[i] = mid + 1; // convert to base 1 index
foundval = 1;
break;
}
else if (v[mid] < vals[i])
min = mid + 1;
else
max = mid - 1;
}
if(foundval) continue;
if(asLogical(ifnf)) { // Not found
resvals[i] = min;
} else resvals[i] = 0;
}
UNPROTECT(1);
return(res);
"
)
#' Reduce a vector by reference by removing values between corresponding entries
#' in starts and ends
vector_reduce <- inline::cfunction(
signature(vec="integer", starts="integer", ends="integer"),
"
if(
TYPEOF(starts) != INTSXP || TYPEOF(ends) != INTSXP || TYPEOF(vec) != INTSXP
)
error(\"Invalid inputs\");
int remlen = LENGTH(starts), veclen = LENGTH(vec), l, offset=0, i, remind=0;
int *vecvals = INTEGER(vec), *startvals=INTEGER(starts),
*endvals=INTEGER(ends);
if(!remlen) return ScalarInteger(0);
if(LENGTH(ends) != remlen) error(\"`starts` and `ends` must be same length\");
for(i = 0; i < remlen; i++) {
int end_max=0;
if(
startvals[i] > endvals[i] || endvals[i] < startvals[i] ||
(i > 0 && startvals[i] <= startvals[i - 1]) ||
startvals[i] < 1
)
error(
\"Inconsistent starts and/or ends %d %d at index %d\",
startvals[i], endvals[i], i+1
);
if(i > 0 && startvals[i] < end_max)
error(\"Nested Intervals\");
if(endvals[i] > end_max) end_max = endvals[i];
}
// Should figure out which end to do this from, as in some case will be much
// faster to shift back instead of forward, but won't bother for now
l = startvals[remind] - 1; // remember these are 1 indexed
offset = endvals[remind] - l;
remind++;
while(1) {
if(remind < remlen)
while(l + offset >= startvals[remind] - 1 && remind < remlen) {
offset += endvals[remind] - startvals[remind] + 1L;
remind++;
}
if(l + offset >= veclen) break;
vecvals[l] = vecvals[l + offset];
l++;
}
SETLENGTH(vec, veclen - offset);
return ScalarInteger(veclen - offset);
"
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment