Last active
August 29, 2015 14:23
-
-
Save brodieG/6e3deafb5bca82a46bf9 to your computer and use it in GitHub Desktop.
Random sampling from 01010 with some c implementations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
sample01101 <- function(size, n, complete.only=FALSE) { | |
size <- as.integer(size) | |
n <- as.integer(n) | |
if(size > 25 || size < 3L) stop( | |
"Currently size min is 3 and max is 25, though should be possible to allow ", | |
"smaller and larger with some changes" | |
) | |
# Generate integer pool and weights | |
size0 <- size - 1L | |
pool.raw <- seq.int(2L ^ size) - 1L | |
pool.raw.len <- valid.unique <- length(pool.raw) | |
# weights are a function of how many trailing zeroes each number has, for | |
# example `1000` has three trailing zeroes and represnts `1000`, `100`, | |
# `10`, and `1`, so it should be weighed 4x | |
weights <- rep(1L, pool.raw.len) | |
for(i in seq.int(size0)) | |
weights[seq.int(from=1L, to=pool.raw.len, by=2 ^ i)] <- i + 1L | |
# Create indices to map from the "weighted" vectors to the original | |
# vectors | |
pool.vals <- rep(pool.raw, weights) | |
pool.len <- length(pool.vals) | |
# For each repeated value, what count of trailing zeros does it correspond | |
# to (equivalent to: `unlist(lapply(weights, seq.int))`, but faster) | |
z <- integer(pool.len) | |
z[c(1L, cumsum(head(weights, -1L)) + 1L)] <- 1L | |
w <- cumsum(!z) | |
t <- cummax(z * w) | |
zeros.imp <- w - t + 1L | |
# Generate our encoded vectors by right padding with enough zeros and then | |
# adding as a value the number of zeros to the padded area | |
zero.pad <- as.integer(2L ^ ceiling(log2(size))) | |
vals.enc <- pool.vals * zero.pad + zeros.imp - 1L | |
# Results tracking | |
res <- matrix(0L, nrow=n, ncol=2L) | |
res[, 2L] <- size # leads to "" if not changed | |
max.allowed <- size0 # how padded a number to pick can be | |
# Pre compute frequently used sequences and number patterns | |
zero.mx <- as.integer(2 ^ (size - seq(size))) * | |
!lower.tri(matrix(ncol=size, nrow=size)) | |
seqs <- lapply(1L:size, seq.int) | |
seqs0 <- lapply(seqs, `-`, 1L) | |
seq.rev <- rev(seq.int(size)) | |
seq.rev0 <- seq.rev - 1L | |
ones <- rep(1L, size) | |
# Loop through the `n` requested samples | |
for(i in seq.int(n)) { | |
# Check for completeness, and remove values that would lead to incomplete | |
# pools. | |
if(complete.only) { | |
if(max.allowed) { | |
rem.pow <- which(n - i >= valid.unique - 2L ^ seqs[[max.allowed]]) | |
for(j in rev(rem.pow)) { | |
to.rem <- which( | |
weights[vals.enc %/% zero.pad + 1L] - vals.enc %% zero.pad - 1L == | |
max.allowed | |
) | |
vector_reduce(vals.enc, to.rem, to.rem) | |
max.allowed <- max.allowed - 1L | |
} | |
if(!max.allowed && n - i >= length(vals.enc)) | |
stop( | |
"Logic Error: pool is not large enough to support complete samples" | |
) } } | |
if(!(pool.len.left <- length(vals.enc))) break | |
val.enc <- if(pool.len.left > 1L) sample(vals.enc, 1L) else vals.enc | |
# Figure out how many trailing zeros our number has (recall, this is | |
# encoded in the least significant bits of our number); note, zeros is a bit | |
# misleading, it means: "how many digits after initial digit are explicilty | |
# specied". The name `zeros` comes from numbers like `1` that would need to | |
# add zeros to be specified (e.g. `1000`, which has three zeros) | |
val <- val.enc %/% zero.pad | |
enc <- val.enc %% zero.pad | |
weight <- weights[[val + 1L]] | |
zeros <- size - weight + enc | |
pad <- size0 - zeros | |
res[i, ] <- c(val, pad) | |
# Based on number of zeros, we can figure out up to what value we need | |
# to disqualify | |
disq.hi.enc <- if(pad) { | |
val.hi <- as.integer((val + 2L ^ pad - 1L)) | |
val.hi * zero.pad + weights[[val.hi + 1L]] - 1L | |
} else val.enc | |
# Incremental disqualification of smaller patterns by computing the | |
# decimal value from a sequantially truncated bit matrix | |
disq.val.enc <- if(zeros) { | |
seq.z <- seqs[[zeros]] | |
disqual.more.tmp <- as.integer( | |
ones[seq.z] %*% ( | |
as.integer(intToBits(val)[seq.rev])[seq.z] * | |
zero.mx[seq.z, seq.z, drop=F] | |
) ) | |
offset <- seqs0[[zeros]] + weights[disqual.more.tmp + 1L] - size | |
disq.ind <- disqual.more.tmp * zero.pad + offset | |
unique.default(disq.ind[which(disq.ind < val.enc)]) | |
} else integer() | |
# Find values to remove | |
disq.val.ind <- bin_find(vals.enc, c(val.enc, disq.hi.enc), TRUE) | |
disq.val.extra <- bin_find(vals.enc, disq.val.enc, FALSE) | |
disq.val.extra <- disq.val.extra[which(disq.val.extra > 0L)] | |
lo <- disq.val.ind[[1L]] | |
if(!lo) if(val.enc < vals.enc[[1L]]) lo <- 1L else | |
stop("Logic Error, inconsistent indices") | |
hi <- disq.val.ind[[2L]] | |
if(!hi) if(disq.hi.enc > vals.enc[[pool.len.left]]) hi <- pool.len.left else | |
stop("Logic Error, inconsistent indices") | |
# Now remove values from our `val.enc` vector. This is done by reference | |
# with a C function. Note `disq.val.extra` should be always have smaller | |
# values than `lo` or `hi`. | |
starts <- c(disq.val.extra, lo) | |
ends <- c(disq.val.extra, hi) | |
vector_reduce(vals.enc, start=starts, end=ends) # modifies `val.enc.` by reference | |
valid.unique <- valid.unique - length(disq.val.extra) - 2 ^ pad | |
} | |
# Now convert to binary representation; note we assume ints are 32 bits | |
res.raw <- matrix(as.integer(intToBits(res[, 1L])), nrow=32L)[seq.rev, ] | |
substr(do.call(paste0, split(res.raw, row(res.raw))), 0L, size - res[, 2L]) | |
} | |
#' Use binary search to find indices in a vector by matching values | |
bin_find <- inline::cfunction( | |
signature(vec="integer", values="integer", ifnf="logical"), " | |
if( | |
TYPEOF(vec) != INTSXP || TYPEOF(values) != INTSXP || TYPEOF(ifnf) != LGLSXP | |
) | |
error(\"Invalid inputs\"); | |
SEXP res; | |
int *vals = INTEGER(values), *v = INTEGER(vec), i, vallen, min, max, mid, | |
vlen, *resvals; | |
vallen = LENGTH(values); | |
vlen = LENGTH(vec); | |
PROTECT(res = allocVector(INTSXP, LENGTH(values))); | |
resvals = INTEGER(res); | |
// can probably optimize this further by using info from each iterative binary | |
// search to reduce subsequent search, or using non early termination algo | |
// adapted from https://en.wikipedia.org/wiki/Binary_search_algorithm | |
for(i = 0; i < vallen; i++) { | |
min = 0; | |
max = vlen - 1; | |
int foundval = 0; | |
while (max >= min) { | |
mid = (max - min) / 2 + min; | |
//if(k++ > 20) error(\"infinite loop\"); | |
//Rprintf(\"mid %d min %d max %d val %d vmax %d vmin %d\\n\", mid, min, max, vals[i], v[max], v[min]); | |
if(v[mid] == vals[i]) { | |
resvals[i] = mid + 1; // convert to base 1 index | |
foundval = 1; | |
break; | |
} | |
else if (v[mid] < vals[i]) | |
min = mid + 1; | |
else | |
max = mid - 1; | |
} | |
if(foundval) continue; | |
if(asLogical(ifnf)) { // Not found | |
resvals[i] = min; | |
} else resvals[i] = 0; | |
} | |
UNPROTECT(1); | |
return(res); | |
" | |
) | |
#' Reduce a vector by reference by removing values between corresponding entries | |
#' in starts and ends | |
vector_reduce <- inline::cfunction( | |
signature(vec="integer", starts="integer", ends="integer"), | |
" | |
if( | |
TYPEOF(starts) != INTSXP || TYPEOF(ends) != INTSXP || TYPEOF(vec) != INTSXP | |
) | |
error(\"Invalid inputs\"); | |
int remlen = LENGTH(starts), veclen = LENGTH(vec), l, offset=0, i, remind=0; | |
int *vecvals = INTEGER(vec), *startvals=INTEGER(starts), | |
*endvals=INTEGER(ends); | |
if(!remlen) return ScalarInteger(0); | |
if(LENGTH(ends) != remlen) error(\"`starts` and `ends` must be same length\"); | |
for(i = 0; i < remlen; i++) { | |
int end_max=0; | |
if( | |
startvals[i] > endvals[i] || endvals[i] < startvals[i] || | |
(i > 0 && startvals[i] <= startvals[i - 1]) || | |
startvals[i] < 1 | |
) | |
error( | |
\"Inconsistent starts and/or ends %d %d at index %d\", | |
startvals[i], endvals[i], i+1 | |
); | |
if(i > 0 && startvals[i] < end_max) | |
error(\"Nested Intervals\"); | |
if(endvals[i] > end_max) end_max = endvals[i]; | |
} | |
// Should figure out which end to do this from, as in some case will be much | |
// faster to shift back instead of forward, but won't bother for now | |
l = startvals[remind] - 1; // remember these are 1 indexed | |
offset = endvals[remind] - l; | |
remind++; | |
while(1) { | |
if(remind < remlen) | |
while(l + offset >= startvals[remind] - 1 && remind < remlen) { | |
offset += endvals[remind] - startvals[remind] + 1L; | |
remind++; | |
} | |
if(l + offset >= veclen) break; | |
vecvals[l] = vecvals[l + offset]; | |
l++; | |
} | |
SETLENGTH(vec, veclen - offset); | |
return ScalarInteger(veclen - offset); | |
" | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment