Last active
August 29, 2015 14:23
-
-
Save brodieG/160405262b53b0bb82bf to your computer and use it in GitHub Desktop.
Benchmarks for Random sampling from 01010 etc.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
make_pool <- function(size) | |
sort( | |
unlist( | |
lapply( | |
seq_len(size), | |
function(x) do.call(paste0, expand.grid(rep(list(c('0', '1')), x))) | |
) ) ) | |
system.time(pool4 <- make_pool(4)) | |
system.time(pool8 <- make_pool(8)) | |
system.time(pool16 <- make_pool(16)) | |
pools <- list("4"=pool4, "8"=pool8, "16"=pool16) | |
# = All Funs with a `size`, `n` interface ====================================== | |
#------------------------------------------------------------------------------- | |
jbaum <- function(size, n) { | |
pool <- pools[[as.character(size)]] | |
chosen <- sample(pool, n) | |
while(any(rowSums(outer(paste0('^', chosen), chosen, Vectorize(grepl))) > 1)) { | |
prefixes <- rowSums(outer(paste0('^', chosen), chosen, Vectorize(grepl))) > 1 | |
pool <- pool[rowSums(Vectorize(grepl, 'pattern')( | |
paste0('^', chosen[!prefixes]), pool)) == 0] | |
chosen <- c(chosen[!prefixes], sample(pool, sum(prefixes))) | |
} | |
chosen | |
} | |
#------------------------------------------------------------------------------- | |
library(data.table) | |
get_01str <- function(id,max_len){ | |
cuts = cumsum(2^(1:max_len-1)) | |
g = findInterval(id,cuts) | |
gid = id-cuts[g]+1 | |
data.table(g,gid)[,s:= | |
do.call(paste,c(list(sep=""),lapply( | |
seq(g[1]), | |
function(x) (gid-1) %/% 2^(x-1) %% 2 | |
))) | |
,by=g]$s | |
} | |
get_nixstrs <- function(g,gid,max_len){ | |
cuts = cumsum(2^(1:max_len-1)) | |
gids_child = { | |
x = gid%%2^sequence(g-1) | |
ifelse(x,x,2^sequence(g-1)) | |
} | |
ids_child = gids_child+cuts[sequence(g-1)]-1 | |
ids_parent = if (g==max_len) gid+cuts[g]-1 else { | |
gids_par = vector(mode="list",max_len) | |
gids_par[[g]] = gid | |
for (gg in seq(g,max_len-1)) | |
gids_par[[gg+1]] = c(gids_par[[gg]],gids_par[[gg]]+2^gg) | |
unlist(mapply(`+`,gids_par,cuts-1)) | |
} | |
c(ids_child,ids_parent) | |
} | |
drawem <- function(n,max_len){ | |
cuts = cumsum(2^(1:max_len-1)) | |
inds_by_g = mapply(seq,cuts,cuts*2) | |
oklens = (1:max_len)[ n <= 2^max_len*(1-2^(-(1:max_len)))+1 ] | |
okinds = unlist(inds_by_g[oklens]) | |
mysamp = rep(0,n) | |
for (i in 1:n){ | |
id = if (length(okinds)==1) okinds else sample(okinds,1) | |
g = findInterval(id,cuts) | |
gid = id-cuts[g]+1 | |
nixed = get_nixstrs(g,gid,max_len) | |
# print(id); print(okinds); print(nixed) | |
mysamp[i] = id | |
okinds = setdiff(okinds,nixed) | |
if (!length(okinds)) break | |
} | |
res <- rep("",n) | |
res[seq.int(i)] <- get_01str(mysamp[seq.int(i)],max_len) | |
res | |
} | |
frank <- function(size, n) drawem(n, size) | |
#------------------------------------------------------------------------------- | |
library(stringr) | |
tree_sample <- function(samples,pool, max_len) { | |
results <- vector("integer",samples) | |
# Will be used on a regular basis, compute it in advance | |
PoolLen <- str_length(pool) | |
# Make a mask vector based on the length of each entry of the pool | |
masks <- strtoi(str_pad(str_pad("1",PoolLen,"right","1"),max_len,"right","0"),base=2) | |
# Make an integer vector from "0" right padded orignal: for max_len=4 and pool entry "1" we get "1000" => 8 | |
# This will allow to find this entry as parent of 10 and 11 which become "1000" and "1100", as integer 8 and 12 respectively | |
# once bitwise "anded" with the repective mask "1000" the first bit is striclty the same, so it's a parent. | |
integerPool <- strtoi(str_pad(pool,max_len,"right","0"),base=2) | |
# Create a vector to filter the available value to sample | |
ok <- rep(TRUE,length(pool)) | |
#Precompute the result of the bitwise and betwwen our integer pool and the masks | |
MaskedPool <- bitwAnd(integerPool,masks) | |
while(samples) { | |
samp <- sample(pool[ok],1) # Get a sample | |
results[samples] <- samp # Store it as result | |
ok[pool == samp] <- FALSE # Remove it from available entries | |
vsamp <- strtoi(str_pad(samp,max_len,"right","0"),base=2) # Get the integer value of the "0" right padded sample | |
mlen <- str_length(samp) # Get sample len | |
#Creation of unitary mask to remove childs of sample | |
mask <- strtoi(paste0(rep(1:0,c(mlen,max_len-mlen)),collapse=""),base=2) | |
# Get the result of bitwise And between the integerPool and the sample mask | |
FilterVec <- bitwAnd(integerPool,mask) | |
# Get the bitwise and result of the sample and it's mask | |
Childm <- bitwAnd(vsamp,mask) | |
ok[FilterVec == Childm] <- FALSE # Remove from available entries the childs of the sample | |
ok[MaskedPool == bitwAnd(vsamp,masks)] <- FALSE # compare the sample with all the masks to remove parents matching | |
samples <- samples -1 | |
if(!sum(ok)) break | |
} | |
results | |
} | |
tensibai <- function(size, n) { | |
tree_sample(n,pools[[as.character(size)]], size) | |
} | |
#------------------------------------------------------------------------------- | |
library(lpSolve) | |
sample.lp <- function(pool, max_len) { | |
pool <- sort(pool) | |
pml <- max(nchar(pool)) | |
runs <- c(rev(cumsum(2^(seq(pml-1)))), 0) | |
banned.from <- rep(seq(pool), runs[nchar(pool)]) | |
banned.to <- banned.from + unlist(lapply(runs[nchar(pool)], seq_len)) | |
banned.constr <- matrix(0, nrow=length(banned.from), ncol=length(pool)) | |
banned.constr[cbind(seq(banned.from), banned.from)] <- 1 | |
banned.constr[cbind(seq(banned.to), banned.to)] <- 1 | |
mod <- lp(direction="max", | |
objective.in=runif(length(pool)), | |
const.mat=rbind(banned.constr, rep(1, length(pool))), | |
const.dir=c(rep("<=", length(banned.from)), "=="), | |
const.rhs=c(rep(1, length(banned.from)), max_len), | |
all.bin=TRUE) | |
pool[which(mod$solution == 1)] | |
} | |
josilber <- function(size, n) sample.lp(pools[[as.character(size)]], n) | |
# = Run benchmarks =================================================== | |
library(functional) | |
stop("Remember to load the brodie funs") | |
funs <- c( | |
jbaum=jbaum, josilber=josilber, frank=frank, tensibai=tensibai, | |
brodie.b=sample0110b, | |
brodie=Curry(sample0110, complete.only=TRUE), | |
brodie.C=Curry(sample01101, complete.only=TRUE), | |
brodie.str=function(size, n) sample01(pools[[as.character(size)]], n) | |
) | |
sizes <- c(4, 8, 16) | |
ns <- c(10, 50, 100, 256, 1000) | |
res <- expand.grid(fun=names(funs), size=sizes, n=ns) | |
res <- subset( | |
res, ( | |
fun %in% c("frank", "tensibai", "brodie.b", "brodie", "brodie.str", "brodie.cpl", "brodie.C", "brodie.C.cpl") | | |
size <= 8 | |
) & ! ((fun == "josilber" & n > 10) | (fun == "jbaum" & n > 100)) | |
) | |
res$val <- NA_integer_ | |
library(microbenchmark) | |
for(i in 1:nrow(res)) { | |
cat(sprintf("Running %s size: %d n: %d\n", res[i, 1], res[i, 2], res[i, 3])) | |
bm <- try( | |
microbenchmark(funs[[res[i, 1]]](res[i, 2], res[i, 3]), times=3)$time[[2]] | |
) | |
val <- if(inherits(bm, "try-error")) NA else bm | |
res[i, 4] <- val | |
} | |
dcast(transform(res, val=format(round(val/1e6), big.mark=",")), size + n ~ fun, value.var="val") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment