brodieG/bench0101.R

## bench0101.R

make_pool <- function(size)
  sort(
    unlist(
      lapply(
        seq_len(size),
        function(x) do.call(paste0, expand.grid(rep(list(c('0', '1')), x)))
  ) ) )

system.time(pool4 <- make_pool(4))
system.time(pool8 <- make_pool(8))
system.time(pool16 <- make_pool(16))

pools <- list("4"=pool4, "8"=pool8, "16"=pool16)

# = All Funs with a `size`, `n` interface ======================================

#-------------------------------------------------------------------------------

jbaum <- function(size, n) {
  pool <- pools[[as.character(size)]]
  chosen <- sample(pool, n)
  while(any(rowSums(outer(paste0('^', chosen), chosen, Vectorize(grepl))) > 1)) {
    prefixes <- rowSums(outer(paste0('^', chosen), chosen, Vectorize(grepl))) > 1
    pool <- pool[rowSums(Vectorize(grepl, 'pattern')(
      paste0('^', chosen[!prefixes]), pool)) == 0]
    chosen <- c(chosen[!prefixes], sample(pool, sum(prefixes)))
  }
  chosen
}
#-------------------------------------------------------------------------------

library(data.table)
get_01str <- function(id,max_len){
    cuts = cumsum(2^(1:max_len-1))
    g    = findInterval(id,cuts)
    gid  = id-cuts[g]+1

    data.table(g,gid)[,s:=
      do.call(paste,c(list(sep=""),lapply(
        seq(g[1]),
        function(x) (gid-1) %/% 2^(x-1) %% 2
      )))
    ,by=g]$s
}
get_nixstrs <- function(g,gid,max_len){

    cuts         = cumsum(2^(1:max_len-1))
    gids_child   = {
      x = gid%%2^sequence(g-1)
      ifelse(x,x,2^sequence(g-1))
    }
    ids_child    = gids_child+cuts[sequence(g-1)]-1

    ids_parent   = if (g==max_len) gid+cuts[g]-1 else {

      gids_par       = vector(mode="list",max_len)
      gids_par[[g]]  = gid
      for (gg in seq(g,max_len-1))
        gids_par[[gg+1]] = c(gids_par[[gg]],gids_par[[gg]]+2^gg)

      unlist(mapply(`+`,gids_par,cuts-1))
    }

    c(ids_child,ids_parent)
}
drawem <- function(n,max_len){
    cuts        = cumsum(2^(1:max_len-1))
    inds_by_g   = mapply(seq,cuts,cuts*2)

    oklens = (1:max_len)[ n <= 2^max_len*(1-2^(-(1:max_len)))+1 ]
    okinds = unlist(inds_by_g[oklens])

    mysamp = rep(0,n)
    for (i in 1:n){

        id        = if (length(okinds)==1) okinds else sample(okinds,1)
        g         = findInterval(id,cuts)
        gid       = id-cuts[g]+1
        nixed     = get_nixstrs(g,gid,max_len)

        # print(id); print(okinds); print(nixed)

        mysamp[i] = id
        okinds    = setdiff(okinds,nixed)
        if (!length(okinds)) break
    }

    res <- rep("",n)
    res[seq.int(i)] <- get_01str(mysamp[seq.int(i)],max_len)
    res
}
frank <- function(size, n) drawem(n, size)

#-------------------------------------------------------------------------------

library(stringr)
tree_sample <- function(samples,pool, max_len) {
  results <- vector("integer",samples)
  # Will be used on a regular basis, compute it in advance
  PoolLen <- str_length(pool)
  # Make a mask vector based on the length of each entry of the pool
  masks <- strtoi(str_pad(str_pad("1",PoolLen,"right","1"),max_len,"right","0"),base=2)

  # Make an integer vector from "0" right padded orignal: for max_len=4 and pool entry "1" we get "1000" => 8
  # This will allow to find this entry as parent of 10 and 11 which become "1000" and "1100", as integer 8 and 12 respectively
  # once bitwise "anded" with the repective mask "1000" the first bit is striclty the same, so it's a parent.
  integerPool <- strtoi(str_pad(pool,max_len,"right","0"),base=2)

  # Create a vector to filter the available value to sample
  ok <- rep(TRUE,length(pool))

  #Precompute the result of the bitwise and betwwen our integer pool and the masks
  MaskedPool <- bitwAnd(integerPool,masks)

  while(samples) {
    samp <- sample(pool[ok],1) # Get a sample
    results[samples] <- samp # Store it as result
    ok[pool == samp] <- FALSE # Remove it from available entries

    vsamp <- strtoi(str_pad(samp,max_len,"right","0"),base=2) # Get the integer value of the "0" right padded sample
    mlen <- str_length(samp) # Get sample len

    #Creation of unitary mask to remove childs of sample
    mask <- strtoi(paste0(rep(1:0,c(mlen,max_len-mlen)),collapse=""),base=2)

    # Get the result of bitwise And between the integerPool and the sample mask
    FilterVec <- bitwAnd(integerPool,mask)

    # Get the bitwise and result of the sample and it's mask
    Childm <- bitwAnd(vsamp,mask)

    ok[FilterVec == Childm] <- FALSE  # Remove from available entries the childs of the sample
    ok[MaskedPool == bitwAnd(vsamp,masks)] <- FALSE # compare the sample with all the masks to remove parents matching

    samples <- samples -1
    if(!sum(ok)) break
  }
  results
}
tensibai <- function(size, n) {
  tree_sample(n,pools[[as.character(size)]], size)
}

#-------------------------------------------------------------------------------

library(lpSolve)
sample.lp <- function(pool, max_len) {
  pool <- sort(pool)
  pml <- max(nchar(pool))
  runs <- c(rev(cumsum(2^(seq(pml-1)))), 0)
  banned.from <- rep(seq(pool), runs[nchar(pool)])
  banned.to <- banned.from + unlist(lapply(runs[nchar(pool)], seq_len))
  banned.constr <- matrix(0, nrow=length(banned.from), ncol=length(pool))
  banned.constr[cbind(seq(banned.from), banned.from)] <- 1
  banned.constr[cbind(seq(banned.to), banned.to)] <- 1
  mod <- lp(direction="max",
            objective.in=runif(length(pool)),
            const.mat=rbind(banned.constr, rep(1, length(pool))),
            const.dir=c(rep("<=", length(banned.from)), "=="),
            const.rhs=c(rep(1, length(banned.from)), max_len),
            all.bin=TRUE)
  pool[which(mod$solution == 1)]
}
josilber <- function(size, n) sample.lp(pools[[as.character(size)]], n)

# = Run benchmarks ===================================================

library(functional)
stop("Remember to load the brodie funs")
funs <- c(
  jbaum=jbaum, josilber=josilber, frank=frank, tensibai=tensibai,
  brodie.b=sample0110b,
  brodie=Curry(sample0110, complete.only=TRUE),
  brodie.C=Curry(sample01101, complete.only=TRUE),
  brodie.str=function(size, n) sample01(pools[[as.character(size)]], n)
)
sizes <- c(4, 8, 16)
ns <- c(10, 50, 100, 256, 1000)

res <- expand.grid(fun=names(funs), size=sizes, n=ns)
res <- subset(
  res, (
    fun %in% c("frank", "tensibai", "brodie.b", "brodie", "brodie.str", "brodie.cpl", "brodie.C", "brodie.C.cpl") |
    size <= 8
  ) & ! ((fun == "josilber" & n > 10) | (fun == "jbaum" & n > 100))
)
res$val <- NA_integer_
library(microbenchmark)

for(i in 1:nrow(res)) {
 cat(sprintf("Running %s size: %d n: %d\n", res[i, 1], res[i, 2], res[i, 3]))
  bm <- try(
    microbenchmark(funs[[res[i, 1]]](res[i, 2], res[i, 3]), times=3)$time[[2]]
  )
  val <- if(inherits(bm, "try-error")) NA else bm
  res[i, 4] <- val
}

dcast(transform(res, val=format(round(val/1e6), big.mark=",")), size + n ~ fun, value.var="val")

	make_pool <- function(size)
	sort(
	unlist(
	lapply(
	seq_len(size),
	function(x) do.call(paste0, expand.grid(rep(list(c('0', '1')), x)))
	) ) )

	system.time(pool4 <- make_pool(4))
	system.time(pool8 <- make_pool(8))
	system.time(pool16 <- make_pool(16))

	pools <- list("4"=pool4, "8"=pool8, "16"=pool16)

	# = All Funs with a `size`, `n` interface ======================================

	#-------------------------------------------------------------------------------

	jbaum <- function(size, n) {
	pool <- pools[[as.character(size)]]
	chosen <- sample(pool, n)
	while(any(rowSums(outer(paste0('^', chosen), chosen, Vectorize(grepl))) > 1)) {
	prefixes <- rowSums(outer(paste0('^', chosen), chosen, Vectorize(grepl))) > 1
	pool <- pool[rowSums(Vectorize(grepl, 'pattern')(
	paste0('^', chosen[!prefixes]), pool)) == 0]
	chosen <- c(chosen[!prefixes], sample(pool, sum(prefixes)))
	}
	chosen
	}
	#-------------------------------------------------------------------------------

	library(data.table)
	get_01str <- function(id,max_len){
	cuts = cumsum(2^(1:max_len-1))
	g = findInterval(id,cuts)
	gid = id-cuts[g]+1

	data.table(g,gid)[,s:=
	do.call(paste,c(list(sep=""),lapply(
	seq(g[1]),
	function(x) (gid-1) %/% 2^(x-1) %% 2
	)))
	,by=g]$s
	}
	get_nixstrs <- function(g,gid,max_len){

	cuts = cumsum(2^(1:max_len-1))
	gids_child = {
	x = gid%%2^sequence(g-1)
	ifelse(x,x,2^sequence(g-1))
	}
	ids_child = gids_child+cuts[sequence(g-1)]-1

	ids_parent = if (g==max_len) gid+cuts[g]-1 else {

	gids_par = vector(mode="list",max_len)
	gids_par[[g]] = gid
	for (gg in seq(g,max_len-1))
	gids_par[[gg+1]] = c(gids_par[[gg]],gids_par[[gg]]+2^gg)

	unlist(mapply(`+`,gids_par,cuts-1))
	}

	c(ids_child,ids_parent)
	}
	drawem <- function(n,max_len){
	cuts = cumsum(2^(1:max_len-1))
	inds_by_g = mapply(seq,cuts,cuts*2)

	oklens = (1:max_len)[ n <= 2^max_len*(1-2^(-(1:max_len)))+1 ]
	okinds = unlist(inds_by_g[oklens])

	mysamp = rep(0,n)
	for (i in 1:n){

	id = if (length(okinds)==1) okinds else sample(okinds,1)
	g = findInterval(id,cuts)
	gid = id-cuts[g]+1
	nixed = get_nixstrs(g,gid,max_len)

	# print(id); print(okinds); print(nixed)

	mysamp[i] = id
	okinds = setdiff(okinds,nixed)
	if (!length(okinds)) break
	}

	res <- rep("",n)
	res[seq.int(i)] <- get_01str(mysamp[seq.int(i)],max_len)
	res
	}
	frank <- function(size, n) drawem(n, size)

	#-------------------------------------------------------------------------------

	library(stringr)
	tree_sample <- function(samples,pool, max_len) {
	results <- vector("integer",samples)
	# Will be used on a regular basis, compute it in advance
	PoolLen <- str_length(pool)
	# Make a mask vector based on the length of each entry of the pool
	masks <- strtoi(str_pad(str_pad("1",PoolLen,"right","1"),max_len,"right","0"),base=2)

	# Make an integer vector from "0" right padded orignal: for max_len=4 and pool entry "1" we get "1000" => 8
	# This will allow to find this entry as parent of 10 and 11 which become "1000" and "1100", as integer 8 and 12 respectively
	# once bitwise "anded" with the repective mask "1000" the first bit is striclty the same, so it's a parent.
	integerPool <- strtoi(str_pad(pool,max_len,"right","0"),base=2)

	# Create a vector to filter the available value to sample
	ok <- rep(TRUE,length(pool))

	#Precompute the result of the bitwise and betwwen our integer pool and the masks
	MaskedPool <- bitwAnd(integerPool,masks)

	while(samples) {
	samp <- sample(pool[ok],1) # Get a sample
	results[samples] <- samp # Store it as result
	ok[pool == samp] <- FALSE # Remove it from available entries

	vsamp <- strtoi(str_pad(samp,max_len,"right","0"),base=2) # Get the integer value of the "0" right padded sample
	mlen <- str_length(samp) # Get sample len

	#Creation of unitary mask to remove childs of sample
	mask <- strtoi(paste0(rep(1:0,c(mlen,max_len-mlen)),collapse=""),base=2)

	# Get the result of bitwise And between the integerPool and the sample mask
	FilterVec <- bitwAnd(integerPool,mask)

	# Get the bitwise and result of the sample and it's mask
	Childm <- bitwAnd(vsamp,mask)

	ok[FilterVec == Childm] <- FALSE # Remove from available entries the childs of the sample
	ok[MaskedPool == bitwAnd(vsamp,masks)] <- FALSE # compare the sample with all the masks to remove parents matching

	samples <- samples -1
	if(!sum(ok)) break
	}
	results
	}
	tensibai <- function(size, n) {
	tree_sample(n,pools[[as.character(size)]], size)
	}

	#-------------------------------------------------------------------------------

	library(lpSolve)
	sample.lp <- function(pool, max_len) {
	pool <- sort(pool)
	pml <- max(nchar(pool))
	runs <- c(rev(cumsum(2^(seq(pml-1)))), 0)
	banned.from <- rep(seq(pool), runs[nchar(pool)])
	banned.to <- banned.from + unlist(lapply(runs[nchar(pool)], seq_len))
	banned.constr <- matrix(0, nrow=length(banned.from), ncol=length(pool))
	banned.constr[cbind(seq(banned.from), banned.from)] <- 1
	banned.constr[cbind(seq(banned.to), banned.to)] <- 1
	mod <- lp(direction="max",
	objective.in=runif(length(pool)),
	const.mat=rbind(banned.constr, rep(1, length(pool))),
	const.dir=c(rep("<=", length(banned.from)), "=="),
	const.rhs=c(rep(1, length(banned.from)), max_len),
	all.bin=TRUE)
	pool[which(mod$solution == 1)]
	}
	josilber <- function(size, n) sample.lp(pools[[as.character(size)]], n)

	# = Run benchmarks ===================================================

	library(functional)
	stop("Remember to load the brodie funs")
	funs <- c(
	jbaum=jbaum, josilber=josilber, frank=frank, tensibai=tensibai,
	brodie.b=sample0110b,
	brodie=Curry(sample0110, complete.only=TRUE),
	brodie.C=Curry(sample01101, complete.only=TRUE),
	brodie.str=function(size, n) sample01(pools[[as.character(size)]], n)
	)
	sizes <- c(4, 8, 16)
	ns <- c(10, 50, 100, 256, 1000)

	res <- expand.grid(fun=names(funs), size=sizes, n=ns)
	res <- subset(
	res, (
	fun %in% c("frank", "tensibai", "brodie.b", "brodie", "brodie.str", "brodie.cpl", "brodie.C", "brodie.C.cpl") \|
	size <= 8
	) & ! ((fun == "josilber" & n > 10) \| (fun == "jbaum" & n > 100))
	)
	res$val <- NA_integer_
	library(microbenchmark)

	for(i in 1:nrow(res)) {
	cat(sprintf("Running %s size: %d n: %d\n", res[i, 1], res[i, 2], res[i, 3]))
	bm <- try(
	microbenchmark(funs[[res[i, 1]]](res[i, 2], res[i, 3]), times=3)$time[[2]]
	)
	val <- if(inherits(bm, "try-error")) NA else bm
	res[i, 4] <- val
	}

	dcast(transform(res, val=format(round(val/1e6), big.mark=",")), size + n ~ fun, value.var="val")