alexshpilkin/beta.1

## beta.1
.TH BETA 1 2018 "Alexander Shpilkin"
.SH NAME
beta \- sample binomial distributions
.SH SYNOPSIS
\fBbeta \fIcount
.SH DESCRIPTION
Given a population size \fIN\fR and a sample \fIn\fR from a binomial
distribution with parameters (\fIN\fR, \fIp\fR), \fIbeta\fR computes
\fIcount\fR more samples from it, assuming a uniform Bayesian prior for the
unknown parameter \fIp\fR.
.PP
The values of \fIN\fR and \fIn\fR are read from standard input, in this order,
separated by whitespace. Several distributions can be specified, one per line.
For each distribution, \fIcount\fR lines are written, containing the population
size \fIN\fR and the sampled value. Values on output are separated by a single
tab character.
.SH LIMITATIONS
The population size \fIN\fR is limited to about 30000, which is also the range
of values for which \fIbeta\fR can be expected to produce reasonably accurate
results with acceptable speed.
.SH AUTHOR
Alexander Shpilkin <ashpilkin@gmail.com>
.SH SEE ALSO
.IR hist (1),
.IR cut (1),
.IR awk (1)

## beta.c
#include <assert.h>
#include <errno.h>
#include <float.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* BSD sysexits.h */
#define EX_USAGE    64
#define EX_DATAERR  65
#define EX_IOERR    74

#if defined(__GNUC__) || defined(__icc__) || defined(__clang__)
# define unlikely(C) (__builtin_expect(!!(C), 0))
#else
# define unlikely(C) (C)
#endif

static uint64_t randu = UINT64_C(0x70736575646F7261), /* pseudora */
                randv = UINT64_C(0x6E646F6D73656564); /* ndomseed */

static inline uint64_t xor128p(void) {
    /* xorshift128+ by Sebastiano Vigna, arXiv:1404.0390v3 [cs.DS]    */
    /* Period 2^128 - 1, 1-dimensional equidistribution, 8-byte state */
    assert(randu != 0 || randv != 0);

    uint64_t u = randv, v = randu, r = u + v;
    randu = u;
    v ^= v << 23;
    randv = v ^ u ^ (v >> 18) ^ (u >> 5);
    return r;
}

static inline double uniform(void) {
    return (xor128p() >> 1) * 0x1p-63;
}

#define SIZE 32767
static unsigned size;         /* 0 <= sample value <= size < SIZE       */
static double   cdf[SIZE+1];  /* cdf[n] = sum(0 <= k < n) Pr(value = k) */
                              /* cdf[size+1] = 1.0 (not stored)         */

#define EXPSTEP (DBL_MAX_EXP / 2)

static void prepare(unsigned m) {
    assert(m <= size && size < SIZE);

    static int eov[SIZE+1];
    const unsigned l = size - m;
    double p = 1.0, a = 0.0; int e = 0;

    for (unsigned n = 0; n < size + 1; n++) {
        assert(isfinite(a) && isfinite(p));
        cdf[n] = a; eov[n] = e;

        double f = ((double)(m + n + 1) / (n + 1)) *
                   ((double)(size - n) / (l + size - n));
        if unlikely (isinf(a + p) || isinf(p * f)) {
            e += EXPSTEP;
            p  = scalbn(p, -EXPSTEP);
            a  = scalbn(a, -EXPSTEP);
        }
        a += p;
        p *= f;
    }

    if unlikely (e != 0) {
        for (unsigned n = 0; n < size + 1; n++)
            cdf[n] = scalbn(cdf[n], eov[n] - e) / a;
    } else {
        for (unsigned n = 0; n < size + 1; n++)
            cdf[n] = cdf[n] / a;
    }

#ifndef NDEBUG
    for (unsigned n = 0; n < size; n++)
        assert(isfinite(cdf[n]));
    for (unsigned n = 0; n < size; n++)
        assert(cdf[n] <= cdf[n+1]);
    assert(cdf[size+1] <= 1.0);
    double maxp = m < size ? cdf[m+1] - cdf[m] : 1.0 - cdf[m];
    for (unsigned n = 0; n < size; n++)
        assert(cdf[n+1] - cdf[n] <= maxp);
#endif
}

static inline unsigned sample(void) {
    double u = uniform();
    unsigned lo = 0, hi = size + 1;
    /* Loop invariant: cdf[lo] <= u < cdf[hi] */
    while (lo < hi - 1) {
        unsigned mi = (hi + lo) / 2;
        if (u < cdf[mi])
            hi = mi;
        else
            lo = mi;
    }
    return lo;
}

int main(int argc, char **argv) {
    unsigned long count;
    char *end;
    if (argc != 2 || (count = strtoul(argv[1], &end, 0), *end != '\0')) {
        fprintf(stderr, "%s: incorrect arguments\n", argv[0]);
        return EX_USAGE;
    }

    unsigned m;
    int read;
    while ((errno = 0, read = scanf("%u %u", &size, &m)) == 2) {
        if unlikely (size >= SIZE) {
            fprintf(stderr, "%s: size too large [maximum=%u]", argv[0], SIZE);
            return EX_DATAERR;
        }
        if unlikely (m >= SIZE) {
            fprintf(stderr, "%s: sample value larger than size", argv[0]);
            return EX_DATAERR;
        }

        prepare(m);
        for (unsigned long n = 0; n < count; n++)
            printf("%u\t%u\n", size, sample());
        fflush(stdout);
    }
    if (errno != 0) {
        int status = (errno == EILSEQ ? EX_DATAERR : EX_IOERR);
        perror(argv[0]); return status;
    } else if (read != EOF) {
        return EX_DATAERR;
    }

    return 0;
}

## beta.py
#!/usr/bin/python3
from os.path import dirname, join, realpath, splitext
from subprocess import Popen, PIPE
from sys import executable as python, getdefaultencoding

executable=join(dirname(realpath(__file__)), 'beta' + splitext(python)[1])

class Sampler:
    def __init__(self, count):
        self._count = count
        self._child = Popen([executable, str(count)], stdin=PIPE, stdout=PIPE,
                            bufsize=1, universal_newlines=True,
                            encoding=getdefaultencoding())
    def __enter__(self):
        return self
    def __exit__(self, *details):
        self.close()

    def sample(self, pop, obs):
        print(pop, obs, file=self._child.stdin)
        for i in range(self._count):
            _pop, sam = self._child.stdout.readline().split()
            assert int(_pop) == pop
            yield sam

    def close(self):
        self._child.stdin.close()
        self._child.wait()
        del self._child

if __name__ == '__main__':
    DATA  = [(100,3), (20,10)]
    COUNT = 3

    # either terminate the subprocess manually:
    sam = Sampler(COUNT)
    for pop, obs in DATA:
        for i, val in enumerate(sam.sample(pop, obs)):
            # just emulate the output of binom(1) itself
            print(i, pop, val)
    sam.close()

    # or use the context manager:
    with Sampler(COUNT) as sam:
        for pop, obs in DATA:
            for i, val in enumerate(sam.sample(pop, obs)):
                print(i, pop, val)
	.TH BETA 1 2018 "Alexander Shpilkin"
	.SH NAME
	beta \- sample binomial distributions
	.SH SYNOPSIS
	\fBbeta \fIcount
	.SH DESCRIPTION
	Given a population size \fIN\fR and a sample \fIn\fR from a binomial
	distribution with parameters (\fIN\fR, \fIp\fR), \fIbeta\fR computes
	\fIcount\fR more samples from it, assuming a uniform Bayesian prior for the
	unknown parameter \fIp\fR.
	.PP
	The values of \fIN\fR and \fIn\fR are read from standard input, in this order,
	separated by whitespace. Several distributions can be specified, one per line.
	For each distribution, \fIcount\fR lines are written, containing the population
	size \fIN\fR and the sampled value. Values on output are separated by a single
	tab character.
	.SH LIMITATIONS
	The population size \fIN\fR is limited to about 30000, which is also the range
	of values for which \fIbeta\fR can be expected to produce reasonably accurate
	results with acceptable speed.
	.SH AUTHOR
	Alexander Shpilkin <ashpilkin@gmail.com>
	.SH SEE ALSO
	.IR hist (1),
	.IR cut (1),
	.IR awk (1)
	#include <assert.h>
	#include <errno.h>
	#include <float.h>
	#include <math.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* BSD sysexits.h */
	#define EX_USAGE 64
	#define EX_DATAERR 65
	#define EX_IOERR 74

	#if defined(__GNUC__) \|\| defined(__icc__) \|\| defined(__clang__)
	# define unlikely(C) (__builtin_expect(!!(C), 0))
	#else
	# define unlikely(C) (C)
	#endif

	static uint64_t randu = UINT64_C(0x70736575646F7261), /* pseudora */
	randv = UINT64_C(0x6E646F6D73656564); /* ndomseed */

	static inline uint64_t xor128p(void) {
	/* xorshift128+ by Sebastiano Vigna, arXiv:1404.0390v3 [cs.DS] */
	/* Period 2^128 - 1, 1-dimensional equidistribution, 8-byte state */
	assert(randu != 0 \|\| randv != 0);

	uint64_t u = randv, v = randu, r = u + v;
	randu = u;
	v ^= v << 23;
	randv = v ^ u ^ (v >> 18) ^ (u >> 5);
	return r;
	}

	static inline double uniform(void) {
	return (xor128p() >> 1) * 0x1p-63;
	}

	#define SIZE 32767
	static unsigned size; /* 0 <= sample value <= size < SIZE */
	static double cdf[SIZE+1]; /* cdf[n] = sum(0 <= k < n) Pr(value = k) */
	/* cdf[size+1] = 1.0 (not stored) */

	#define EXPSTEP (DBL_MAX_EXP / 2)

	static void prepare(unsigned m) {
	assert(m <= size && size < SIZE);

	static int eov[SIZE+1];
	const unsigned l = size - m;
	double p = 1.0, a = 0.0; int e = 0;

	for (unsigned n = 0; n < size + 1; n++) {
	assert(isfinite(a) && isfinite(p));
	cdf[n] = a; eov[n] = e;

	double f = ((double)(m + n + 1) / (n + 1)) *
	((double)(size - n) / (l + size - n));
	if unlikely (isinf(a + p) \|\| isinf(p * f)) {
	e += EXPSTEP;
	p = scalbn(p, -EXPSTEP);
	a = scalbn(a, -EXPSTEP);
	}
	a += p;
	p *= f;
	}

	if unlikely (e != 0) {
	for (unsigned n = 0; n < size + 1; n++)
	cdf[n] = scalbn(cdf[n], eov[n] - e) / a;
	} else {
	for (unsigned n = 0; n < size + 1; n++)
	cdf[n] = cdf[n] / a;
	}

	#ifndef NDEBUG
	for (unsigned n = 0; n < size; n++)
	assert(isfinite(cdf[n]));
	for (unsigned n = 0; n < size; n++)
	assert(cdf[n] <= cdf[n+1]);
	assert(cdf[size+1] <= 1.0);
	double maxp = m < size ? cdf[m+1] - cdf[m] : 1.0 - cdf[m];
	for (unsigned n = 0; n < size; n++)
	assert(cdf[n+1] - cdf[n] <= maxp);
	#endif
	}

	static inline unsigned sample(void) {
	double u = uniform();
	unsigned lo = 0, hi = size + 1;
	/* Loop invariant: cdf[lo] <= u < cdf[hi] */
	while (lo < hi - 1) {
	unsigned mi = (hi + lo) / 2;
	if (u < cdf[mi])
	hi = mi;
	else
	lo = mi;
	}
	return lo;
	}

	int main(int argc, char **argv) {
	unsigned long count;
	char *end;
	if (argc != 2 \|\| (count = strtoul(argv[1], &end, 0), *end != '\0')) {
	fprintf(stderr, "%s: incorrect arguments\n", argv[0]);
	return EX_USAGE;
	}

	unsigned m;
	int read;
	while ((errno = 0, read = scanf("%u %u", &size, &m)) == 2) {
	if unlikely (size >= SIZE) {
	fprintf(stderr, "%s: size too large [maximum=%u]", argv[0], SIZE);
	return EX_DATAERR;
	}
	if unlikely (m >= SIZE) {
	fprintf(stderr, "%s: sample value larger than size", argv[0]);
	return EX_DATAERR;
	}

	prepare(m);
	for (unsigned long n = 0; n < count; n++)
	printf("%u\t%u\n", size, sample());
	fflush(stdout);
	}
	if (errno != 0) {
	int status = (errno == EILSEQ ? EX_DATAERR : EX_IOERR);
	perror(argv[0]); return status;
	} else if (read != EOF) {
	return EX_DATAERR;
	}

	return 0;
	}
	#!/usr/bin/python3
	from os.path import dirname, join, realpath, splitext
	from subprocess import Popen, PIPE
	from sys import executable as python, getdefaultencoding

	executable=join(dirname(realpath(__file__)), 'beta' + splitext(python)[1])

	class Sampler:
	def __init__(self, count):
	self._count = count
	self._child = Popen([executable, str(count)], stdin=PIPE, stdout=PIPE,
	bufsize=1, universal_newlines=True,
	encoding=getdefaultencoding())
	def __enter__(self):
	return self
	def __exit__(self, *details):
	self.close()

	def sample(self, pop, obs):
	print(pop, obs, file=self._child.stdin)
	for i in range(self._count):
	_pop, sam = self._child.stdout.readline().split()
	assert int(_pop) == pop
	yield sam

	def close(self):
	self._child.stdin.close()
	self._child.wait()
	del self._child

	if __name__ == '__main__':
	DATA = [(100,3), (20,10)]
	COUNT = 3

	# either terminate the subprocess manually:
	sam = Sampler(COUNT)
	for pop, obs in DATA:
	for i, val in enumerate(sam.sample(pop, obs)):
	# just emulate the output of binom(1) itself
	print(i, pop, val)
	sam.close()

	# or use the context manager:
	with Sampler(COUNT) as sam:
	for pop, obs in DATA:
	for i, val in enumerate(sam.sample(pop, obs)):
	print(i, pop, val)