Last active
September 14, 2018 20:37
-
-
Save alexshpilkin/eff8e98033cd8dadc7605908c2f71996 to your computer and use it in GitHub Desktop.
Sample binomial distributions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.TH BETA 1 2018 "Alexander Shpilkin" | |
.SH NAME | |
beta \- sample binomial distributions | |
.SH SYNOPSIS | |
\fBbeta \fIcount | |
.SH DESCRIPTION | |
Given a population size \fIN\fR and a sample \fIn\fR from a binomial | |
distribution with parameters (\fIN\fR, \fIp\fR), \fIbeta\fR computes | |
\fIcount\fR more samples from it, assuming a uniform Bayesian prior for the | |
unknown parameter \fIp\fR. | |
.PP | |
The values of \fIN\fR and \fIn\fR are read from standard input, in this order, | |
separated by whitespace. Several distributions can be specified, one per line. | |
For each distribution, \fIcount\fR lines are written, containing the population | |
size \fIN\fR and the sampled value. Values on output are separated by a single | |
tab character. | |
.SH LIMITATIONS | |
The population size \fIN\fR is limited to about 30000, which is also the range | |
of values for which \fIbeta\fR can be expected to produce reasonably accurate | |
results with acceptable speed. | |
.SH AUTHOR | |
Alexander Shpilkin <ashpilkin@gmail.com> | |
.SH SEE ALSO | |
.IR hist (1), | |
.IR cut (1), | |
.IR awk (1) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <assert.h> | |
#include <errno.h> | |
#include <float.h> | |
#include <math.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
/* BSD sysexits.h */ | |
#define EX_USAGE 64 | |
#define EX_DATAERR 65 | |
#define EX_IOERR 74 | |
#if defined(__GNUC__) || defined(__icc__) || defined(__clang__) | |
# define unlikely(C) (__builtin_expect(!!(C), 0)) | |
#else | |
# define unlikely(C) (C) | |
#endif | |
static uint64_t randu = UINT64_C(0x70736575646F7261), /* pseudora */ | |
randv = UINT64_C(0x6E646F6D73656564); /* ndomseed */ | |
static inline uint64_t xor128p(void) { | |
/* xorshift128+ by Sebastiano Vigna, arXiv:1404.0390v3 [cs.DS] */ | |
/* Period 2^128 - 1, 1-dimensional equidistribution, 8-byte state */ | |
assert(randu != 0 || randv != 0); | |
uint64_t u = randv, v = randu, r = u + v; | |
randu = u; | |
v ^= v << 23; | |
randv = v ^ u ^ (v >> 18) ^ (u >> 5); | |
return r; | |
} | |
static inline double uniform(void) { | |
return (xor128p() >> 1) * 0x1p-63; | |
} | |
#define SIZE 32767 | |
static unsigned size; /* 0 <= sample value <= size < SIZE */ | |
static double cdf[SIZE+1]; /* cdf[n] = sum(0 <= k < n) Pr(value = k) */ | |
/* cdf[size+1] = 1.0 (not stored) */ | |
#define EXPSTEP (DBL_MAX_EXP / 2) | |
static void prepare(unsigned m) { | |
assert(m <= size && size < SIZE); | |
static int eov[SIZE+1]; | |
const unsigned l = size - m; | |
double p = 1.0, a = 0.0; int e = 0; | |
for (unsigned n = 0; n < size + 1; n++) { | |
assert(isfinite(a) && isfinite(p)); | |
cdf[n] = a; eov[n] = e; | |
double f = ((double)(m + n + 1) / (n + 1)) * | |
((double)(size - n) / (l + size - n)); | |
if unlikely (isinf(a + p) || isinf(p * f)) { | |
e += EXPSTEP; | |
p = scalbn(p, -EXPSTEP); | |
a = scalbn(a, -EXPSTEP); | |
} | |
a += p; | |
p *= f; | |
} | |
if unlikely (e != 0) { | |
for (unsigned n = 0; n < size + 1; n++) | |
cdf[n] = scalbn(cdf[n], eov[n] - e) / a; | |
} else { | |
for (unsigned n = 0; n < size + 1; n++) | |
cdf[n] = cdf[n] / a; | |
} | |
#ifndef NDEBUG | |
for (unsigned n = 0; n < size; n++) | |
assert(isfinite(cdf[n])); | |
for (unsigned n = 0; n < size; n++) | |
assert(cdf[n] <= cdf[n+1]); | |
assert(cdf[size+1] <= 1.0); | |
double maxp = m < size ? cdf[m+1] - cdf[m] : 1.0 - cdf[m]; | |
for (unsigned n = 0; n < size; n++) | |
assert(cdf[n+1] - cdf[n] <= maxp); | |
#endif | |
} | |
static inline unsigned sample(void) { | |
double u = uniform(); | |
unsigned lo = 0, hi = size + 1; | |
/* Loop invariant: cdf[lo] <= u < cdf[hi] */ | |
while (lo < hi - 1) { | |
unsigned mi = (hi + lo) / 2; | |
if (u < cdf[mi]) | |
hi = mi; | |
else | |
lo = mi; | |
} | |
return lo; | |
} | |
int main(int argc, char **argv) { | |
unsigned long count; | |
char *end; | |
if (argc != 2 || (count = strtoul(argv[1], &end, 0), *end != '\0')) { | |
fprintf(stderr, "%s: incorrect arguments\n", argv[0]); | |
return EX_USAGE; | |
} | |
unsigned m; | |
int read; | |
while ((errno = 0, read = scanf("%u %u", &size, &m)) == 2) { | |
if unlikely (size >= SIZE) { | |
fprintf(stderr, "%s: size too large [maximum=%u]", argv[0], SIZE); | |
return EX_DATAERR; | |
} | |
if unlikely (m >= SIZE) { | |
fprintf(stderr, "%s: sample value larger than size", argv[0]); | |
return EX_DATAERR; | |
} | |
prepare(m); | |
for (unsigned long n = 0; n < count; n++) | |
printf("%u\t%u\n", size, sample()); | |
fflush(stdout); | |
} | |
if (errno != 0) { | |
int status = (errno == EILSEQ ? EX_DATAERR : EX_IOERR); | |
perror(argv[0]); return status; | |
} else if (read != EOF) { | |
return EX_DATAERR; | |
} | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
from os.path import dirname, join, realpath, splitext | |
from subprocess import Popen, PIPE | |
from sys import executable as python, getdefaultencoding | |
executable=join(dirname(realpath(__file__)), 'beta' + splitext(python)[1]) | |
class Sampler: | |
def __init__(self, count): | |
self._count = count | |
self._child = Popen([executable, str(count)], stdin=PIPE, stdout=PIPE, | |
bufsize=1, universal_newlines=True, | |
encoding=getdefaultencoding()) | |
def __enter__(self): | |
return self | |
def __exit__(self, *details): | |
self.close() | |
def sample(self, pop, obs): | |
print(pop, obs, file=self._child.stdin) | |
for i in range(self._count): | |
_pop, sam = self._child.stdout.readline().split() | |
assert int(_pop) == pop | |
yield sam | |
def close(self): | |
self._child.stdin.close() | |
self._child.wait() | |
del self._child | |
if __name__ == '__main__': | |
DATA = [(100,3), (20,10)] | |
COUNT = 3 | |
# either terminate the subprocess manually: | |
sam = Sampler(COUNT) | |
for pop, obs in DATA: | |
for i, val in enumerate(sam.sample(pop, obs)): | |
# just emulate the output of binom(1) itself | |
print(i, pop, val) | |
sam.close() | |
# or use the context manager: | |
with Sampler(COUNT) as sam: | |
for pop, obs in DATA: | |
for i, val in enumerate(sam.sample(pop, obs)): | |
print(i, pop, val) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment