Skip to content

Instantly share code, notes, and snippets.

@pkhuong
Last active September 25, 2020 21:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pkhuong/68365b3f52a15f5cb652344b0f35cfe9 to your computer and use it in GitHub Desktop.
Numerical implementation of Beyer et al's confidence intervals for k min values
"""Numerical implementation of Beyer et al's confidence intervals for k min values
[On Synopses for Distinct-Value Estimation Under Multiset Operations](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.230.7008&rep=rep1&type=pdf#page=6), Section 4.3.
ci_probability(k, eps) computes the probability that the unbiased
cardinality estimate
\hat{D}_k^{UB} = (k - 1) / U(k),
where k is the number of minimum values we keep, and U(k) is the
maximum hash value in that list, normalised to [0, 1]. If we saw
fewer than k distinct values, we can simply report the exact value.
Let D be the actual cardinality. If ci_probability(k, eps) = delta,
we know that
P(|\hat{D}_k^{UB} - D| / D <= eps) >= delta,
for any number of values and cardinality D: the asymptotic bound
derived in the reference paper is pessimistic, but still correct,
for finite cardinalities and value counts.
We can also invert that function and instead find eps, given a fixed
sketch size k and coverage probability delta. That's what
`eps_search` implements.
ci_probability was validated by comparing with Figure 1 in Beyer et
al's paper, and with the example "delta = 0.95, k = 1024, eps ~=
0.06127."
"""
from mpmath import mp, mpf
def ibterm(k, pm_eps, log_scale):
    """Evaluate exp(log_scale) * [1 + \\sum_{i=1}^{k-1} (k - 1)^i / ((1 + pm_eps)^i i!)],
    where pm_eps is either +eps or -eps.

    exp(log_scale) is a very small factor and the bracketed sum is very
    large, so forming the product naively loses everything.  Instead we
    distribute the scale into the sum and evaluate each summand

        exp(log_scale) (k - 1)^i / ((1 + pm_eps)^i i!)

    as a single, well-behaved exponential in log space.  The running
    log-factorial is updated incrementally on each iteration.
    """
    shift = mpf(pm_eps)
    base = mpf(log_scale)
    # log((k - 1) / (1 + pm_eps)): each term i contributes i copies of it.
    log_ratio = mp.log(k - 1) - mp.log(1 + shift)
    total = mp.exp(base)  # the i = 0 term of the distributed sum
    log_fact = mpf(0)  # running log(i!)
    for i in range(1, k):
        log_fact += mp.log(i)
        total += mp.exp(base + i * log_ratio - log_fact)
    return total
def ci_probability(k, eps):
    """Return the probability that a k-min-value sketch's unbiased
    cardinality estimate has relative error eps or less.

    See Section 4.3 of Beyer et al.,
    http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.230.7008&rep=rep1&type=pdf#page=6
    """
    eps = mpf(eps)
    # Difference of two incomplete-beta-style terms, one per tail.
    upper = ibterm(k, eps, -(k - 1) / (1 + eps))
    lower = ibterm(k, -eps, -(k - 1) / (1 - eps))
    return upper - lower
def eps_search(k, p, min_eps=1e-5, max_eps=1 - 1e-5):
    """Bisect for the smallest eps such that a k-min-value sketch has
    relative error at most eps with probability at least p."""
    lo, hi = min_eps, max_eps
    # 64 halvings are far more than enough to exhaust double precision.
    for _ in range(64):
        mid = (lo + hi) / 2
        if ci_probability(k, mid) < p:
            # Coverage falls short: this eps is too tight.
            lo = mid
        else:
            hi = mid
    return hi
# The summands span an enormous dynamic range; 200 decimal digits keeps
# the cancellation in ci_probability harmless.
mp.dps = 200

# Sketch sizes to tabulate: powers of two, round hundreds, and round
# thousands, deduplicated via set union.
sizes = sorted(
    {2 ** i for i in range(8, 16)}
    | {100 * i for i in range(3, 10)}
    | {1000 * i for i in range(1, 21)}
)

for k in sizes:
    print((k, eps_search(k, 1 - mpf(1e-20))))
# In [40]: for k in sizes:
# ...: print((k, eps_search(k, 1 - mpf(1e-20))))
# ...:
# (256, 0.8933010437798387)
# (300, 0.795063354117913)
# (400, 0.649231171891936)
# (500, 0.5584635478511522)
# (512, 0.5497594311001835)
# (600, 0.49560051748505624)
# (700, 0.44901120757738205)
# (800, 0.4128276319843038)
# (900, 0.3837456093057503)
# (1000, 0.3597514885971114)
# (1024, 0.35459489223872503)
# (2000, 0.23886654949510666)
# (2048, 0.23563644019592284)
# (3000, 0.18981268452153102)
# (4000, 0.16177921662153935)
# (4096, 0.15967878445409936)
# (5000, 0.1431446307946877)
# (6000, 0.12964041665307863)
# (7000, 0.11928942625624793)
# (8000, 0.11103632459490151)
# (8192, 0.10963525526984166)
# (9000, 0.10426065031951647)
# (10000, 0.09857100245899354)
# (11000, 0.09370684955379038)
# (12000, 0.08948726223117043)
# (13000, 0.08578220293734226)
# (14000, 0.0824955198497288)
# (15000, 0.07955439905541617)
# (16000, 0.07690256574667127)
# (16384, 0.07595147059050668)
# (17000, 0.07449575639286388)
# (18000, 0.07229861740425302)
# (19000, 0.07028252833983045)
# (20000, 0.068424040928028)
# (32768, 0.052942429149847224)
# In [41]: for k in sizes:
# ...: print((k, eps_search(k, 1 - mpf(2**-70))))
# ...:
# (256, 0.9314607985943795)
# (300, 0.8279129506378315)
# (400, 0.6746537447821473)
# (500, 0.5795522484509731)
# (512, 0.5704446214577652)
# (600, 0.5138236681150362)
# (700, 0.4651847660519782)
# (800, 0.42745398481359465)
# (900, 0.397157233270994)
# (1000, 0.3721805371190448)
# (1024, 0.3668151163871019)
# (2000, 0.24662395085228275)
# (2048, 0.2432755689900746)
# (3000, 0.19581191703669742)
# (4000, 0.16681072702452937)
# (4096, 0.16463888967344198)
# (5000, 0.14754806648706512)
# (6000, 0.1335963734526734)
# (7000, 0.12290679695227132)
# (8000, 0.1143864907519984)
# (8192, 0.11294030406681277)
# (9000, 0.1073932845831681)
# (10000, 0.10152225768879311)
# (11000, 0.09650397542794534)
# (12000, 0.09215138190015283)
# (13000, 0.08833007115386277)
# (14000, 0.08494068379081839)
# (15000, 0.08190799189738561)
# (16000, 0.07917386534949619)
# (16384, 0.07819331779778638)
# (17000, 0.07669258863232875)
# (18000, 0.07442765375699807)
# (19000, 0.07234950966446299)
# (20000, 0.07043394859402075)
# (32768, 0.05448167268478741)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment