Skip to content

Instantly share code, notes, and snippets.

@mkoehrsen
Last active August 30, 2015 12:53
Show Gist options
  • Save mkoehrsen/ce931b6462f4bb7835c7 to your computer and use it in GitHub Desktop.
Save mkoehrsen/ce931b6462f4bb7835c7 to your computer and use it in GitHub Desktop.
Generates the probability distribution describing the maximum number of leading zeroes seen in N random bit-strings, for successive values of N. This is related to the HyperLogLog cardinality estimator, as discussed at http://mkoehrsen.github.io/probability/data-analysis/2015/08/28/hyperloglog-estimator.html.
import numpy
def generate_max_zero_probabilities(maxZ):
maxZ += 1
row1 = numpy.array(list(map(lambda x: pow(2,-(x+1)),range(maxZ))))
row1_cum = row1.cumsum() - row1 # non-inclusive cumsum
yield (1,row1)
prev_row = row1
prev_row_cum = row1_cum
n = 2
while True:
new_row = prev_row_cum * row1 + prev_row * row1_cum + prev_row * row1
yield (n,new_row)
prev_row = new_row
prev_row_cum = prev_row.cumsum() - prev_row
n+=1
def print_table(rowgen,maxN):
(n,row) = next(rowgen)
while n < maxN:
print(" ".join(["%f"%x for x in row]))
(n,row) = next(rowgen)
def print_best_estimates_mle(rowgen,maxN):
(n,row) = next(rowgen)
best_rows = [(n,row) for z in range(len(row))]
while n < maxN:
(n,row) = next(rowgen)
for z in range(len(row)):
if row[z] > best_rows[z][1][z]:
best_rows[z] = (n,row)
elif n == best_rows[z][0] + 1:
print("z:",z,"bestn:",n-1,"probability:",best_rows[z][1][z])
# This line prints the best estimate of N for each value of Z. Edit the
# parameters to get more values of Z or N
print_best_estimates_mle(generate_max_zero_probabilities(20),10000)
# This line prints the full probability table up to maxZ and maxN
#print_table(generate_max_zero_probabilities(10),100)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment