Skip to content

Instantly share code, notes, and snippets.

@ept
Last active December 23, 2023 04:51
Show Gist options
  • Save ept/83b91aa07e2495c86ddd8c364a8cfbc7 to your computer and use it in GitHub Desktop.
Save ept/83b91aa07e2495c86ddd8c364a8cfbc7 to your computer and use it in GitHub Desktop.
False positive probability: Bloom filter vs. truncated hashes
set terminal png size 1100,400
set output 'false_pos.png'
set key left top
set logscale x
set xrange [1:10000]
set style line 1 linewidth 2.5 linecolor rgb '#4472C4'
set style line 2 linewidth 2.5 linecolor rgb '#ED7D31'
set xlabel 'Number of items'
set ylabel 'False positive probability'
plot 'false_pos.data' using 1:2 with lines linestyle 1 title 'Truncated hashes', \
'false_pos.data' using 1:3 with lines linestyle 2 title 'Bloom filter'
# Use Python 3 to run this
bits_per_item = 10
hashes = 7
import math
import sys
# Probability of collision when each hash is truncated to b bits:
# n = number of items
# b = number of bits per item
#
# P(false positive) =
# P(a given b-bit pattern is among the n items) =
# 1 - P(the given b-bit pattern is different from all of the n hashes) =
# 1 - (1 - 2^-b)^n =
# 1 - (2^b - 1)^n / 2^bn =
# (2^bn - (2^b - 1)^n) / 2^bn
def truncated_hash(bits_per_item, items):
numerator = 2 ** (bits_per_item * items) - (2**bits_per_item - 1) ** items
denominator = 2 ** (bits_per_item * items)
return numerator / denominator
# Classic Bloom filter false positive probability formula
# (this is not exact, but rather a lower bound):
# k = number of hash functions
# n = number of items
# m = number of bits
#
# kn = number of bit-setting operations, each of which sets one of the m bits to 1
# P(a given bit is 0) = (1 - 1/m)^kn
#
# P(false positive) =
# P(k given bits are all 1) =
# (1 - (1 - 1/m)^kn)^k =
# (1 - (m - 1)^kn / m^kn)^k =
# (m^kn - (m - 1)^kn)^k / m^kkn
def classic_bloom(bits, items, hashes):
numerator = (bits ** (hashes * items) - (bits - 1) ** (hashes * items)) ** hashes
denominator = bits ** (hashes * hashes * items)
return numerator / denominator
# Binomial coefficient
# Based on https://jamesmccaffrey.wordpress.com/2020/07/30/computing-a-stirling-number-of-the-second-kind-from-scratch-using-python/
def binomial(n, k):
if n == k: return 1
if k < n - k:
delta = n - k
i_max = k
else:
delta = k
i_max = n - k
ans = delta + 1
for i in range(2, i_max + 1):
ans = (ans * (delta + i)) // i
return ans
# Stirling number of the second kind https://en.wikipedia.org/wiki/Stirling_numbers_of_the_second_kind
def stirling(n, k):
sum = 0
for i in range(0, k + 1):
sum += (-1)**(k - i) * binomial(k, i) * i**n
return sum // math.factorial(k)
# Correct Bloom filter false positive probability, according to
# https://www.sciencedirect.com/science/article/pii/S0020019008001579
def correct_bloom(bits, items, hashes):
numerator = 0
for i in range(1, bits + 1):
numerator += i**hashes * math.factorial(i) * binomial(bits, i) * stirling(hashes * items, i)
denominator = bits ** (hashes * (items + 1))
return numerator / denominator
if __name__ == "__main__":
if len(sys.argv) != 2 or (sys.argv[1] != "-plot" and sys.argv[1] != "-exact"):
print("Usage: false_pos.py -plot or false_pos.py -exact")
elif sys.argv[1] == "-plot":
for i in range(120):
items = round(math.exp(i/12))
trunc = truncated_hash(bits_per_item, items)
bloom = classic_bloom(bits_per_item * items, items, hashes)
print(f"{items} {trunc} {bloom}")
elif sys.argv[1] == "-exact":
for items in [1, 10, 100, 1000, 10000]:
print(f"With {items} items and {bits_per_item} bits per item:")
print(f"Truncated hash: {truncated_hash(bits_per_item, items)}")
print(f"Classic Bloom: {classic_bloom(bits_per_item * items, items, hashes)}")
if items <= 100: # it takes about a minute for 100; don't bother for bigger numbers
print(f"Correct Bloom: {correct_bloom(bits_per_item * items, items, hashes)}")
print("")
# Output of false_pos.py -exact:
#
# With 1 items and 10 bits per item:
# Truncated hash: 0.0009765625
# Classic Bloom: 0.010518744866970362
# Correct Bloom: 0.0174705766201
#
# With 10 items and 10 bits per item:
# Truncated hash: 0.009722821223700424
# Classic Bloom: 0.008394807630049734
# Correct Bloom: 0.008936311594679473
#
# With 100 items and 10 bits per item:
# Truncated hash: 0.09308265650895885
# Classic Bloom: 0.008213554634050216
# Correct Bloom: 0.008266247514843566
#
# With 1000 items and 10 bits per item:
# Truncated hash: 0.623576201943276
# Classic Bloom: 0.008195702596768733
#
# With 10000 items and 10 bits per item:
# Truncated hash: 0.9999428822983674
# Classic Bloom: 0.008193920091727517
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment