-
-
Save ept/83b91aa07e2495c86ddd8c364a8cfbc7 to your computer and use it in GitHub Desktop.
False positive probability: Bloom filter vs. truncated hashes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
set terminal png size 1100,400 | |
set output 'false_pos.png' | |
set key left top | |
set logscale x | |
set xrange [1:10000] | |
set style line 1 linewidth 2.5 linecolor rgb '#4472C4' | |
set style line 2 linewidth 2.5 linecolor rgb '#ED7D31' | |
set xlabel 'Number of items' | |
set ylabel 'False positive probability' | |
plot 'false_pos.data' using 1:2 with lines linestyle 1 title 'Truncated hashes', \ | |
'false_pos.data' using 1:3 with lines linestyle 2 title 'Bloom filter' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Use Python 3 to run this | |
bits_per_item = 10 | |
hashes = 7 | |
import math | |
import sys | |
# Probability of collision when each hash is truncated to b bits: | |
# n = number of items | |
# b = number of bits per item | |
# | |
# P(false positive) = | |
# P(a given b-bit pattern is among the n items) = | |
# 1 - P(the given b-bit pattern is different from all of the n hashes) = | |
# 1 - (1 - 2^-b)^n = | |
# 1 - (2^b - 1)^n / 2^bn = | |
# (2^bn - (2^b - 1)^n) / 2^bn | |
def truncated_hash(bits_per_item, items): | |
numerator = 2 ** (bits_per_item * items) - (2**bits_per_item - 1) ** items | |
denominator = 2 ** (bits_per_item * items) | |
return numerator / denominator | |
# Classic Bloom filter false positive probability formula | |
# (this is not exact, but rather a lower bound): | |
# k = number of hash functions | |
# n = number of items | |
# m = number of bits | |
# | |
# kn = number of bit-setting operations, each of which sets one of the m bits to 1 | |
# P(a given bit is 0) = (1 - 1/m)^kn | |
# | |
# P(false positive) = | |
# P(k given bits are all 1) = | |
# (1 - (1 - 1/m)^kn)^k = | |
# (1 - (m - 1)^kn / m^kn)^k = | |
# (m^kn - (m - 1)^kn)^k / m^kkn | |
def classic_bloom(bits, items, hashes): | |
numerator = (bits ** (hashes * items) - (bits - 1) ** (hashes * items)) ** hashes | |
denominator = bits ** (hashes * hashes * items) | |
return numerator / denominator | |
# Binomial coefficient | |
# Based on https://jamesmccaffrey.wordpress.com/2020/07/30/computing-a-stirling-number-of-the-second-kind-from-scratch-using-python/ | |
def binomial(n, k): | |
if n == k: return 1 | |
if k < n - k: | |
delta = n - k | |
i_max = k | |
else: | |
delta = k | |
i_max = n - k | |
ans = delta + 1 | |
for i in range(2, i_max + 1): | |
ans = (ans * (delta + i)) // i | |
return ans | |
# Stirling number of the second kind https://en.wikipedia.org/wiki/Stirling_numbers_of_the_second_kind | |
def stirling(n, k): | |
sum = 0 | |
for i in range(0, k + 1): | |
sum += (-1)**(k - i) * binomial(k, i) * i**n | |
return sum // math.factorial(k) | |
# Correct Bloom filter false positive probability, according to | |
# https://www.sciencedirect.com/science/article/pii/S0020019008001579 | |
def correct_bloom(bits, items, hashes): | |
numerator = 0 | |
for i in range(1, bits + 1): | |
numerator += i**hashes * math.factorial(i) * binomial(bits, i) * stirling(hashes * items, i) | |
denominator = bits ** (hashes * (items + 1)) | |
return numerator / denominator | |
if __name__ == "__main__": | |
if len(sys.argv) != 2 or (sys.argv[1] != "-plot" and sys.argv[1] != "-exact"): | |
print("Usage: false_pos.py -plot or false_pos.py -exact") | |
elif sys.argv[1] == "-plot": | |
for i in range(120): | |
items = round(math.exp(i/12)) | |
trunc = truncated_hash(bits_per_item, items) | |
bloom = classic_bloom(bits_per_item * items, items, hashes) | |
print(f"{items} {trunc} {bloom}") | |
elif sys.argv[1] == "-exact": | |
for items in [1, 10, 100, 1000, 10000]: | |
print(f"With {items} items and {bits_per_item} bits per item:") | |
print(f"Truncated hash: {truncated_hash(bits_per_item, items)}") | |
print(f"Classic Bloom: {classic_bloom(bits_per_item * items, items, hashes)}") | |
if items <= 100: # it takes about a minute for 100; don't bother for bigger numbers | |
print(f"Correct Bloom: {correct_bloom(bits_per_item * items, items, hashes)}") | |
print("") | |
# Output of false_pos.py -exact: | |
# | |
# With 1 items and 10 bits per item: | |
# Truncated hash: 0.0009765625 | |
# Classic Bloom: 0.010518744866970362 | |
# Correct Bloom: 0.0174705766201 | |
# | |
# With 10 items and 10 bits per item: | |
# Truncated hash: 0.009722821223700424 | |
# Classic Bloom: 0.008394807630049734 | |
# Correct Bloom: 0.008936311594679473 | |
# | |
# With 100 items and 10 bits per item: | |
# Truncated hash: 0.09308265650895885 | |
# Classic Bloom: 0.008213554634050216 | |
# Correct Bloom: 0.008266247514843566 | |
# | |
# With 1000 items and 10 bits per item: | |
# Truncated hash: 0.623576201943276 | |
# Classic Bloom: 0.008195702596768733 | |
# | |
# With 10000 items and 10 bits per item: | |
# Truncated hash: 0.9999428822983674 | |
# Classic Bloom: 0.008193920091727517 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment