Instantly share code, notes, and snippets.

ept/false_pos.gnuplot Secret

Last active December 23, 2023 04:51
Show Gist options
• Save ept/83b91aa07e2495c86ddd8c364a8cfbc7 to your computer and use it in GitHub Desktop.
False positive probability: Bloom filter vs. truncated hashes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 set terminal png size 1100,400 set output 'false_pos.png' set key left top set logscale x set xrange [1:10000] set style line 1 linewidth 2.5 linecolor rgb '#4472C4' set style line 2 linewidth 2.5 linecolor rgb '#ED7D31' set xlabel 'Number of items' set ylabel 'False positive probability' plot 'false_pos.data' using 1:2 with lines linestyle 1 title 'Truncated hashes', \ 'false_pos.data' using 1:3 with lines linestyle 2 title 'Bloom filter'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 # Use Python 3 to run this bits_per_item = 10 hashes = 7 import math import sys # Probability of collision when each hash is truncated to b bits: # n = number of items # b = number of bits per item # # P(false positive) = # P(a given b-bit pattern is among the n items) = # 1 - P(the given b-bit pattern is different from all of the n hashes) = # 1 - (1 - 2^-b)^n = # 1 - (2^b - 1)^n / 2^bn = # (2^bn - (2^b - 1)^n) / 2^bn def truncated_hash(bits_per_item, items): numerator = 2 ** (bits_per_item * items) - (2**bits_per_item - 1) ** items denominator = 2 ** (bits_per_item * items) return numerator / denominator # Classic Bloom filter false positive probability formula # (this is not exact, but rather a lower bound): # k = number of hash functions # n = number of items # m = number of bits # # kn = number of bit-setting operations, each of which sets one of the m bits to 1 # P(a given bit is 0) = (1 - 1/m)^kn # # P(false positive) = # P(k given bits are all 1) = # (1 - (1 - 1/m)^kn)^k = # (1 - (m - 1)^kn / m^kn)^k = # (m^kn - (m - 1)^kn)^k / m^kkn def classic_bloom(bits, items, hashes): numerator = (bits ** (hashes * items) - (bits - 1) ** (hashes * items)) ** hashes denominator = bits ** (hashes * hashes * items) return numerator / denominator # Binomial coefficient # Based on https://jamesmccaffrey.wordpress.com/2020/07/30/computing-a-stirling-number-of-the-second-kind-from-scratch-using-python/ def binomial(n, k): if n == k: return 1 if k < n - k: delta = n - k i_max = k else: delta = k i_max = n - k ans = delta + 1 for i in range(2, i_max + 1): ans = (ans * (delta + i)) // i return ans # Stirling number of the second kind https://en.wikipedia.org/wiki/Stirling_numbers_of_the_second_kind def stirling(n, k): sum = 0 for i in range(0, k + 1): sum += (-1)**(k - i) * binomial(k, i) * i**n return sum // math.factorial(k) # Correct Bloom filter false positive probability, according to # https://www.sciencedirect.com/science/article/pii/S0020019008001579 def correct_bloom(bits, items, hashes): numerator = 0 for i in range(1, bits + 1): numerator += i**hashes * math.factorial(i) * binomial(bits, i) * stirling(hashes * items, i) denominator = bits ** (hashes * (items + 1)) return numerator / denominator if __name__ == "__main__": if len(sys.argv) != 2 or (sys.argv[1] != "-plot" and sys.argv[1] != "-exact"): print("Usage: false_pos.py -plot or false_pos.py -exact") elif sys.argv[1] == "-plot": for i in range(120): items = round(math.exp(i/12)) trunc = truncated_hash(bits_per_item, items) bloom = classic_bloom(bits_per_item * items, items, hashes) print(f"{items} {trunc} {bloom}") elif sys.argv[1] == "-exact": for items in [1, 10, 100, 1000, 10000]: print(f"With {items} items and {bits_per_item} bits per item:") print(f"Truncated hash: {truncated_hash(bits_per_item, items)}") print(f"Classic Bloom: {classic_bloom(bits_per_item * items, items, hashes)}") if items <= 100: # it takes about a minute for 100; don't bother for bigger numbers print(f"Correct Bloom: {correct_bloom(bits_per_item * items, items, hashes)}") print("") # Output of false_pos.py -exact: # # With 1 items and 10 bits per item: # Truncated hash: 0.0009765625 # Classic Bloom: 0.010518744866970362 # Correct Bloom: 0.0174705766201 # # With 10 items and 10 bits per item: # Truncated hash: 0.009722821223700424 # Classic Bloom: 0.008394807630049734 # Correct Bloom: 0.008936311594679473 # # With 100 items and 10 bits per item: # Truncated hash: 0.09308265650895885 # Classic Bloom: 0.008213554634050216 # Correct Bloom: 0.008266247514843566 # # With 1000 items and 10 bits per item: # Truncated hash: 0.623576201943276 # Classic Bloom: 0.008195702596768733 # # With 10000 items and 10 bits per item: # Truncated hash: 0.9999428822983674 # Classic Bloom: 0.008193920091727517