ept/false_pos.gnuplot Secret

## false_pos.gnuplot
set terminal png size 1100,400
set output 'false_pos.png'
set key left top
set logscale x
set xrange [1:10000]
set style line 1 linewidth 2.5 linecolor rgb '#4472C4'
set style line 2 linewidth 2.5 linecolor rgb '#ED7D31'

set xlabel 'Number of items'
set ylabel 'False positive probability'
plot 'false_pos.data' using 1:2 with lines linestyle 1 title 'Truncated hashes', \
     'false_pos.data' using 1:3 with lines linestyle 2 title 'Bloom filter'

## false_pos.py
# Use Python 3 to run this

bits_per_item = 10
hashes = 7

import math
import sys

# Probability of collision when each hash is truncated to b bits:
# n = number of items
# b = number of bits per item
#
# P(false positive) =
# P(a given b-bit pattern is among the n items) =
# 1 - P(the given b-bit pattern is different from all of the n hashes) =
# 1 - (1 - 2^-b)^n =
# 1 - (2^b - 1)^n / 2^bn =
# (2^bn - (2^b - 1)^n) / 2^bn

def truncated_hash(bits_per_item, items):
    numerator = 2 ** (bits_per_item * items) - (2**bits_per_item - 1) ** items
    denominator = 2 ** (bits_per_item * items)
    return numerator / denominator

# Classic Bloom filter false positive probability formula
# (this is not exact, but rather a lower bound):
# k = number of hash functions
# n = number of items
# m = number of bits
#
# kn = number of bit-setting operations, each of which sets one of the m bits to 1
# P(a given bit is 0) = (1 - 1/m)^kn
#
# P(false positive) =
# P(k given bits are all 1) =
# (1 - (1 - 1/m)^kn)^k =
# (1 - (m - 1)^kn / m^kn)^k =
# (m^kn - (m - 1)^kn)^k / m^kkn

def classic_bloom(bits, items, hashes):
    numerator = (bits ** (hashes * items) - (bits - 1) ** (hashes * items)) ** hashes
    denominator = bits ** (hashes * hashes * items)
    return numerator / denominator

# Binomial coefficient
# Based on https://jamesmccaffrey.wordpress.com/2020/07/30/computing-a-stirling-number-of-the-second-kind-from-scratch-using-python/
def binomial(n, k):
  if n == k: return 1
  if k < n - k:
    delta = n - k
    i_max = k
  else:
    delta = k
    i_max = n - k

  ans = delta + 1
  for i in range(2, i_max + 1):
    ans = (ans * (delta + i)) // i
  return ans

# Stirling number of the second kind https://en.wikipedia.org/wiki/Stirling_numbers_of_the_second_kind
def stirling(n, k):
    sum = 0
    for i in range(0, k + 1):
        sum += (-1)**(k - i) * binomial(k, i) * i**n
    return sum // math.factorial(k)

# Correct Bloom filter false positive probability, according to
# https://www.sciencedirect.com/science/article/pii/S0020019008001579
def correct_bloom(bits, items, hashes):
    numerator = 0
    for i in range(1, bits + 1):
        numerator += i**hashes * math.factorial(i) * binomial(bits, i) * stirling(hashes * items, i)
    denominator = bits ** (hashes * (items + 1))
    return numerator / denominator

if __name__ == "__main__":
    if len(sys.argv) != 2 or (sys.argv[1] != "-plot" and sys.argv[1] != "-exact"):
        print("Usage: false_pos.py -plot or false_pos.py -exact")
    elif sys.argv[1] == "-plot":
        for i in range(120):
            items = round(math.exp(i/12))
            trunc = truncated_hash(bits_per_item, items)
            bloom = classic_bloom(bits_per_item * items, items, hashes)
            print(f"{items} {trunc} {bloom}")
    elif sys.argv[1] == "-exact":
        for items in [1, 10, 100, 1000, 10000]:
            print(f"With {items} items and {bits_per_item} bits per item:")
            print(f"Truncated hash: {truncated_hash(bits_per_item, items)}")
            print(f"Classic Bloom: {classic_bloom(bits_per_item * items, items, hashes)}")
            if items <= 100: # it takes about a minute for 100; don't bother for bigger numbers
                print(f"Correct Bloom: {correct_bloom(bits_per_item * items, items, hashes)}")
            print("")

# Output of false_pos.py -exact:
#
# With 1 items and 10 bits per item:
# Truncated hash: 0.0009765625
# Classic Bloom: 0.010518744866970362
# Correct Bloom: 0.0174705766201
#
# With 10 items and 10 bits per item:
# Truncated hash: 0.009722821223700424
# Classic Bloom: 0.008394807630049734
# Correct Bloom: 0.008936311594679473
#
# With 100 items and 10 bits per item:
# Truncated hash: 0.09308265650895885
# Classic Bloom: 0.008213554634050216
# Correct Bloom: 0.008266247514843566
#
# With 1000 items and 10 bits per item:
# Truncated hash: 0.623576201943276
# Classic Bloom: 0.008195702596768733
#
# With 10000 items and 10 bits per item:
# Truncated hash: 0.9999428822983674
# Classic Bloom: 0.008193920091727517
	set terminal png size 1100,400
	set output 'false_pos.png'
	set key left top
	set logscale x
	set xrange [1:10000]
	set style line 1 linewidth 2.5 linecolor rgb '#4472C4'
	set style line 2 linewidth 2.5 linecolor rgb '#ED7D31'

	set xlabel 'Number of items'
	set ylabel 'False positive probability'
	plot 'false_pos.data' using 1:2 with lines linestyle 1 title 'Truncated hashes', \
	'false_pos.data' using 1:3 with lines linestyle 2 title 'Bloom filter'
	# Use Python 3 to run this

	bits_per_item = 10
	hashes = 7

	import math
	import sys

	# Probability of collision when each hash is truncated to b bits:
	# n = number of items
	# b = number of bits per item
	#
	# P(false positive) =
	# P(a given b-bit pattern is among the n items) =
	# 1 - P(the given b-bit pattern is different from all of the n hashes) =
	# 1 - (1 - 2^-b)^n =
	# 1 - (2^b - 1)^n / 2^bn =
	# (2^bn - (2^b - 1)^n) / 2^bn

	def truncated_hash(bits_per_item, items):
	numerator = 2 ** (bits_per_item * items) - (2bits_per_item - 1) items
	denominator = 2 ** (bits_per_item * items)
	return numerator / denominator

	# Classic Bloom filter false positive probability formula
	# (this is not exact, but rather a lower bound):
	# k = number of hash functions
	# n = number of items
	# m = number of bits
	#
	# kn = number of bit-setting operations, each of which sets one of the m bits to 1
	# P(a given bit is 0) = (1 - 1/m)^kn
	#
	# P(false positive) =
	# P(k given bits are all 1) =
	# (1 - (1 - 1/m)^kn)^k =
	# (1 - (m - 1)^kn / m^kn)^k =
	# (m^kn - (m - 1)^kn)^k / m^kkn

	def classic_bloom(bits, items, hashes):
	numerator = (bits ** (hashes * items) - (bits - 1) ** (hashes * items)) ** hashes
	denominator = bits ** (hashes * hashes * items)
	return numerator / denominator

	# Binomial coefficient
	# Based on https://jamesmccaffrey.wordpress.com/2020/07/30/computing-a-stirling-number-of-the-second-kind-from-scratch-using-python/
	def binomial(n, k):
	if n == k: return 1
	if k < n - k:
	delta = n - k
	i_max = k
	else:
	delta = k
	i_max = n - k

	ans = delta + 1
	for i in range(2, i_max + 1):
	ans = (ans * (delta + i)) // i
	return ans

	# Stirling number of the second kind https://en.wikipedia.org/wiki/Stirling_numbers_of_the_second_kind
	def stirling(n, k):
	sum = 0
	for i in range(0, k + 1):
	sum += (-1)*(k - i) binomial(k, i) * i**n
	return sum // math.factorial(k)

	# Correct Bloom filter false positive probability, according to
	# https://www.sciencedirect.com/science/article/pii/S0020019008001579
	def correct_bloom(bits, items, hashes):
	numerator = 0
	for i in range(1, bits + 1):
	numerator += i*hashes math.factorial(i) * binomial(bits, i) * stirling(hashes * items, i)
	denominator = bits ** (hashes * (items + 1))
	return numerator / denominator

	if __name__ == "__main__":
	if len(sys.argv) != 2 or (sys.argv[1] != "-plot" and sys.argv[1] != "-exact"):
	print("Usage: false_pos.py -plot or false_pos.py -exact")
	elif sys.argv[1] == "-plot":
	for i in range(120):
	items = round(math.exp(i/12))
	trunc = truncated_hash(bits_per_item, items)
	bloom = classic_bloom(bits_per_item * items, items, hashes)
	print(f"{items} {trunc} {bloom}")
	elif sys.argv[1] == "-exact":
	for items in [1, 10, 100, 1000, 10000]:
	print(f"With {items} items and {bits_per_item} bits per item:")
	print(f"Truncated hash: {truncated_hash(bits_per_item, items)}")
	print(f"Classic Bloom: {classic_bloom(bits_per_item * items, items, hashes)}")
	if items <= 100: # it takes about a minute for 100; don't bother for bigger numbers
	print(f"Correct Bloom: {correct_bloom(bits_per_item * items, items, hashes)}")
	print("")

	# Output of false_pos.py -exact:
	#
	# With 1 items and 10 bits per item:
	# Truncated hash: 0.0009765625
	# Classic Bloom: 0.010518744866970362
	# Correct Bloom: 0.0174705766201
	#
	# With 10 items and 10 bits per item:
	# Truncated hash: 0.009722821223700424
	# Classic Bloom: 0.008394807630049734
	# Correct Bloom: 0.008936311594679473
	#
	# With 100 items and 10 bits per item:
	# Truncated hash: 0.09308265650895885
	# Classic Bloom: 0.008213554634050216
	# Correct Bloom: 0.008266247514843566
	#
	# With 1000 items and 10 bits per item:
	# Truncated hash: 0.623576201943276
	# Classic Bloom: 0.008195702596768733
	#
	# With 10000 items and 10 bits per item:
	# Truncated hash: 0.9999428822983674
	# Classic Bloom: 0.008193920091727517