Create a gist now

Instantly share code, notes, and snippets.

Hash-behavior simulation in Python
#!/usr/bin/env python
# coding: utf-8
import sys
import hashlib
import math
def count_collision(nbucket, nkey):
    """Hash the keys 0..nkey-1 into nbucket buckets and report occupancy.

    Prints, for each occupancy count n, how many buckets hold exactly n
    keys (and that count as a percentage of all buckets).

    Returns the tally as a dict mapping occupancy -> number of buckets
    (new, backward-compatible: the original returned None).
    """
    buckets = [0] * nbucket
    for i in range(nkey):
        # Derive the bucket index from the SHA-1 digest bytes directly.
        # The original used hash(digest); hash() of bytes is randomized
        # per process (PYTHONHASHSEED), which made runs irreproducible.
        # str(i).encode() also keeps sha1() working on Python 3, where it
        # requires bytes.
        digest = hashlib.sha1(str(i).encode('ascii')).digest()
        buckets[int.from_bytes(digest, 'big') % nbucket] += 1
    counter = {}
    for n in buckets:
        counter[n] = counter.get(n, 0) + 1
    # Iterate in ascending occupancy, matching the original's ordering
    # (it sorted the bucket list before tallying).
    for n, b in sorted(counter.items()):
        print(b, "buckets with", n, "keys",
              str(int(float(b) / nbucket * 100)) + '%')
    return counter
if __name__ == "__main__":
    # usage: script.py NBUCKET NKEY
    nbucket_arg, nkey_arg = sys.argv[1:]
    count_collision(int(nbucket_arg), int(nkey_arg))
#!/usr/bin/env python2.6
# coding: utf-8
import sys
import math
import hashlib
def normal_pdf(x, mu, sigma):
    """Probability density of the normal distribution N(mu, sigma**2) at x."""
    xf = float(x)
    muf = float(mu)
    # 1 / (sigma * sqrt(2*pi)) * exp(-(x - mu)^2 / (2 * sigma^2))
    scale = 1.0 / math.sqrt(2 * math.pi) / sigma
    return scale * math.exp(-(xf - muf) ** 2 / (2 * sigma * sigma))
def normal_cdf(x, mu, sigma):
    """Cumulative distribution of N(mu, sigma**2): integral from -oo to x.

    Uses the Taylor expansion
        Phi(z) = 1/2 + phi(z) * sum_{k>=0} z^(2k+1) / (1*3*5*...*(2k+1))
    evaluated on the standardized value z = (x - mu) / sigma, truncated
    at 100 terms.
    """
    z = (float(x) - float(mu)) / float(sigma)
    total = z
    term = z
    for k in range(1, 100):
        # Next series term: multiply by z^2 / (2k + 1).
        term = term * z * z / (2 * k + 1)
        total += term
    return 0.5 + total / (2 * math.pi) ** 0.5 * math.e ** (-z * z / 2)
def difference(nbucket, nkey):
    """Estimate the minimum bucket occupancy and its relative difference
    from the mean, via the normal approximation of the binomial.

    Each key lands in a given bucket with probability p = 1/nbucket, so
    one bucket's occupancy is Binomial(nkey, p), approximated here by
    N(mu, sigma) with mu = nkey*p, sigma = sqrt(nkey*p*(1-p)).

    We search for the occupancy `minimal` whose CDF is about 1/nbucket:
    roughly one bucket in the table is expected to hold that few keys
    or fewer.

    Returns (minimal, (mu - minimal) * 2 / (mu + (mu - minimal))).
    """
    nbucket, nkey = int(nbucket), int(nkey)
    p = 1.0 / nbucket
    mu = nkey * p
    sigma = math.sqrt(nkey * p * (1 - p))
    target = 1.0 / nbucket
    minimal = mu
    while minimal > 0:
        xx = normal_cdf(minimal, mu, sigma)
        # Stop inside the 10% tolerance window, or as soon as the CDF
        # falls below the target: stepping down by 1 can jump straight
        # over the window, and the original unconditional `while True`
        # loop would then decrement forever and never terminate.
        if xx <= target or abs(xx - target) < target / 10:
            break
        minimal -= 1
    return minimal, (mu - minimal) * 2 / (mu + (mu - minimal))
def difference_simulation(nbucket, nkey):
    """Simulate hashing nkey keys into nbucket buckets and measure spread.

    Returns (min_occupancy, (max - min) / max): the emptiest bucket's
    key count and the relative difference between fullest and emptiest.
    """
    nbucket, nkey = int(nbucket), int(nkey)
    buckets = [0] * nbucket
    for i in range(nkey):
        # Index by the SHA-1 digest bytes directly; the original's
        # hash(digest) is randomized per process on Python 3
        # (PYTHONHASHSEED), making the simulation irreproducible, and
        # sha1(str(i)) rejects str on Python 3.
        digest = hashlib.sha1(str(i).encode('ascii')).digest()
        buckets[int.from_bytes(digest, 'big') % nbucket] += 1
    buckets.sort()
    nmin, nmax = buckets[0], buckets[-1]
    if nmax == 0:
        # No keys at all: the original divided by zero here.
        return 0, 0.0
    return nmin, float(nmax - nmin) / nmax
if __name__ == "__main__":
    # usage: script.py NBUCKET NKEY
    nbucket, nkey = sys.argv[1:]
    # sys.stdout.write keeps this runnable on both Python 2 and 3; the
    # original Python-2-only print statements are a syntax error on 3.
    minimal, rate = difference(nbucket, nkey)
    sys.stdout.write('by normal distribution:\n')
    sys.stdout.write(' min_bucket: %s\n' % minimal)
    sys.stdout.write(' difference: %s\n' % rate)
    minimal, rate = difference_simulation(nbucket, nkey)
    sys.stdout.write('by simulation:\n')
    sys.stdout.write(' min_bucket: %s\n' % minimal)
    sys.stdout.write(' difference: %s\n' % rate)
#!/usr/bin/env python
# coding: utf-8
import sys
import hashlib
import math
def make_buckets(nbucket, nkey):
    """Hash the keys 0..nkey-1 into nbucket buckets and return the
    ascending-sorted list of per-bucket occupancy counts."""
    buckets = [0] * nbucket
    for i in range(nkey):
        # Take the bucket index from the SHA-1 digest itself rather than
        # hash(digest): hash() of bytes is randomized per process
        # (PYTHONHASHSEED), which made runs irreproducible; encoding the
        # key also keeps sha1() working on Python 3.
        digest = hashlib.sha1(str(i).encode('ascii')).digest()
        buckets[int.from_bytes(digest, 'big') % nbucket] += 1
    buckets.sort()
    return buckets
def uniformity(nbucket, nkey, nlines=20):
    """Print, for each of nlines consecutive groups of buckets, the
    group's average occupancy as a percentage of the overall average.

    Assumes nbucket >= nlines and nkey >= nbucket, so neither integer
    average is zero (same implicit requirement as the original).
    """
    # Floor divisions: the original relied on Python 2 integer `/`;
    # on Python 3 true division yields floats that crash the slicing
    # below, so `//` is required to preserve the py2 semantics.
    avg = nkey // nbucket       # average keys per bucket
    nb = nbucket // nlines      # buckets summarized per printed line
    buckets = make_buckets(nbucket, nkey)
    for i in range(nlines):
        group_avg = sum(buckets[i * nb: i * nb + nb]) // nb
        # write() instead of the Python-2-only print statement.
        sys.stdout.write('%02.2f%%\n' % (float(group_avg * 100) / avg))
if __name__ == "__main__":
    # usage: script.py NBUCKET NKEY
    nbucket_arg, nkey_arg = sys.argv[1:]
    uniformity(int(nbucket_arg), int(nkey_arg))
#!/usr/bin/env python
# coding: utf-8
import math
import sys
def pempty(x):
    """Probability a bucket is empty at load factor x (Poisson model): e**-x."""
    return math.e ** (-x)


def pone(x):
    """Probability a bucket holds exactly one key: x * e**-x."""
    return x * math.e ** (-x)


def pcollision(x):
    """Probability a bucket holds two or more keys (a collision):
    1 - (1 + x) * e**-x."""
    return 1 - (1 + x) * math.e ** (-x)


def perc(x):
    """Format probability x as a zero-padded whole-number percentage."""
    return '%02.0f%%' % (x * 100)
if __name__ == "__main__":
    # usage: script.py LOADFACTOR [LOADFACTOR ...]
    loadfactors = [float(x) for x in sys.argv[1:]]
    for lf in loadfactors:
        # sys.stdout.write keeps this runnable on both Python 2 and 3;
        # the original Python-2-only print statement is a syntax error
        # on 3.
        sys.stdout.write(
            'load factor: %02.2f, empty: %s, 1-key: %s, collision: %s\n' % (
                lf,
                perc(pempty(lf)),
                perc(pone(lf)),
                perc(pcollision(lf)),
            ))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment