Create a gist now

Instantly share code, notes, and snippets.

Hash-behavior simulation in Python
#!/usr/bin/env python
# coding: utf-8
import sys
import hashlib
import math
def count_collision(nbucket, nkey):
    """Hash the keys 0..nkey-1 into nbucket buckets and report occupancy.

    Prints, for each occupancy count n, how many buckets hold exactly n
    keys (and that count as a percentage of all buckets).

    Returns the tally as a dict mapping occupancy -> number of buckets
    (new, backward-compatible: the original returned None).
    """
    buckets = [0] * nbucket
    for i in range(nkey):
        # Derive the bucket index from the SHA-1 digest bytes directly.
        # The original used hash(digest); hash() of bytes is randomized
        # per process (PYTHONHASHSEED), which made runs irreproducible.
        # str(i).encode() also keeps sha1() working on Python 3, where it
        # requires bytes.
        digest = hashlib.sha1(str(i).encode('ascii')).digest()
        buckets[int.from_bytes(digest, 'big') % nbucket] += 1
    counter = {}
    for n in buckets:
        counter[n] = counter.get(n, 0) + 1
    # Iterate in ascending occupancy, matching the original's ordering
    # (it sorted the bucket list before tallying).
    for n, b in sorted(counter.items()):
        print(b, "buckets with", n, "keys",
              str(int(float(b) / nbucket * 100)) + '%')
    return counter
if __name__ == "__main__":
    # usage: script.py NBUCKET NKEY
    nbucket_arg, nkey_arg = sys.argv[1:]
    count_collision(int(nbucket_arg), int(nkey_arg))
#!/usr/bin/env python2.6
# coding: utf-8
import sys
import math
import hashlib
def normal_pdf(x, mu, sigma):
    """Probability density of the normal distribution N(mu, sigma**2) at x."""
    xf = float(x)
    muf = float(mu)
    # 1 / (sigma * sqrt(2*pi)) * exp(-(x - mu)^2 / (2 * sigma^2))
    scale = 1.0 / math.sqrt(2 * math.pi) / sigma
    return scale * math.exp(-(xf - muf) ** 2 / (2 * sigma * sigma))
def normal_cdf(x, mu, sigma):
    """Cumulative distribution of N(mu, sigma**2): integral from -oo to x.

    Uses the Taylor expansion
        Phi(z) = 1/2 + phi(z) * sum_{k>=0} z^(2k+1) / (1*3*5*...*(2k+1))
    evaluated on the standardized value z = (x - mu) / sigma, truncated
    at 100 terms.
    """
    z = (float(x) - float(mu)) / float(sigma)
    total = z
    term = z
    for k in range(1, 100):
        # Next series term: multiply by z^2 / (2k + 1).
        term = term * z * z / (2 * k + 1)
        total += term
    return 0.5 + total / (2 * math.pi) ** 0.5 * math.e ** (-z * z / 2)
def difference(nbucket, nkey):
    """Estimate the minimum bucket occupancy and its relative difference
    from the mean, via the normal approximation of the binomial.

    Each key lands in a given bucket with probability p = 1/nbucket, so
    one bucket's occupancy is Binomial(nkey, p), approximated here by
    N(mu, sigma) with mu = nkey*p, sigma = sqrt(nkey*p*(1-p)).

    We search for the occupancy `minimal` whose CDF is about 1/nbucket:
    roughly one bucket in the table is expected to hold that few keys
    or fewer.

    Returns (minimal, (mu - minimal) * 2 / (mu + (mu - minimal))).
    """
    nbucket, nkey = int(nbucket), int(nkey)
    p = 1.0 / nbucket
    mu = nkey * p
    sigma = math.sqrt(nkey * p * (1 - p))
    target = 1.0 / nbucket
    minimal = mu
    while minimal > 0:
        xx = normal_cdf(minimal, mu, sigma)
        # Stop inside the 10% tolerance window, or as soon as the CDF
        # falls below the target: stepping down by 1 can jump straight
        # over the window, and the original unconditional `while True`
        # loop would then decrement forever and never terminate.
        if xx <= target or abs(xx - target) < target / 10:
            break
        minimal -= 1
    return minimal, (mu - minimal) * 2 / (mu + (mu - minimal))
def difference_simulation(nbucket, nkey):
    """Simulate hashing nkey keys into nbucket buckets and measure spread.

    Returns (min_occupancy, (max - min) / max): the emptiest bucket's
    key count and the relative difference between fullest and emptiest.
    """
    nbucket, nkey = int(nbucket), int(nkey)
    buckets = [0] * nbucket
    for i in range(nkey):
        # Index by the SHA-1 digest bytes directly; the original's
        # hash(digest) is randomized per process on Python 3
        # (PYTHONHASHSEED), making the simulation irreproducible, and
        # sha1(str(i)) rejects str on Python 3.
        digest = hashlib.sha1(str(i).encode('ascii')).digest()
        buckets[int.from_bytes(digest, 'big') % nbucket] += 1
    buckets.sort()
    nmin, nmax = buckets[0], buckets[-1]
    if nmax == 0:
        # No keys at all: the original divided by zero here.
        return 0, 0.0
    return nmin, float(nmax - nmin) / nmax
if __name__ == "__main__":
    # usage: script.py NBUCKET NKEY
    nbucket, nkey = sys.argv[1:]
    # sys.stdout.write keeps this runnable on both Python 2 and 3; the
    # original Python-2-only print statements are a syntax error on 3.
    minimal, rate = difference(nbucket, nkey)
    sys.stdout.write('by normal distribution:\n')
    sys.stdout.write(' min_bucket: %s\n' % minimal)
    sys.stdout.write(' difference: %s\n' % rate)
    minimal, rate = difference_simulation(nbucket, nkey)
    sys.stdout.write('by simulation:\n')
    sys.stdout.write(' min_bucket: %s\n' % minimal)
    sys.stdout.write(' difference: %s\n' % rate)
#!/usr/bin/env python
# coding: utf-8
import sys
import hashlib
import math
def make_buckets(nbucket, nkey):
    """Hash the keys 0..nkey-1 into nbucket buckets and return the
    ascending-sorted list of per-bucket occupancy counts."""
    buckets = [0] * nbucket
    for i in range(nkey):
        # Take the bucket index from the SHA-1 digest itself rather than
        # hash(digest): hash() of bytes is randomized per process
        # (PYTHONHASHSEED), which made runs irreproducible; encoding the
        # key also keeps sha1() working on Python 3.
        digest = hashlib.sha1(str(i).encode('ascii')).digest()
        buckets[int.from_bytes(digest, 'big') % nbucket] += 1
    buckets.sort()
    return buckets
def uniformity(nbucket, nkey, nlines=20):
    """Print, for each of nlines consecutive groups of buckets, the
    group's average occupancy as a percentage of the overall average.

    Assumes nbucket >= nlines and nkey >= nbucket, so neither integer
    average is zero (same implicit requirement as the original).
    """
    # Floor divisions: the original relied on Python 2 integer `/`;
    # on Python 3 true division yields floats that crash the slicing
    # below, so `//` is required to preserve the py2 semantics.
    avg = nkey // nbucket       # average keys per bucket
    nb = nbucket // nlines      # buckets summarized per printed line
    buckets = make_buckets(nbucket, nkey)
    for i in range(nlines):
        group_avg = sum(buckets[i * nb: i * nb + nb]) // nb
        # write() instead of the Python-2-only print statement.
        sys.stdout.write('%02.2f%%\n' % (float(group_avg * 100) / avg))
if __name__ == "__main__":
    # usage: script.py NBUCKET NKEY
    nbucket_arg, nkey_arg = sys.argv[1:]
    uniformity(int(nbucket_arg), int(nkey_arg))
#!/usr/bin/env python
# coding: utf-8
import math
import sys
def pempty(x):
    """Probability a bucket is empty at load factor x (Poisson model): e**-x."""
    return math.e ** (-x)


def pone(x):
    """Probability a bucket holds exactly one key: x * e**-x."""
    return x * math.e ** (-x)


def pcollision(x):
    """Probability a bucket holds two or more keys (a collision):
    1 - (1 + x) * e**-x."""
    return 1 - (1 + x) * math.e ** (-x)


def perc(x):
    """Format probability x as a zero-padded whole-number percentage."""
    return '%02.0f%%' % (x * 100)
if __name__ == "__main__":
    # usage: script.py LOADFACTOR [LOADFACTOR ...]
    loadfactors = [float(x) for x in sys.argv[1:]]
    for lf in loadfactors:
        # sys.stdout.write keeps this runnable on both Python 2 and 3;
        # the original Python-2-only print statement is a syntax error
        # on 3.
        sys.stdout.write(
            'load factor: %02.2f, empty: %s, 1-key: %s, collision: %s\n' % (
                lf,
                perc(pempty(lf)),
                perc(pone(lf)),
                perc(pcollision(lf)),
            ))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment