hash behavior simulation in python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
import sys | |
import hashlib | |
import math | |
def count_collision(nbucket, nkey): | |
buckets = [0] * nbucket | |
for i in range(nkey): | |
hsh = hashlib.sha1(str(i)).digest() | |
buckets[hash(hsh) % nbucket] += 1 | |
buckets.sort() | |
counter = {} | |
for n in buckets: | |
if n not in counter: | |
counter[n] = 0 | |
counter[n] += 1 | |
for n, b in counter.items(): | |
print b, "buckets with", n, "keys", str(int(float(b)/nbucket*100)) + '%' | |
if __name__ == "__main__": | |
b, k = sys.argv[1:] | |
b, k = int(b), int(k) | |
count_collision( b, k) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2.6 | |
# coding: utf-8 | |
import sys | |
import math | |
import hashlib | |
def normal_pdf(x, mu, sigma): | |
x = float(x) | |
mu = float(mu) | |
m = 1.0 / math.sqrt( 2 * math.pi ) / sigma | |
n = math.exp(-(x-mu)**2 / (2*sigma*sigma)) | |
return m * n | |
def normal_cdf(x, mu, sigma): | |
# integral(-oo,x) | |
x = float(x) | |
mu = float(mu) | |
sigma = float(sigma) | |
# to standard form | |
x = (x - mu) / sigma | |
s = x | |
v = x | |
for i in range(1, 100): | |
v = v * x * x / (2*i+1) | |
s += v | |
return 0.5 + s/(2*math.pi)**0.5 * math.e ** (-x*x/2) | |
def difference(nbucket, nkey): | |
nbucket, nkey= int(nbucket), int(nkey) | |
# binomial distribution approximation by normal distribution | |
# find the bucket with minimal keys. | |
# | |
# the probability that a bucket has exactly i keys is: | |
# # probability density function | |
# normal_pdf(i, mu, sigma) | |
# | |
# the probability that a bucket has 0 ~ i keys is: | |
# # cumulative distribution function | |
# normal_cdf(i, mu, sigma) | |
# | |
# if the probability that a bucket has 0 ~ i keys is greater than 1/nbucket, we | |
# say there will be a bucket in hash table has: | |
# (i_0*p_0 + i_1*p_1 + ...)/(p_0 + p_1 + ..) keys. | |
p = 1.0 / nbucket | |
mu = nkey * p | |
sigma = math.sqrt(nkey * p * (1-p)) | |
target = 1.0 / nbucket | |
minimal = mu | |
while True: | |
xx = normal_cdf(minimal, mu, sigma) | |
if abs(xx-target) < target/10: | |
break | |
minimal -= 1 | |
return minimal, (mu-minimal) * 2 / (mu + (mu - minimal)) | |
def difference_simulation(nbucket, nkey): | |
nbucket, nkey= int(nbucket), int(nkey) | |
buckets = [0] * nbucket | |
for i in range(nkey): | |
hsh = hashlib.sha1(str(i)).digest() | |
buckets[hash(hsh) % nbucket] += 1 | |
buckets.sort() | |
nmin, mmax = buckets[0], buckets[-1] | |
return nmin, float(mmax - nmin) / mmax | |
if __name__ == "__main__": | |
nbucket, nkey= sys.argv[1:] | |
minimal, rate = difference(nbucket, nkey) | |
print 'by normal distribution:' | |
print ' min_bucket:', minimal | |
print ' difference:', rate | |
minimal, rate = difference_simulation(nbucket, nkey) | |
print 'by simulation:' | |
print ' min_bucket:', minimal | |
print ' difference:', rate |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
import sys | |
import hashlib | |
import math | |
def make_buckets(nbucket, nkey): | |
buckets = [0] * nbucket | |
for i in range(nkey): | |
hsh = hashlib.sha1(str(i)).digest() | |
buckets[hash(hsh) % nbucket] += 1 | |
buckets.sort() | |
return buckets | |
def uniformity(nbucket, nkey, nlines=20): | |
avg = nkey / nbucket | |
nb = nbucket / nlines | |
buckets = make_buckets(nbucket, nkey) | |
for i in range(nlines): | |
n = sum(buckets[i*nb : i*nb+nb]) / nb | |
print '%02.2f%%' % ( float(n*100) / avg ) | |
if __name__ == "__main__": | |
b, k = sys.argv[1:] | |
b, k = int(b), int(k) | |
uniformity( b, k) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
import math | |
import sys | |
pempty = lambda x: math.e ** (-x) | |
pone = lambda x: x * math.e ** (-x) | |
pcollision = lambda x: 1 - (1+x) * math.e ** (-x) | |
perc = lambda x: '%02.0f%%' % (x*100) | |
if __name__ == "__main__": | |
loadfactors = [float(x) for x in sys.argv[1:]] | |
for lf in loadfactors: | |
print 'load factor: %02.2f, empty: %s, 1-key: %s, collision: %s' % ( | |
lf, | |
perc(pempty(lf)), | |
perc(pone(lf)), | |
perc(pcollision(lf)), | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment