Created
May 20, 2012 17:10
-
-
Save MartinThoma/2758792 to your computer and use it in GitHub Desktop.
Sum of digits as hash
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
from math import log, ceil | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import matplotlib.mlab as mlab | |
try: | |
from pylab import * | |
except ImportError: | |
print("Please install pylab (sudo apt-get install python-matplotlib).") | |
raise | |
def getMaxValue(end): | |
return int(ceil(log(end)/log(10)) * 9 + 1) | |
def sumOfDigits(integer): | |
""" Calculate the sum of digits of a non-negative integer. """ | |
string = str(integer) | |
sumOfDigits = 0 | |
for digit in string: | |
sumOfDigits += int(digit) | |
return sumOfDigits | |
def hashCollisions(start, end): | |
""" | |
Get the number of hash collisons, if you take the sum of | |
digits as a hash. | |
""" | |
hashed = {} | |
maxValue = getMaxValue(end) | |
for s in xrange(0, maxValue): | |
hashed[s] = 0 | |
for i in xrange(start, end + 1): | |
s = sumOfDigits(i) | |
hashed[s] += 1 | |
return hashed | |
def stochastik(start, end, hashed): | |
""" Calculate the mean and the variance. """ | |
mean = 0.0 | |
variance = 0.0 | |
numberOfItems = end - start | |
squaredSum = 0.0 | |
for sumOfDigits, value in hashed.iteritems(): | |
probability = float(value) / numberOfItems | |
mean += float(sumOfDigits) * probability | |
squaredSum += float(sumOfDigits) * sumOfDigits * probability | |
variance = squaredSum - mean * mean | |
return (mean, variance) | |
def plotIt(hashed, end, mean, variance): | |
""" Plot the mapped hashes """ | |
# It's easy to see that the sum of digits is normal distributed | |
# with \mu = | |
maxValue = getMaxValue(end) | |
print maxValue | |
liste = [hashed[el] for el in xrange(0, maxValue)] | |
#plot(liste) | |
xlabel('Sum of digits') | |
ylabel('number of occurences') | |
title(r'$y=Number of hash values mapped to x$') | |
x = np.linspace(0,maxValue,100) | |
#plt.plot(x,mlab.normpdf(x,mean,variance)) | |
plt.plot(liste) | |
plt.show() | |
start = 0 | |
end = 1000000 | |
h = hashCollisions(start, end) | |
mean, variance = stochastik(start, end, h) | |
print("Mean: %f, Variance: %f" % (mean, variance)) | |
plotIt(h, end, mean, variance) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment