Skip to content

Instantly share code, notes, and snippets.

@MartinThoma
Created May 20, 2012 17:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MartinThoma/2758792 to your computer and use it in GitHub Desktop.
Save MartinThoma/2758792 to your computer and use it in GitHub Desktop.
Sum of digits as hash
#!/usr/bin/python
# -*- coding: utf-8 -*-
from math import log, ceil
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.mlab as mlab
try:
from pylab import *
except ImportError:
print("Please install pylab (sudo apt-get install python-matplotlib).")
raise
def getMaxValue(end):
return int(ceil(log(end)/log(10)) * 9 + 1)
def sumOfDigits(integer):
""" Calculate the sum of digits of a non-negative integer. """
string = str(integer)
sumOfDigits = 0
for digit in string:
sumOfDigits += int(digit)
return sumOfDigits
def hashCollisions(start, end):
"""
Get the number of hash collisons, if you take the sum of
digits as a hash.
"""
hashed = {}
maxValue = getMaxValue(end)
for s in xrange(0, maxValue):
hashed[s] = 0
for i in xrange(start, end + 1):
s = sumOfDigits(i)
hashed[s] += 1
return hashed
def stochastik(start, end, hashed):
""" Calculate the mean and the variance. """
mean = 0.0
variance = 0.0
numberOfItems = end - start
squaredSum = 0.0
for sumOfDigits, value in hashed.iteritems():
probability = float(value) / numberOfItems
mean += float(sumOfDigits) * probability
squaredSum += float(sumOfDigits) * sumOfDigits * probability
variance = squaredSum - mean * mean
return (mean, variance)
def plotIt(hashed, end, mean, variance):
""" Plot the mapped hashes """
# It's easy to see that the sum of digits is normal distributed
# with \mu =
maxValue = getMaxValue(end)
print maxValue
liste = [hashed[el] for el in xrange(0, maxValue)]
#plot(liste)
xlabel('Sum of digits')
ylabel('number of occurences')
title(r'$y=Number of hash values mapped to x$')
x = np.linspace(0,maxValue,100)
#plt.plot(x,mlab.normpdf(x,mean,variance))
plt.plot(liste)
plt.show()
start = 0
end = 1000000
h = hashCollisions(start, end)
mean, variance = stochastik(start, end, h)
print("Mean: %f, Variance: %f" % (mean, variance))
plotIt(h, end, mean, variance)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment