#!/usr/bin/env python3
'''Simulate the number of truncated hash (MD5) collisions for a data set.
data set size: num_records
truncated hash length in bits: truncated_size_in_bits
'''
import hashlib as hl
def truncated_digests(num_records, truncated_size_in_bits):
d = dict()
for i in range(num_records):
dgst = hl.md5(str(i).encode('utf8')).hexdigest()
dgst_binstr = bin(int(dgst, 16))[2:].zfill(128)
truncated_dgst = dgst_binstr[:truncated_size_in_bits]
d[truncated_dgst] = d[truncated_dgst] + [i] if truncated_dgst in d else [i]
return d
if __name__ == '__main__':
# Change this value for size of data set
num_records = 170 * 1000 * 1000
# Change this value for truncated hash digest in bytes
truncated_size_in_bits = 25
td = truncated_digests(num_records, truncated_size_in_bits)
# keys as integers
ks = [int(k, 2) for k in td]
# values as frequency
vs = [len(td[k]) for k in td]
# max and min
m = max(vs)
n = min(vs)
# print info
print("num_records: ", num_records)
print("truncated_size_in_bits: ", truncated_size_in_bits)
print("size of dict", len(td))
print('max collision: ', m)
print('min collision: ', n)
# save histogram data to file for plotting
hist_file = '/tmp/hist.dat'
with open(hist_file, 'w') as fh:
fh.write('# hash freq\n')
for k, v in zip(ks, vs):
fh.write(' %s %s\n' % (k, v))
print('Histogram data written to /tmp/hist.dat')
Run the code
time python3 trunc_hash_collision_sim.py
num_records: 170000000
truncated_size_in_bits: 25
size of dict 33342649
max collision: 21
min collision: 1
Histogram data written to /tmp/hist.dat
real 28m57.544s
user 28m41.997s
sys 0m13.492s
Plot histogram of colliding truncated hash values using gnuplot
gnuplot> set term png
Terminal type is now 'png'
Options are 'nocrop enhanced size 640,480 font "arial,12.0" '
gnuplot> set output "/tmp/hist.png"
gnuplot> set boxwidth 0.9 relative
gnuplot> set style fill solid 1.0
gnuplot> plot "/tmp/hist.dat" with boxes
The gnuplot generated histogram image is attached to this gist.
#!/usr/bin/env python3
'''
Simulate the number of truncated hash (MD5) collisions for a data set.
Multi-process support.
'''
import hashlib as hl
import os
from multiprocessing import Pool, cpu_count
from functools import partial
def trunc_md5(msg, trunc_size_in_bits):
'''msg's MD5 digest truncated to trunc_size_in_bits bits
'''
dgst = hl.md5(msg.encode('utf8')).hexdigest()
dgst_binstr = bin(int(dgst, 16))[2:].zfill(128)
truncated_dgst = dgst_binstr[:trunc_size_in_bits]
return truncated_dgst
def collisions_dict(trunc_md5_list):
d = dict()
for v in trunc_md5_list:
d[v] = d[v] + 1 if v in d else 1
return d
if __name__ == '__main__':
# Change this value for size of data set
num_records = 170 * 1000 * 1000
# Change this value for truncated hash digest in bytes
truncated_size_in_bits = 25
ncpu = cpu_count()
ncpu_used = max(ncpu - 2, 1)
print('have', ncpu, 'CPU(s)')
print('using', ncpu_used, 'CPU(s)')
with Pool(processes=ncpu_used) as pool:
tml_it = pool.imap_unordered(
partial(trunc_md5, trunc_size_in_bits=truncated_size_in_bits),
[str(i) for i in range(num_records)],
chunksize=1000)
td = collisions_dict([v for v in tml_it])
# keys as integers
ks = [int(k, 2) for k in td]
# values as frequency
vs = [td[k] for k in td]
# max and min
m = max(vs)
n = min(vs)
# print info
print("num_records: ", num_records)
print("truncated_size_in_bits: ", truncated_size_in_bits)
print("size of dict", len(td))
print('max collision: ', m)
print('min collision: ', n)
# save histogram data to file for plotting
hist_file = '/tmp/hist.dat'
with open(hist_file, 'w') as fh:
fh.write('# hash freq\n')
for k, v in zip(ks, vs):
fh.write(' %s %s\n' % (k, v))
print('Histogram data written to /tmp/hist.dat')
Run the code
$ time python3 trunc_hash_collisions_sim_mp.py
have 32 CPU(s)
using 30 CPU(s)
num_records: 170000000
truncated_size_in_bits: 25
size of dict 33342649
max collision: 21
min collision: 1
Histogram data written to /tmp/hist.dat
real 5m3.022s
user 23m23.980s
sys 0m51.092s