Simulate Truncated MD5 Digest Collisions for a Data Set
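
Here, "truncated MD5" means keeping only the leading bits of the full 128-bit MD5 digest. As a quick illustration (my addition, not part of the simulation scripts below, but using the same bit-string conversion they do), the snippet truncates one record's digest to 25 bits:

import hashlib

record = '42'  # the scripts below hash the decimal string of each record index
digest_hex = hashlib.md5(record.encode('utf8')).hexdigest()
# 128-bit digest as a zero-padded binary string; keep only the leading 25 bits
digest_bits = bin(int(digest_hex, 16))[2:].zfill(128)
print(digest_hex)        # full 32-hex-digit MD5 digest
print(digest_bits[:25])  # 25-bit truncated digest used as the bucket key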

On a single core

#!/usr/bin/env python3

'''Simulate the number of truncated hash (MD5) collisions for a data set.

data set size: num_records
truncated hash length in bits: truncated_size_in_bits
'''

import hashlib as hl

def truncated_digests(num_records, truncated_size_in_bits):
    '''Map each truncated digest to the list of record indices hashing to it.'''
    d = dict()
    for i in range(num_records):
        dgst = hl.md5(str(i).encode('utf8')).hexdigest()
        # 128-bit digest as a zero-padded binary string; keep only the leading bits
        dgst_binstr = bin(int(dgst, 16))[2:].zfill(128)
        truncated_dgst = dgst_binstr[:truncated_size_in_bits]
        d[truncated_dgst] = d[truncated_dgst] + [i] if truncated_dgst in d else [i]

    return d

if __name__ == '__main__':
    # Change this value for size of data set
    num_records = 170 * 1000 * 1000
    # Change this value for truncated hash digest length in bits
    truncated_size_in_bits = 25
    td = truncated_digests(num_records, truncated_size_in_bits)
    # keys as integers
    ks = [int(k, 2) for k in td]
    # values as frequency
    vs = [len(td[k]) for k in td]
    # max and min
    m = max(vs)
    n = min(vs)

    # print info
    print("num_records: ", num_records)
    print("truncated_size_in_bits: ", truncated_size_in_bits)
    print("size of dict", len(td))
    print('max collision: ', m)
    print('min collision: ', n)
    # save histogram data to file for plotting
    hist_file = '/tmp/hist.dat'
    with open(hist_file, 'w') as fh:
        fh.write('# hash        freq\n')
        for k, v in zip(ks, vs):
            fh.write('  %s        %s\n' % (k, v))
    print('Histogram data written to', hist_file)

Run the code

time python3 trunc_hash_collision_sim.py
num_records:  170000000
truncated_size_in_bits:  25
size of dict 33342649
max collision:  21
min collision:  1
Histogram data written to /tmp/hist.dat

real	28m57.544s
user	28m41.997s
sys	0m13.492s
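
As a rough sanity check (my addition, not part of the original gist), the observed bucket count can be compared with what ideal uniform 25-bit values would give. With m = 2**25 buckets and n = 170,000,000 records, the expected number of occupied buckets is m * (1 - (1 - 1/m)**n):

# Back-of-the-envelope occupancy estimate, assuming the truncated digests are uniform.
m = 2 ** 25            # number of possible 25-bit truncated digests
n = 170 * 1000 * 1000  # number of records

expected_occupied = m * (1 - (1 - 1 / m) ** n)   # expected distinct truncated digests
expected_extra = n - expected_occupied           # records that collide with an earlier one

print('expected occupied buckets:', round(expected_occupied))
print('expected colliding records:', round(expected_extra))

The estimate comes out close to the observed "size of dict 33342649" above, consistent with the truncated MD5 values behaving essentially like uniform random 25-bit strings.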

Plot a histogram of the colliding truncated hash values using gnuplot

gnuplot> set term png

Terminal type is now 'png'
Options are 'nocrop enhanced size 640,480 font "arial,12.0" '
gnuplot> set output "/tmp/hist.png"
gnuplot> set boxwidth 0.9 relative
gnuplot> set style fill solid 1.0
gnuplot> plot "/tmp/hist.dat" with boxes

The gnuplot-generated histogram image is attached to this gist.
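
If matplotlib is handier than gnuplot, the following sketch (my addition; it assumes matplotlib is installed, and the output path /tmp/hist_freq.png is arbitrary) reads the same /tmp/hist.dat but plots a frequency-of-frequencies summary, i.e. how many truncated digests were hit once, twice, and so on, which is much lighter to render than one box per hash value:

#!/usr/bin/env python3
# Summarize /tmp/hist.dat as a frequency-of-frequencies chart (assumes matplotlib is installed).
from collections import Counter
import matplotlib.pyplot as plt

freqs = []
with open('/tmp/hist.dat') as fh:
    for line in fh:
        if line.startswith('#'):
            continue  # skip the header line written by the simulation script
        _hash_value, freq = line.split()
        freqs.append(int(freq))

counts = Counter(freqs)  # how many truncated digests were hit 1, 2, ... times
plt.bar(list(counts.keys()), list(counts.values()))
plt.yscale('log')  # bucket counts span several orders of magnitude
plt.xlabel('records per truncated digest')
plt.ylabel('number of truncated digests')
plt.savefig('/tmp/hist_freq.png')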

Use multiple cores

#!/usr/bin/env python3

'''
Simulate the number of truncated hash (MD5) collisions for a data set.
Multi-process support.

'''

import hashlib as hl
import os
from multiprocessing import Pool, cpu_count
from functools import partial

def trunc_md5(msg, trunc_size_in_bits):
    '''msg's MD5 digest truncated to trunc_size_in_bits bits
    '''
    dgst = hl.md5(msg.encode('utf8')).hexdigest()
    dgst_binstr = bin(int(dgst, 16))[2:].zfill(128)
    truncated_dgst = dgst_binstr[:trunc_size_in_bits]
    return truncated_dgst

def collisions_dict(trunc_md5_list):
    '''Count how many records map to each truncated digest.'''
    d = dict()
    for v in trunc_md5_list:
        d[v] = d[v] + 1 if v in d else 1

    return d

if __name__ == '__main__':
    # Change this value for size of data set
    num_records = 170 * 1000 * 1000
    # Change this value for truncated hash digest length in bits
    truncated_size_in_bits = 25
    ncpu = cpu_count()
    ncpu_used = max(ncpu - 2, 1)
    print('have', ncpu, 'CPU(s)')
    print('using', ncpu_used, 'CPU(s)')

    with Pool(processes=ncpu_used) as pool:
        tml_it = pool.imap_unordered(
            partial(trunc_md5, trunc_size_in_bits=truncated_size_in_bits),
            [str(i) for i in range(num_records)],
            chunksize=1000)
        td = collisions_dict(tml_it)  # consume the iterator directly; no intermediate list

    # keys as integers
    ks = [int(k, 2) for k in td]
    # values as frequency
    vs = [td[k] for k in td]
    # max and min
    m = max(vs)
    n = min(vs)

    # print info
    print("num_records: ", num_records)
    print("truncated_size_in_bits: ", truncated_size_in_bits)
    print("size of dict", len(td))
    print('max collision: ', m)
    print('min collision: ', n)
    # save histogram data to file for plotting
    hist_file = '/tmp/hist.dat'
    with open(hist_file, 'w') as fh:
        fh.write('# hash        freq\n')
        for k, v in zip(ks, vs):
            fh.write('  %s        %s\n' % (k, v))
    print('Histogram data written to', hist_file)

Run the code

$ time python3 trunc_hash_collisions_sim_mp.py 
have 32 CPU(s)
using 30 CPU(s)
num_records:  170000000
truncated_size_in_bits:  25
size of dict 33342649
max collision:  21
min collision:  1
Histogram data written to /tmp/hist.dat

real    5m3.022s
user    23m23.980s
sys     0m51.092s
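
The imap_unordered approach above ships every truncated digest string back to the parent process and only counts them there. If only the counts matter (not the lists of colliding record indices), a layout like the following sketch, which is my own variant rather than part of the gist, pushes the counting into the workers and merges per-worker Counters in the parent; it reuses the gist's "cpu_count() - 2" worker choice:

#!/usr/bin/env python3
# Sketch of an alternative parallel layout (not the gist's code): each worker counts
# truncated digests for a contiguous range of record indices and returns one Counter,
# so only per-worker dictionaries cross process boundaries instead of 170M digest strings.
import hashlib as hl
from collections import Counter
from multiprocessing import Pool, cpu_count

TRUNC_BITS = 25  # truncated digest length in bits

def count_range(bounds):
    lo, hi = bounds
    c = Counter()
    for i in range(lo, hi):
        dgst = hl.md5(str(i).encode('utf8')).hexdigest()
        bits = bin(int(dgst, 16))[2:].zfill(128)
        c[bits[:TRUNC_BITS]] += 1
    return c

if __name__ == '__main__':
    num_records = 170 * 1000 * 1000
    nproc = max(cpu_count() - 2, 1)
    step = (num_records + nproc - 1) // nproc
    ranges = [(lo, min(lo + step, num_records)) for lo in range(0, num_records, step)]

    total = Counter()
    with Pool(processes=nproc) as pool:
        for partial_counts in pool.imap_unordered(count_range, ranges):
            total.update(partial_counts)  # Counter.update adds counts together

    print('size of dict', len(total))
    print('max collision:', max(total.values()))
    print('min collision:', min(total.values()))

The trade-off is that each worker holds its own Counter in memory, but far less data is pickled back to the parent process.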