pdemarti/line_order_invariant_hash.py

## line_order_invariant_hash.py
#---- In[ ]:
%load_ext autoreload
%load_ext Cython
%autoreload 2
%matplotlib inline

import matplotlib.pyplot as plt
import random
import string

#---- In[ ]:
#: # Make a random file for testing

#---- In[ ]:
def write_random_file(filename, n_lines):
    with open(filename, 'w') as f:
        for i in range(n_lines):
            name = ''.join(random.sample(string.ascii_lowercase, 8))
            x = random.gauss(0, 1)
            print('{},{:.8f}'.format(name, x), file=f)

#---- In[ ]:
#: # Simple digests (sensitive to line order)

#---- In[ ]:
import hashlib
def get_digest_sha1(filename):
    hasher = hashlib.sha1()
    blocksize = 65536
    with open(filename, 'rb') as f:
        while True:
            buf = f.read(blocksize)
            hasher.update(buf)
            if len(buf) < blocksize:
                break
    return hasher.hexdigest()

#---- In[ ]:
import hashlib
def get_digest_sha1_by_lines(filename):
    hasher = hashlib.sha1()
    blocksize = 65536
    with open(filename, 'rb') as f:
        for line in f:
            hasher.update(line)
    return hasher.hexdigest()

#---- In[ ]:
import xxhash
def get_digest_xxh64_order_sensitive(filename):
    h0 = xxhash.xxh64(seed=0)
    h1 = xxhash.xxh64(seed=1)
    blocksize = 65536
    with open(filename, 'rb') as f:
        while True:
            buf = f.read(blocksize)
            h0.update(buf)
            h1.update(buf)
            if len(buf) < blocksize:
                break
    return '{:032x}'.format((h0.intdigest() << 64) + h1.intdigest())

#---- In[ ]:
#: # Line order invariant digests

#---- In[ ]:
import hashlib
def get_digest_sha1_sort(filename):
    hasher = hashlib.sha1()
    with open(filename, 'r') as f:
        for line in sorted(f):
            hasher.update(line.encode())
    return hasher.hexdigest()

#---- In[ ]:
import hashlib
def get_digest_sha1_sort_binary(filename):
    hasher = hashlib.sha1()
    with open(filename, 'rb') as f:
        for line in sorted(f):
            hasher.update(line)
    return hasher.hexdigest()

#---- In[ ]:
def get_digest_hash_frozenset64(filename):
    with open(filename, 'rb') as f:
        frz = frozenset(f.readlines())
    return '{:032x}'.format(hash(frz))

#---- In[ ]:
def get_digest_hash_frozenset128(filename):
    with open(filename, 'rb') as f:
        frz = frozenset(f.readlines())
    return '{:032x}'.format(((hash(frz) << 64) + hash(frz.union('not a line'))) & 0xffffffffffffffffffffffffffffffff)

#---- In[ ]:
import mmh3
def get_digest_mmh3_128_add(filename):
    a = 0
    with open(filename, 'rb') as f:
        for line in f:
            a += mmh3.hash128(line)
    return '{:032x}'.format(a & 0xffffffffffffffffffffffffffffffff)

#---- In[ ]:
%%cython
import mmh3
def get_digest_mmh3_128_add_cy(filename):
    a = 0
    with open(filename, 'rb') as f:
        for line in f:
            a += mmh3.hash128(line)
    return '{:032x}'.format(a & 0xffffffffffffffffffffffffffffffff)

#---- In[ ]:
#: # Tests

#---- In[ ]:
import pandas as pd
import numpy as np

#---- In[ ]:
funs = [
    get_digest_sha1,
    get_digest_sha1_by_lines,
    get_digest_xxh64_order_sensitive,
    get_digest_sha1_sort,
    get_digest_sha1_sort_binary,
    get_digest_hash_frozenset64,
    get_digest_hash_frozenset128,
    get_digest_mmh3_128_add,
    get_digest_mmh3_128_add_cy,
]
sizes = [int(round(pow(2, k))) for k in np.arange(14, 24, .5)]

#---- In[ ]:
%%time
res = pd.DataFrame(columns=['fun', 'size', 'millis'])
filename = 'hashtest'
for size in sizes:
    write_random_file(filename, size)
    for fun in funs:
        ti = %timeit -o fun(filename)
        res = res.append(dict(fun=fun.__name__, size=size, millis=ti.best*1000),
                         ignore_index=True)
        print(res.tail(1))

#---- In[ ]:
z = res.copy()
z['y'] = 1E3 * z.millis  / z['size']
z['x'] = z['size'] / 1E6
z.pivot_table(index='x', values='y', columns='fun').plot(figsize=(15,8))
plt.xlabel('M lines')
plt.ylabel(r'$\mu \mathrm{s/line}$')
plt.grid()

#---- In[ ]:
z = res.copy()
z['y'] = 1E3 * z.millis  / z['size']
z['x'] = z['size'] / 1E6
cols = ['get_digest_hash_frozenset128', 'get_digest_mmh3_128_add_cy']
z.pivot_table(index='x', values='y', columns='fun')[cols].plot(figsize=(15,8))
plt.xlabel('M lines')
plt.ylabel(r'$\mu \mathrm{s/line}$')
plt.grid()

#---- In[ ]:
print(res[res['size'] == 46341].sort_values(by='millis'))

#---- In[ ]:
print(res[res['size'] == 11863283].sort_values(by='millis'))
	#---- In[ ]:
	%load_ext autoreload
	%load_ext Cython
	%autoreload 2
	%matplotlib inline

	import matplotlib.pyplot as plt
	import random
	import string

	#---- In[ ]:
	#: # Make a random file for testing

	#---- In[ ]:
	def write_random_file(filename, n_lines):
	with open(filename, 'w') as f:
	for i in range(n_lines):
	name = ''.join(random.sample(string.ascii_lowercase, 8))
	x = random.gauss(0, 1)
	print('{},{:.8f}'.format(name, x), file=f)

	#---- In[ ]:
	#: # Simple digests (sensitive to line order)

	#---- In[ ]:
	import hashlib
	def get_digest_sha1(filename):
	hasher = hashlib.sha1()
	blocksize = 65536
	with open(filename, 'rb') as f:
	while True:
	buf = f.read(blocksize)
	hasher.update(buf)
	if len(buf) < blocksize:
	break
	return hasher.hexdigest()

	#---- In[ ]:
	import hashlib
	def get_digest_sha1_by_lines(filename):
	hasher = hashlib.sha1()
	blocksize = 65536
	with open(filename, 'rb') as f:
	for line in f:
	hasher.update(line)
	return hasher.hexdigest()

	#---- In[ ]:
	import xxhash
	def get_digest_xxh64_order_sensitive(filename):
	h0 = xxhash.xxh64(seed=0)
	h1 = xxhash.xxh64(seed=1)
	blocksize = 65536
	with open(filename, 'rb') as f:
	while True:
	buf = f.read(blocksize)
	h0.update(buf)
	h1.update(buf)
	if len(buf) < blocksize:
	break
	return '{:032x}'.format((h0.intdigest() << 64) + h1.intdigest())

	#---- In[ ]:
	#: # Line order invariant digests

	#---- In[ ]:
	import hashlib
	def get_digest_sha1_sort(filename):
	hasher = hashlib.sha1()
	with open(filename, 'r') as f:
	for line in sorted(f):
	hasher.update(line.encode())
	return hasher.hexdigest()

	#---- In[ ]:
	import hashlib
	def get_digest_sha1_sort_binary(filename):
	hasher = hashlib.sha1()
	with open(filename, 'rb') as f:
	for line in sorted(f):
	hasher.update(line)
	return hasher.hexdigest()

	#---- In[ ]:
	def get_digest_hash_frozenset64(filename):
	with open(filename, 'rb') as f:
	frz = frozenset(f.readlines())
	return '{:032x}'.format(hash(frz))

	#---- In[ ]:
	def get_digest_hash_frozenset128(filename):
	with open(filename, 'rb') as f:
	frz = frozenset(f.readlines())
	return '{:032x}'.format(((hash(frz) << 64) + hash(frz.union('not a line'))) & 0xffffffffffffffffffffffffffffffff)

	#---- In[ ]:
	import mmh3
	def get_digest_mmh3_128_add(filename):
	a = 0
	with open(filename, 'rb') as f:
	for line in f:
	a += mmh3.hash128(line)
	return '{:032x}'.format(a & 0xffffffffffffffffffffffffffffffff)

	#---- In[ ]:
	%%cython
	import mmh3
	def get_digest_mmh3_128_add_cy(filename):
	a = 0
	with open(filename, 'rb') as f:
	for line in f:
	a += mmh3.hash128(line)
	return '{:032x}'.format(a & 0xffffffffffffffffffffffffffffffff)

	#---- In[ ]:
	#: # Tests

	#---- In[ ]:
	import pandas as pd
	import numpy as np

	#---- In[ ]:
	funs = [
	get_digest_sha1,
	get_digest_sha1_by_lines,
	get_digest_xxh64_order_sensitive,
	get_digest_sha1_sort,
	get_digest_sha1_sort_binary,
	get_digest_hash_frozenset64,
	get_digest_hash_frozenset128,
	get_digest_mmh3_128_add,
	get_digest_mmh3_128_add_cy,
	]
	sizes = [int(round(pow(2, k))) for k in np.arange(14, 24, .5)]

	#---- In[ ]:
	%%time
	res = pd.DataFrame(columns=['fun', 'size', 'millis'])
	filename = 'hashtest'
	for size in sizes:
	write_random_file(filename, size)
	for fun in funs:
	ti = %timeit -o fun(filename)
	res = res.append(dict(fun=fun.__name__, size=size, millis=ti.best*1000),
	ignore_index=True)
	print(res.tail(1))

	#---- In[ ]:
	z = res.copy()
	z['y'] = 1E3 * z.millis / z['size']
	z['x'] = z['size'] / 1E6
	z.pivot_table(index='x', values='y', columns='fun').plot(figsize=(15,8))
	plt.xlabel('M lines')
	plt.ylabel(r'$\mu \mathrm{s/line}$')
	plt.grid()

	#---- In[ ]:
	z = res.copy()
	z['y'] = 1E3 * z.millis / z['size']
	z['x'] = z['size'] / 1E6
	cols = ['get_digest_hash_frozenset128', 'get_digest_mmh3_128_add_cy']
	z.pivot_table(index='x', values='y', columns='fun')[cols].plot(figsize=(15,8))
	plt.xlabel('M lines')
	plt.ylabel(r'$\mu \mathrm{s/line}$')
	plt.grid()

	#---- In[ ]:
	print(res[res['size'] == 46341].sort_values(by='millis'))

	#---- In[ ]:
	print(res[res['size'] == 11863283].sort_values(by='millis'))