Skip to content

Instantly share code, notes, and snippets.

@pdemarti
Created February 25, 2017 05:52
Show Gist options
  • Save pdemarti/dc394c53ae83816a64d45950cd1b086a to your computer and use it in GitHub Desktop.
Save pdemarti/dc394c53ae83816a64d45950cd1b086a to your computer and use it in GitHub Desktop.
Experiments for line-order invariant digest of a file
#---- In[ ]:
%load_ext autoreload
%load_ext Cython
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
import random
import string
#---- In[ ]:
#: # Make a random file for testing
#---- In[ ]:
def write_random_file(filename, n_lines):
with open(filename, 'w') as f:
for i in range(n_lines):
name = ''.join(random.sample(string.ascii_lowercase, 8))
x = random.gauss(0, 1)
print('{},{:.8f}'.format(name, x), file=f)
#---- In[ ]:
#: # Simple digests (sensitive to line order)
#---- In[ ]:
import hashlib
def get_digest_sha1(filename):
hasher = hashlib.sha1()
blocksize = 65536
with open(filename, 'rb') as f:
while True:
buf = f.read(blocksize)
hasher.update(buf)
if len(buf) < blocksize:
break
return hasher.hexdigest()
#---- In[ ]:
import hashlib
def get_digest_sha1_by_lines(filename):
hasher = hashlib.sha1()
blocksize = 65536
with open(filename, 'rb') as f:
for line in f:
hasher.update(line)
return hasher.hexdigest()
#---- In[ ]:
import xxhash
def get_digest_xxh64_order_sensitive(filename):
h0 = xxhash.xxh64(seed=0)
h1 = xxhash.xxh64(seed=1)
blocksize = 65536
with open(filename, 'rb') as f:
while True:
buf = f.read(blocksize)
h0.update(buf)
h1.update(buf)
if len(buf) < blocksize:
break
return '{:032x}'.format((h0.intdigest() << 64) + h1.intdigest())
#---- In[ ]:
#: # Line order invariant digests
#---- In[ ]:
import hashlib
def get_digest_sha1_sort(filename):
hasher = hashlib.sha1()
with open(filename, 'r') as f:
for line in sorted(f):
hasher.update(line.encode())
return hasher.hexdigest()
#---- In[ ]:
import hashlib
def get_digest_sha1_sort_binary(filename):
hasher = hashlib.sha1()
with open(filename, 'rb') as f:
for line in sorted(f):
hasher.update(line)
return hasher.hexdigest()
#---- In[ ]:
def get_digest_hash_frozenset64(filename):
with open(filename, 'rb') as f:
frz = frozenset(f.readlines())
return '{:032x}'.format(hash(frz))
#---- In[ ]:
def get_digest_hash_frozenset128(filename):
with open(filename, 'rb') as f:
frz = frozenset(f.readlines())
return '{:032x}'.format(((hash(frz) << 64) + hash(frz.union('not a line'))) & 0xffffffffffffffffffffffffffffffff)
#---- In[ ]:
import mmh3
def get_digest_mmh3_128_add(filename):
a = 0
with open(filename, 'rb') as f:
for line in f:
a += mmh3.hash128(line)
return '{:032x}'.format(a & 0xffffffffffffffffffffffffffffffff)
#---- In[ ]:
%%cython
import mmh3
def get_digest_mmh3_128_add_cy(filename):
a = 0
with open(filename, 'rb') as f:
for line in f:
a += mmh3.hash128(line)
return '{:032x}'.format(a & 0xffffffffffffffffffffffffffffffff)
#---- In[ ]:
#: # Tests
#---- In[ ]:
import pandas as pd
import numpy as np
#---- In[ ]:
funs = [
get_digest_sha1,
get_digest_sha1_by_lines,
get_digest_xxh64_order_sensitive,
get_digest_sha1_sort,
get_digest_sha1_sort_binary,
get_digest_hash_frozenset64,
get_digest_hash_frozenset128,
get_digest_mmh3_128_add,
get_digest_mmh3_128_add_cy,
]
sizes = [int(round(pow(2, k))) for k in np.arange(14, 24, .5)]
#---- In[ ]:
%%time
res = pd.DataFrame(columns=['fun', 'size', 'millis'])
filename = 'hashtest'
for size in sizes:
write_random_file(filename, size)
for fun in funs:
ti = %timeit -o fun(filename)
res = res.append(dict(fun=fun.__name__, size=size, millis=ti.best*1000),
ignore_index=True)
print(res.tail(1))
#---- In[ ]:
z = res.copy()
z['y'] = 1E3 * z.millis / z['size']
z['x'] = z['size'] / 1E6
z.pivot_table(index='x', values='y', columns='fun').plot(figsize=(15,8))
plt.xlabel('M lines')
plt.ylabel(r'$\mu \mathrm{s/line}$')
plt.grid()
#---- In[ ]:
z = res.copy()
z['y'] = 1E3 * z.millis / z['size']
z['x'] = z['size'] / 1E6
cols = ['get_digest_hash_frozenset128', 'get_digest_mmh3_128_add_cy']
z.pivot_table(index='x', values='y', columns='fun')[cols].plot(figsize=(15,8))
plt.xlabel('M lines')
plt.ylabel(r'$\mu \mathrm{s/line}$')
plt.grid()
#---- In[ ]:
print(res[res['size'] == 46341].sort_values(by='millis'))
#---- In[ ]:
print(res[res['size'] == 11863283].sort_values(by='millis'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment