Created
February 25, 2017 05:52
-
-
Save pdemarti/dc394c53ae83816a64d45950cd1b086a to your computer and use it in GitHub Desktop.
Experiments for a line-order-invariant digest of a file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#---- In[ ]: | |
%load_ext autoreload | |
%load_ext Cython | |
%autoreload 2 | |
%matplotlib inline | |
import matplotlib.pyplot as plt | |
import random | |
import string | |
#---- In[ ]: | |
#: # Make a random file for testing | |
#---- In[ ]: | |
def write_random_file(filename, n_lines):
    """Write *n_lines* lines of 'name,value' test data to *filename*.

    Each line is 8 distinct random lowercase letters, a comma, and a
    standard-normal sample rendered with 8 decimal places.
    """
    with open(filename, 'w') as out:
        for _ in range(n_lines):
            label = ''.join(random.sample(string.ascii_lowercase, 8))
            value = random.gauss(0, 1)
            out.write('{},{:.8f}\n'.format(label, value))
#---- In[ ]: | |
#: # Simple digests (sensitive to line order) | |
#---- In[ ]: | |
import hashlib | |
def get_digest_sha1(filename):
    """Return the SHA-1 hex digest of the raw bytes of *filename*.

    Order sensitive: any permutation of the lines changes the digest.
    Reads in 64 KiB chunks so memory use stays flat for large files.
    """
    sha = hashlib.sha1()
    with open(filename, 'rb') as stream:
        # read() returns b'' at EOF, which stops the sentinel iterator
        for chunk in iter(lambda: stream.read(65536), b''):
            sha.update(chunk)
    return sha.hexdigest()
#---- In[ ]: | |
import hashlib | |
def get_digest_sha1_by_lines(filename):
    """Return the SHA-1 hex digest of *filename*, fed to the hasher line by line.

    Order sensitive.  Produces the same digest as hashing the whole file at
    once, since the lines concatenate back to the full byte stream; this
    variant exists to benchmark per-line iteration overhead.
    """
    hasher = hashlib.sha1()
    # NOTE: the original carried an unused `blocksize` local (dead code,
    # copied from the chunked reader) — removed.
    with open(filename, 'rb') as f:
        for line in f:
            hasher.update(line)
    return hasher.hexdigest()
#---- In[ ]: | |
import xxhash | |
def get_digest_xxh64_order_sensitive(filename):
    """Return a 128-bit hex digest built from two differently-seeded xxh64 passes.

    Order sensitive.  Both hashers consume the same 64 KiB chunks; their
    64-bit digests are concatenated (seed-0 in the high half) into one
    32-character hex string.
    """
    high = xxhash.xxh64(seed=0)
    low = xxhash.xxh64(seed=1)
    with open(filename, 'rb') as stream:
        # read() returns b'' at EOF, which stops the sentinel iterator
        for chunk in iter(lambda: stream.read(65536), b''):
            high.update(chunk)
            low.update(chunk)
    return '{:032x}'.format((high.intdigest() << 64) + low.intdigest())
#---- In[ ]: | |
#: # Line order invariant digests | |
#---- In[ ]: | |
import hashlib | |
def get_digest_sha1_sort(filename):
    """Return the SHA-1 hex digest of the file's text lines hashed in sorted order.

    Line-order invariant: sorting normalizes any permutation of the lines to
    the same byte sequence before hashing.  Duplicate lines are kept.
    """
    with open(filename, 'r') as f:
        ordered = sorted(f)
    return hashlib.sha1(''.join(ordered).encode()).hexdigest()
#---- In[ ]: | |
import hashlib | |
def get_digest_sha1_sort_binary(filename):
    """Return the SHA-1 hex digest of the file's raw lines hashed in sorted order.

    Line-order invariant; like get_digest_sha1_sort but sorts/hashes bytes
    directly, skipping the text decode/encode round trip.
    """
    with open(filename, 'rb') as f:
        ordered = sorted(f)
    return hashlib.sha1(b''.join(ordered)).hexdigest()
#---- In[ ]: | |
def get_digest_hash_frozenset64(filename):
    """Return a 64-bit line-order-invariant digest of *filename* as 16 hex chars.

    The lines are collected into a frozenset, so the digest ignores both
    line order and duplicate lines (set semantics).

    NOTE: hash() of bytes is salted per interpreter process
    (PYTHONHASHSEED), so the digest is only stable within one run unless
    hash randomization is disabled.
    """
    with open(filename, 'rb') as f:
        frz = frozenset(f.readlines())
    # hash() may be negative; mask to unsigned 64-bit so the result is a
    # plain 16-digit hex string.  (The original '{:032x}' on a negative
    # hash produced a '-'-prefixed, wrongly sized "digest".)
    return '{:016x}'.format(hash(frz) & 0xffffffffffffffff)
#---- In[ ]: | |
def get_digest_hash_frozenset128(filename):
    """Return a 128-bit line-order-invariant digest of *filename* as 32 hex chars.

    The high 64 bits come from hash() of the frozenset of lines; the low 64
    bits from hash() of the same set with one sentinel line added, giving a
    second, independent value.  Ignores line order and duplicates.

    NOTE: hash() of bytes is salted per interpreter process
    (PYTHONHASHSEED), so the digest is only stable within one run unless
    hash randomization is disabled.
    """
    with open(filename, 'rb') as f:
        frz = frozenset(f.readlines())
    hi = hash(frz)
    # The sentinel must be wrapped in a set: the original passed the str
    # 'not a line' straight to union(), which iterated it and added its
    # ten *characters* as individual str elements to a set of bytes.
    lo = hash(frz | {b'not a line'})
    # Mask keeps the combined value an unsigned 128-bit quantity even when
    # either hash() is negative.
    return '{:032x}'.format(((hi << 64) + lo) & 0xffffffffffffffffffffffffffffffff)
#---- In[ ]: | |
import mmh3 | |
def get_digest_mmh3_128_add(filename):
    """Return a 128-bit line-order-invariant digest as 32 hex chars.

    Sums the 128-bit murmur3 hash of every line modulo 2**128; addition is
    commutative, so any permutation of the lines gives the same digest.
    """
    with open(filename, 'rb') as f:
        total = sum(mmh3.hash128(line) for line in f)
    return '{:032x}'.format(total & 0xffffffffffffffffffffffffffffffff)
#---- In[ ]: | |
%%cython | |
import mmh3 | |
def get_digest_mmh3_128_add_cy(filename): | |
a = 0 | |
with open(filename, 'rb') as f: | |
for line in f: | |
a += mmh3.hash128(line) | |
return '{:032x}'.format(a & 0xffffffffffffffffffffffffffffffff) | |
#---- In[ ]: | |
#: # Tests | |
#---- In[ ]: | |
import pandas as pd | |
import numpy as np | |
#---- In[ ]: | |
# Digest implementations under test, in presentation order.
funs = [
    get_digest_sha1,
    get_digest_sha1_by_lines,
    get_digest_xxh64_order_sensitive,
    get_digest_sha1_sort,
    get_digest_sha1_sort_binary,
    get_digest_hash_frozenset64,
    get_digest_hash_frozenset128,
    get_digest_mmh3_128_add,
    get_digest_mmh3_128_add_cy,
]
# Line counts from 2**14 up to just under 2**24, at half-power-of-two steps.
sizes = [int(round(2.0 ** k)) for k in np.arange(14, 24, 0.5)]
#---- In[ ]: | |
%%time | |
res = pd.DataFrame(columns=['fun', 'size', 'millis']) | |
filename = 'hashtest' | |
for size in sizes: | |
write_random_file(filename, size) | |
for fun in funs: | |
ti = %timeit -o fun(filename) | |
res = res.append(dict(fun=fun.__name__, size=size, millis=ti.best*1000), | |
ignore_index=True) | |
print(res.tail(1)) | |
#---- In[ ]: | |
# Plot per-line cost (µs/line) against file size (millions of lines)
# for every digest function.
z = res.assign(x=res['size'] / 1E6, y=1E3 * res.millis / res['size'])
z.pivot_table(index='x', values='y', columns='fun').plot(figsize=(15, 8))
plt.xlabel('M lines')
plt.ylabel(r'$\mu \mathrm{s/line}$')
plt.grid()
#---- In[ ]: | |
# Same per-line-cost plot, restricted to the two fastest
# order-invariant digests.
z = res.assign(x=res['size'] / 1E6, y=1E3 * res.millis / res['size'])
fastest = ['get_digest_hash_frozenset128', 'get_digest_mmh3_128_add_cy']
z.pivot_table(index='x', values='y', columns='fun')[fastest].plot(figsize=(15, 8))
plt.xlabel('M lines')
plt.ylabel(r'$\mu \mathrm{s/line}$')
plt.grid()
#---- In[ ]: | |
# Ranking tables (fastest first) at one small and one large file size.
print(res.loc[res['size'] == 46341].sort_values('millis'))
#---- In[ ]:
print(res.loc[res['size'] == 11863283].sort_values('millis'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment