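A quick benchmark of 14 hash functions (MD5, SHA-1, SHA-256, SHA3-256, BLAKE2b, BLAKE2s, BLAKE3, KangarooTwelve, Skein-256, MurmurHash3 x64 128, MetroHash64, MetroHash128, xxHash32, xxHash64), timed over the words of three text corpora and over random byte strings ranging from a few bytes up to 1 GB. Results are written to word_bench.csv, rand_bench.csv and rand_small_bench.csv. The imports below most likely map to the PyPI packages pycryptodome (for KangarooTwelve), pyskein, blake3, xxhash, mmh3, metrohash, tqdm and numpy; random.randbytes requires Python 3.9 or newer.
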
from Crypto.Hash import KangarooTwelve as K12
import hashlib
from time import time
from blake3 import blake3
import xxhash
import mmh3
import metrohash
from skein import skein256
from tqdm import tqdm
from random import randbytes
import csv
import numpy as np

# Thin wrappers so every hash function exposes the same bytes-in, digest-out interface.
def murmurhash3x64_128_hash(data):
    hasher = mmh3.mmh3_x64_128()
    hasher.update(data)
    return hasher.digest()

def skein256_hash(data):
    return skein256(data).digest()

def metrohash128_hash(data):
    return metrohash.hash128(data)

def metrohash64_hash(data):
    return metrohash.hash64(data)

def xxh32_hash(data):
    return xxhash.xxh32(data).digest()

def xxh64_hash(data):
    return xxhash.xxh64(data).digest()

def md5_hash(data):
    return hashlib.md5(data).digest()

def k12_hash(data):
    return K12.new(data=data).read(32)

def sha3_hash(data):
    return hashlib.sha3_256(data).digest()

def sha256_hash(data):
    return hashlib.sha256(data).digest()

def sha1_hash(data):
    return hashlib.sha1(data).digest()

def blake2b_hash(data):
    return hashlib.blake2b(data).digest()

def blake2s_hash(data):
    return hashlib.blake2s(data).digest()

def blake3_hash(data):
    return blake3(data).digest()

def word_bench(hashfn, reps):
    times = []
    corpi = []
    corpi_path = [
        'sci.crypt.txt',
        'odessey.txt',
        'wikinews.txt'
    ]

    # Count lines up-front so the progress bar has a total.
    n_lines = 0
    for corpus_path in corpi_path:
        for line in open(corpus_path):
            n_lines += 1

    pbar = tqdm(total=reps*n_lines)

    for _ in range(reps):
        for corpus_path in corpi_path:
            # Read the whole corpus into memory first so file I/O is not timed.
            lines = []
            for line in open(corpus_path):
                lines.append(line)

            start = time()
            for line in lines:
                for word in line.strip().split(' '):
                    hashfn(word.encode('utf8'))
                pbar.update(1)
            end = time() - start

            times.append(end)
            corpi.append(corpus_path)

    pbar.close()
    return times, corpi

WORD_NBYTES = 5
KB_NBYTES = 1000
MB_NBYTES = KB_NBYTES**2
GB_NBYTES = KB_NBYTES**3

def rand_bench(hashfn, test_n_bytes=[WORD_NBYTES, KB_NBYTES, MB_NBYTES, GB_NBYTES], reps=100):
    n_bytes = []
    times = []

    pbar = tqdm(total=len(test_n_bytes)*reps)

    for n in test_n_bytes:
        total_time = 0
        for _ in range(reps):
            if n < MB_NBYTES:
                x = randbytes(n)
            else:
                # randbytes hits overflow problems at very large values of n,
                # so build the buffer 1 MB at a time.
                x = bytes()
                for _ in range(n//MB_NBYTES):
                    x += randbytes(MB_NBYTES)
                x += randbytes(n % MB_NBYTES)

            start = time()
            hashfn(x)
            end = time() - start

            total_time += end
            pbar.update(1)

        n_bytes.append(n)
        # Record the total time over `reps` repetitions for this size.
        times.append(total_time)

    pbar.close()
    return n_bytes, times

hashfn_dict = {
    "murmurhash3x64_128": murmurhash3x64_128_hash,
    "skein256": skein256_hash,
    "metrohash128": metrohash128_hash,
    "metrohash64": metrohash64_hash,
    "xxh32": xxh32_hash,
    "xxh64": xxh64_hash,
    "md5": md5_hash,
    "k12": k12_hash,
    "sha3": sha3_hash,
    "sha256": sha256_hash,
    "sha1": sha1_hash,
    "blake2b": blake2b_hash,
    "blake2s": blake2s_hash,
    "blake3": blake3_hash
}

# Benchmark each hash function over the word corpora.
word_table = []
for hashfn_n, hashfn_name in enumerate(hashfn_dict):
    print(f"[{hashfn_n + 1}/{len(hashfn_dict)}] - {hashfn_name}")
    hashfn = hashfn_dict[hashfn_name]
    times, corpi = word_bench(hashfn, 100)
    for t, corpus in zip(times, corpi):
        word_table.append({
            'hashfn': hashfn_name,
            'time': t,
            'corpus': corpus
        })

# Random byte strings from 1 B up to 1 KB.
random_small_table = []
for hashfn_n, hashfn_name in enumerate(hashfn_dict):
    hashfn = hashfn_dict[hashfn_name]
    reps = 100
    print(f"[{hashfn_n + 1}/{len(hashfn_dict)}] - {hashfn_name}")
    n_bytes, times = rand_bench(hashfn, test_n_bytes=np.linspace(1, KB_NBYTES, 1000).astype(np.int64), reps=reps)
    for n_byte, t in zip(n_bytes, times):
        random_small_table.append({
            'hashfn': hashfn_name,
            'n_bytes': n_byte,
            'time': t,
            'reps': reps
        })

# Random byte strings from 1 B up to 1 GB.
random_table = []
for hashfn_n, hashfn_name in enumerate(hashfn_dict):
    hashfn = hashfn_dict[hashfn_name]
    reps = 100
    print(f"[{hashfn_n + 1}/{len(hashfn_dict)}] - {hashfn_name}")
    n_bytes, times = rand_bench(hashfn, test_n_bytes=np.linspace(1, GB_NBYTES, 1000).astype(np.int64), reps=reps)
    for n_byte, t in zip(n_bytes, times):
        random_table.append({
            'hashfn': hashfn_name,
            'n_bytes': n_byte,
            'time': t,
            'reps': reps
        })

for path, table in zip(['word_bench.csv', 'rand_bench.csv', 'rand_small_bench.csv'], [word_table, random_table, random_small_table]):
    with open(path, 'w', newline='') as csvfile:
        fieldnames = list(table[0].keys())
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in table:
            writer.writerow(row)
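
The resulting CSVs are straightforward to summarise afterwards. Below is a minimal sketch (assuming pandas, which the script itself does not use) that reads the files written above and ranks the hash functions by mean time and approximate throughput:

import pandas as pd

word = pd.read_csv('word_bench.csv')
rand = pd.read_csv('rand_bench.csv')

# Mean time per corpus pass, fastest hash function first.
print(word.groupby('hashfn')['time'].mean().sort_values())

# Approximate throughput in MB/s: the 'time' column holds the total over
# `reps` repetitions, so divide it out to get a per-hash time.
rand['mb_per_s'] = rand['n_bytes'] / (rand['time'] / rand['reps']) / 1e6
print(rand.groupby('hashfn')['mb_per_s'].mean().sort_values(ascending=False))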