# jszym/bench.py -- secret GitHub gist 7f9fff9f3c7be3f5fe381761ab1c8e37 by @jszym
# Created January 15, 2024 00:35
from Crypto.Hash import KangarooTwelve as K12
import hashlib
from time import time
from blake3 import blake3
import xxhash
import mmh3
import metrohash
from skein import skein256
from tqdm import tqdm
from random import randbytes
import csv
import numpy as np
def murmurhash3x64_128_hash(data):
    """Hash ``data`` with mmh3's x64 128-bit MurmurHash3 hasher and return its digest."""
    state = mmh3.mmh3_x64_128()
    state.update(data)
    digest = state.digest()
    return digest
def skein256_hash(data):
    """Hash ``data`` with ``skein256`` and return the resulting digest."""
    hasher = skein256(data)
    return hasher.digest()
def metrohash128_hash(data):
    """Return the result of ``metrohash.hash128`` applied to ``data``."""
    result = metrohash.hash128(data)
    return result
def metrohash64_hash(data):
    """Return the result of ``metrohash.hash64`` applied to ``data``."""
    result = metrohash.hash64(data)
    return result
def xxh32_hash(data):
    """Hash ``data`` with xxHash32 and return the digest."""
    hasher = xxhash.xxh32(data)
    return hasher.digest()
def xxh64_hash(data):
    """Hash ``data`` with xxHash64 and return the digest."""
    hasher = xxhash.xxh64(data)
    return hasher.digest()
def md5_hash(data):
    """Return the 16-byte MD5 digest of ``data`` (a bytes-like object)."""
    hasher = hashlib.md5()
    hasher.update(data)
    return hasher.digest()
def k12_hash(data):
    """Hash ``data`` with KangarooTwelve and return the first 32 bytes of output."""
    xof = K12.new(data)
    return xof.read(32)
def sha3_hash(data):
    """Return the 32-byte SHA3-256 digest of ``data`` (a bytes-like object)."""
    hasher = hashlib.sha3_256()
    hasher.update(data)
    return hasher.digest()
def sha256_hash(data):
    """Return the 32-byte SHA-256 digest of ``data`` (a bytes-like object)."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.digest()
def sha1_hash(data):
    """Return the 20-byte SHA-1 digest of ``data`` (a bytes-like object)."""
    hasher = hashlib.sha1()
    hasher.update(data)
    return hasher.digest()
def blake2b_hash(data):
    """Return the BLAKE2b digest of ``data`` at hashlib's default size (64 bytes)."""
    hasher = hashlib.blake2b()
    hasher.update(data)
    return hasher.digest()
def blake2s_hash(data):
    """Return the BLAKE2s digest of ``data`` at hashlib's default size (32 bytes)."""
    hasher = hashlib.blake2s()
    hasher.update(data)
    return hasher.digest()
def blake3_hash(data):
    """Hash ``data`` with BLAKE3 and return the digest."""
    hasher = blake3(data)
    return hasher.digest()
def word_bench(hashfn, reps, corpus_paths=('sci.crypt.txt', 'odessey.txt', 'wikinews.txt')):
    """Time ``hashfn`` over every space-separated word of each text corpus.

    Each corpus is read once up front. For every repetition and every corpus,
    each line is stripped and split on single spaces, each word is encoded as
    UTF-8 and passed to ``hashfn``, and the elapsed time for the whole corpus
    is recorded.

    Args:
        hashfn: Callable taking one ``bytes`` argument.
        reps: Number of times each corpus is hashed.
        corpus_paths: Paths of the text files to hash; defaults to the three
            corpora the benchmark was written for.

    Returns:
        A ``(times, corpi)`` pair of parallel lists: ``times[i]`` is the
        elapsed seconds for one pass over the corpus at path ``corpi[i]``.
        Entries are ordered by repetition, then by corpus.
    """
    # perf_counter is a monotonic, high-resolution clock intended for timing
    # short intervals; wall-clock time() can be coarse or jump.
    from time import perf_counter

    # Read each corpus once (closing the file promptly) instead of leaking a
    # file handle and re-reading from disk on every repetition.
    corpora = []
    for corpus_path in corpus_paths:
        with open(corpus_path) as corpus_file:
            corpora.append((corpus_path, corpus_file.readlines()))
    n_lines = sum(len(lines) for _, lines in corpora)

    times = []
    corpi = []
    pbar = tqdm(total=reps * n_lines)
    try:
        for _ in range(reps):
            for corpus_path, lines in corpora:
                start = perf_counter()
                for line in lines:
                    for word in line.strip().split(' '):
                        hashfn(word.encode('utf8'))
                elapsed = perf_counter() - start
                # Advance the bar outside the timed region so progress-bar
                # overhead is not attributed to the hash function.
                pbar.update(len(lines))
                times.append(elapsed)
                corpi.append(corpus_path)
    finally:
        pbar.close()
    return times, corpi
# Payload sizes, in bytes, used by the random-data benchmark.
# Kilo/mega/giga are decimal (powers of 1000), not binary (powers of 1024).
WORD_NBYTES = 5  # presumably a typical word length -- confirm with the author
KB_NBYTES = 1000
MB_NBYTES = KB_NBYTES * KB_NBYTES
GB_NBYTES = MB_NBYTES * KB_NBYTES
def _random_payload(size):
    """Return exactly ``size`` random bytes, built from chunks of at most MB_NBYTES."""
    if size < MB_NBYTES:
        return randbytes(size)
    # randbytes hits overflow problems at very large sizes, so large payloads
    # are assembled from 1 MB chunks. The remainder chunk keeps the payload
    # length equal to ``size`` rather than rounding down to a whole MB.
    n_chunks, remainder = divmod(size, MB_NBYTES)
    chunks = [randbytes(MB_NBYTES) for _ in range(n_chunks)]
    if remainder:
        chunks.append(randbytes(remainder))
    # A single join avoids the quadratic copying of repeated ``bytes +=``.
    return b"".join(chunks)


def rand_bench(hashfn, test_n_bytes=(WORD_NBYTES, KB_NBYTES, MB_NBYTES, GB_NBYTES), reps=100):
    """Time ``hashfn`` on freshly generated random payloads of several sizes.

    For each size in ``test_n_bytes``, ``reps`` random payloads of that size
    are generated and hashed; only the ``hashfn`` call itself is timed.

    Args:
        hashfn: Callable taking one ``bytes`` argument.
        test_n_bytes: Sized iterable of payload sizes in bytes. Integer-like
            values such as NumPy integers are accepted.
        reps: Number of payloads hashed per size.

    Returns:
        A ``(n_bytes, times)`` pair of parallel lists: ``n_bytes[i]`` is a
        size taken from ``test_n_bytes`` and ``times[i]`` is the total seconds
        spent in ``hashfn`` across all ``reps`` payloads of that size.
    """
    # perf_counter is a monotonic, high-resolution clock intended for timing
    # short intervals; wall-clock time() can be coarse or jump.
    from time import perf_counter

    n_bytes = []
    times = []
    pbar = tqdm(total=len(test_n_bytes) * reps)
    try:
        for n in test_n_bytes:
            size = int(n)  # normalise NumPy integers before handing to randbytes
            total_time = 0
            for _ in range(reps):
                payload = _random_payload(size)
                start = perf_counter()
                hashfn(payload)
                total_time += perf_counter() - start
                pbar.update(1)
            n_bytes.append(n)
            # Record the accumulated time over all repetitions, not just the
            # final one; this also keeps reps=0 well-defined.
            times.append(total_time)
    finally:
        pbar.close()
    return n_bytes, times
# Registry of benchmarked hash functions: label used in the output tables
# mapped to the wrapper that computes the hash. Insertion order is the order
# in which the benchmarks run.
hashfn_dict = dict(
    murmurhash3x64_128=murmurhash3x64_128_hash,
    skein256=skein256_hash,
    metrohash128=metrohash128_hash,
    metrohash64=metrohash64_hash,
    xxh32=xxh32_hash,
    xxh64=xxh64_hash,
    md5=md5_hash,
    k12=k12_hash,
    sha3=sha3_hash,
    sha256=sha256_hash,
    sha1=sha1_hash,
    blake2b=blake2b_hash,
    blake2s=blake2s_hash,
    blake3=blake3_hash,
)
# Run the word-corpus benchmark (100 repetitions) for every registered hash
# function and collect one row per (hash function, corpus pass).
word_table = []
# Count from 1 so the progress label runs [1/N] .. [N/N] rather than ending
# at [N-1/N].
for hashfn_n, (hashfn_name, hashfn) in enumerate(hashfn_dict.items(), start=1):
    print(f"[{hashfn_n}/{len(hashfn_dict)}] - {hashfn_name}")
    times, corpi = word_bench(hashfn, 100)
    for t, corpus in zip(times, corpi):
        word_table.append({
            'hashfn': hashfn_name,
            'time': t,
            'corpus': corpus,
        })
# Run the random-payload benchmark on small inputs: 1000 sizes evenly spaced
# from 1 byte to KB_NBYTES bytes, for every registered hash function.
random_small_table = []
# Count from 1 so the progress label runs [1/N] .. [N/N] rather than ending
# at [N-1/N].
for hashfn_n, (hashfn_name, hashfn) in enumerate(hashfn_dict.items(), start=1):
    reps = 100
    print(f"[{hashfn_n}/{len(hashfn_dict)}] - {hashfn_name}")
    small_sizes = np.linspace(1, KB_NBYTES, 1000).astype(np.int64)
    n_bytes, times = rand_bench(hashfn, test_n_bytes=small_sizes, reps=reps)
    for n_byte, t in zip(n_bytes, times):
        random_small_table.append({
            'hashfn': hashfn_name,
            'n_bytes': n_byte,
            'time': t,
            'reps': reps,
        })
# Run the random-payload benchmark on large inputs: 1000 sizes evenly spaced
# from 1 byte to GB_NBYTES bytes, for every registered hash function.
random_table = []
# Count from 1 so the progress label runs [1/N] .. [N/N] rather than ending
# at [N-1/N].
for hashfn_n, (hashfn_name, hashfn) in enumerate(hashfn_dict.items(), start=1):
    reps = 100
    print(f"[{hashfn_n}/{len(hashfn_dict)}] - {hashfn_name}")
    large_sizes = np.linspace(1, GB_NBYTES, 1000).astype(np.int64)
    n_bytes, times = rand_bench(hashfn, test_n_bytes=large_sizes, reps=reps)
    for n_byte, t in zip(n_bytes, times):
        # Rows belong in random_table. Appending them to random_small_table
        # mixed the two experiments and left random_table empty, which
        # crashed the CSV export on table[0].
        random_table.append({
            'hashfn': hashfn_name,
            'n_bytes': n_byte,
            'time': t,
            'reps': reps,
        })
# Write each result table to its own CSV file. Column names are taken from
# the keys of the table's first row.
for path, table in zip(
    ['word_bench.csv', 'rand_bench.csv', 'rand_small_bench.csv'],
    [word_table, random_table, random_small_table],
):
    if not table:
        # An empty table has no first row to derive the header from; skip it
        # instead of raising IndexError and losing the remaining tables.
        print(f"No rows for {path}; skipping")
        continue
    with open(path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(table[0]))
        writer.writeheader()
        writer.writerows(table)
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment