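A quick benchmark of 14 hash functions (MD5, SHA-1, SHA-256, SHA3-256, BLAKE2b, BLAKE2s, BLAKE3, KangarooTwelve, Skein-256, MurmurHash3 x64 128, MetroHash64, MetroHash128, xxHash32, xxHash64), timed over the words of three text corpora and over random byte strings ranging from a few bytes up to 1 GB. Results are written to word_bench.csv, rand_bench.csv and rand_small_bench.csv. The imports below most likely map to the PyPI packages pycryptodome (for KangarooTwelve), pyskein, blake3, xxhash, mmh3, metrohash, tqdm and numpy; random.randbytes requires Python 3.9 or newer.
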
from Crypto.Hash import KangarooTwelve as K12
import hashlib
from time import time
from blake3 import blake3
import xxhash
import mmh3
import metrohash
from skein import skein256
from tqdm import tqdm
from random import randbytes
import csv
import numpy as np

# Thin wrappers so every hash function exposes the same bytes-in, digest-out interface.
def murmurhash3x64_128_hash(data):
    hasher = mmh3.mmh3_x64_128()
    hasher.update(data)
    return hasher.digest()

def skein256_hash(data):
    return skein256(data).digest()

def metrohash128_hash(data):
    return metrohash.hash128(data)

def metrohash64_hash(data):
    return metrohash.hash64(data)

def xxh32_hash(data):
    return xxhash.xxh32(data).digest()

def xxh64_hash(data):
    return xxhash.xxh64(data).digest()

def md5_hash(data):
    return hashlib.md5(data).digest()

def k12_hash(data):
    return K12.new(data=data).read(32)

def sha3_hash(data):
    return hashlib.sha3_256(data).digest()

def sha256_hash(data):
    return hashlib.sha256(data).digest()

def sha1_hash(data):
    return hashlib.sha1(data).digest()

def blake2b_hash(data):
    return hashlib.blake2b(data).digest()

def blake2s_hash(data):
    return hashlib.blake2s(data).digest()

def blake3_hash(data):
    return blake3(data).digest()

def word_bench(hashfn, reps):
    times = []
    corpi = []
    corpi_path = [
        'sci.crypt.txt',
        'odessey.txt',
        'wikinews.txt'
    ]

    # Count lines up-front so the progress bar has a total.
    n_lines = 0
    for corpus_path in corpi_path:
        for line in open(corpus_path):
            n_lines += 1

    pbar = tqdm(total=reps*n_lines)

    for _ in range(reps):
        for corpus_path in corpi_path:
            # Read the whole corpus into memory first so file I/O is not timed.
            lines = []
            for line in open(corpus_path):
                lines.append(line)

            start = time()
            for line in lines:
                for word in line.strip().split(' '):
                    hashfn(word.encode('utf8'))
                pbar.update(1)
            end = time() - start

            times.append(end)
            corpi.append(corpus_path)

    pbar.close()
    return times, corpi

WORD_NBYTES = 5
KB_NBYTES = 1000
MB_NBYTES = KB_NBYTES**2
GB_NBYTES = KB_NBYTES**3

def rand_bench(hashfn, test_n_bytes=[WORD_NBYTES, KB_NBYTES, MB_NBYTES, GB_NBYTES], reps=100):
    n_bytes = []
    times = []

    pbar = tqdm(total=len(test_n_bytes)*reps)

    for n in test_n_bytes:
        total_time = 0
        for _ in range(reps):
            if n < MB_NBYTES:
                x = randbytes(n)
            else:
                # randbytes hits overflow problems at very large values of n,
                # so build the buffer 1 MB at a time.
                x = bytes()
                for _ in range(n//MB_NBYTES):
                    x += randbytes(MB_NBYTES)
                x += randbytes(n % MB_NBYTES)

            start = time()
            hashfn(x)
            end = time() - start

            total_time += end
            pbar.update(1)

        n_bytes.append(n)
        # Record the total time over `reps` repetitions for this size.
        times.append(total_time)

    pbar.close()
    return n_bytes, times

hashfn_dict = {
    "murmurhash3x64_128": murmurhash3x64_128_hash,
    "skein256": skein256_hash,
    "metrohash128": metrohash128_hash,
    "metrohash64": metrohash64_hash,
    "xxh32": xxh32_hash,
    "xxh64": xxh64_hash,
    "md5": md5_hash,
    "k12": k12_hash,
    "sha3": sha3_hash,
    "sha256": sha256_hash,
    "sha1": sha1_hash,
    "blake2b": blake2b_hash,
    "blake2s": blake2s_hash,
    "blake3": blake3_hash
}

# Benchmark each hash function over the word corpora.
word_table = []
for hashfn_n, hashfn_name in enumerate(hashfn_dict):
    print(f"[{hashfn_n + 1}/{len(hashfn_dict)}] - {hashfn_name}")
    hashfn = hashfn_dict[hashfn_name]
    times, corpi = word_bench(hashfn, 100)
    for t, corpus in zip(times, corpi):
        word_table.append({
            'hashfn': hashfn_name,
            'time': t,
            'corpus': corpus
        })

# Random byte strings from 1 B up to 1 KB.
random_small_table = []
for hashfn_n, hashfn_name in enumerate(hashfn_dict):
    hashfn = hashfn_dict[hashfn_name]
    reps = 100
    print(f"[{hashfn_n + 1}/{len(hashfn_dict)}] - {hashfn_name}")
    n_bytes, times = rand_bench(hashfn, test_n_bytes=np.linspace(1, KB_NBYTES, 1000).astype(np.int64), reps=reps)
    for n_byte, t in zip(n_bytes, times):
        random_small_table.append({
            'hashfn': hashfn_name,
            'n_bytes': n_byte,
            'time': t,
            'reps': reps
        })

# Random byte strings from 1 B up to 1 GB.
random_table = []
for hashfn_n, hashfn_name in enumerate(hashfn_dict):
    hashfn = hashfn_dict[hashfn_name]
    reps = 100
    print(f"[{hashfn_n + 1}/{len(hashfn_dict)}] - {hashfn_name}")
    n_bytes, times = rand_bench(hashfn, test_n_bytes=np.linspace(1, GB_NBYTES, 1000).astype(np.int64), reps=reps)
    for n_byte, t in zip(n_bytes, times):
        random_table.append({
            'hashfn': hashfn_name,
            'n_bytes': n_byte,
            'time': t,
            'reps': reps
        })

for path, table in zip(['word_bench.csv', 'rand_bench.csv', 'rand_small_bench.csv'], [word_table, random_table, random_small_table]):
    with open(path, 'w', newline='') as csvfile:
        fieldnames = list(table[0].keys())
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in table:
            writer.writerow(row)
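
The resulting CSVs are straightforward to summarise afterwards. Below is a minimal sketch (assuming pandas, which the script itself does not use) that reads the files written above and ranks the hash functions by mean time and approximate throughput:

import pandas as pd

word = pd.read_csv('word_bench.csv')
rand = pd.read_csv('rand_bench.csv')

# Mean time per corpus pass, fastest hash function first.
print(word.groupby('hashfn')['time'].mean().sort_values())

# Approximate throughput in MB/s: the 'time' column holds the total over
# `reps` repetitions, so divide it out to get a per-hash time.
rand['mb_per_s'] = rand['n_bytes'] / (rand['time'] / rand['reps']) / 1e6
print(rand.groupby('hashfn')['mb_per_s'].mean().sort_values(ascending=False))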