Skip to content

Instantly share code, notes, and snippets.

@antoncohen
Created October 24, 2016 00:52
Show Gist options
  • Save antoncohen/11fca2b1723c3d201282cc070fbd0bf1 to your computer and use it in GitHub Desktop.
Save antoncohen/11fca2b1723c3d201282cc070fbd0bf1 to your computer and use it in GitHub Desktop.
fhash.py
#!/usr/bin/env python
"""Hash files, like md5sum or shasum, only faster. Works with all
hash functions Python's hashlib supports."""
from __future__ import print_function
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from hashlib import algorithms_available
from hashlib import new as new_hash
from io import open
from multiprocessing import Pool, cpu_count
from multiprocessing.dummy import Pool as ThreadPool
from sys import stderr
class HashFile(object):
    """Callable object that computes the hex digest of a single file.

    The hashing logic lives in a class instance rather than a plain
    function so that it can be handed to a pool as the mapped callable.
    The only state an instance carries is the algorithm name.
    """

    def __init__(self, algo='sha1'):
        """Remember the name of the hashlib algorithm to use."""
        self.algo = algo

    def __call__(self, file_path):
        """Hash ``file_path``; shorthand for ``hash_file(file_path)``."""
        return self.hash_file(file_path)

    def hash_file(self, file_path):
        """Return ``(hexdigest, file_path)`` for the file at ``file_path``.

        When the file cannot be opened or read, the IOError is printed to
        stderr and ``(None, file_path)`` is returned instead.
        """
        digest = new_hash(self.algo)
        # Never let a zero block size reach the arithmetic below.
        unit = max(digest.block_size, 1)
        # Largest multiple of the block size that fits into 1 MiB.
        chunk_size = ((1024 * 1024) // unit) * unit
        try:
            with open(file_path, 'rb') as stream:
                stream.seek(0)
                # read() yields b'' at end of file, which stops the iteration.
                for chunk in iter(lambda: stream.read(chunk_size), b''):
                    digest.update(chunk)
        except IOError as err:
            print(err, file=stderr)
            return None, file_path
        return digest.hexdigest(), file_path
class FileHashes(object):
    """Hash a collection of files concurrently using a worker pool.

    Args:
        files: Iterable of file paths to hash; ``None`` or an empty value
            means "no files".
        thread_count: Number of pool workers. A falsy value (``None`` or 0)
            selects half the CPU count, with a minimum of 1.
        use_real_procs: When true, use a ``multiprocessing`` process pool;
            otherwise use a thread pool.
        use_openssl: Stored on the instance but not consulted by this class;
            kept so existing callers that pass it keep working.
        algo: Name of the hashlib algorithm handed to ``HashFile``.
    """

    def __init__(
        self, files=None, thread_count=None, use_real_procs=False,
        use_openssl=False, algo='sha1'
    ):
        self.files = files if files else []
        self.thread_count = thread_count if thread_count else max(cpu_count() // 2, 1)
        self.use_real_procs = use_real_procs
        self.use_openssl = use_openssl
        self.algo = algo

    def hashes(self):
        """Yield ``(hexdigest, file_path)`` for every file that was hashed.

        Results are produced through ``pool.imap``. Entries whose digest is
        falsy (the hasher reported a failure) are skipped. The pool is always
        shut down when iteration ends, whether it finished normally, a
        worker raised, or the consumer abandoned the generator early.
        """
        # Nothing to do: avoid spinning up workers for an empty file list.
        if not self.files:
            return

        pool_cls = Pool if self.use_real_procs else ThreadPool
        pool = pool_cls(self.thread_count)
        try:
            # Pool.imap can't pickle functions with non-dummy subprocesses,
            # but it can pickle instances of classes.
            # That is why the hash function is implemented as a callable
            # class instead of class methods.
            hasher = HashFile(self.algo)
            for file_hash, file_path in pool.imap(hasher, self.files):
                if file_hash:
                    yield file_hash, file_path
        finally:
            # try/finally instead of ``with``: Python 2 pools are not
            # context managers. terminate() also stops outstanding work when
            # the consumer stopped iterating before all results were read.
            pool.terminate()
            pool.join()
class CommandLine(object):
    """Command-line front end: parse arguments, hash the files, print results."""

    def __call__(self):
        """Parse the command line and print one ``path = digest`` line per hashed file."""
        self.args = self._args()
        options = self.args
        hasher = FileHashes(
            files=options.files,
            thread_count=options.thread_count,
            use_real_procs=options.use_real_procs,
            algo=options.algo,
        )
        for digest, path in hasher.hashes():
            print(path, '=', digest)

    def _args(self):
        """Build the argument parser and return the parsed argument namespace."""
        # Default worker count: half the CPUs, but never fewer than one.
        default_workers = max(cpu_count() // 2, 1)

        cli = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
        cli.add_argument('files', nargs='+')
        cli.add_argument(
            '-a', '--algorithm',
            dest='algo',
            choices=sorted(algorithms_available),
            default='sha1',
            help='Hash algorithm to use',
        )
        cli.add_argument(
            '-t', '--threads',
            dest='thread_count',
            type=int,
            default=default_workers,
            help='Number of threads or processes to use',
        )
        cli.add_argument(
            '-p', '--real-processes',
            dest='use_real_procs',
            action='store_true',
            help='Use full processes instead of threads',
        )
        return cli.parse_args()
# Script entry point: run the command-line interface only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    cmd = CommandLine()
    # CommandLine instances are callable; calling one parses the command
    # line, hashes the given files and prints the results.
    cmd()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment