Created
October 24, 2016 00:52
-
-
Save antoncohen/11fca2b1723c3d201282cc070fbd0bf1 to your computer and use it in GitHub Desktop.
fhash.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""Hash files, like md5sum or shasum, only faster. Works with all | |
hash functions Python's hashlib supports.""" | |
from __future__ import print_function | |
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter | |
from hashlib import algorithms_available | |
from hashlib import new as new_hash | |
from io import open | |
from multiprocessing import Pool, cpu_count | |
from multiprocessing.dummy import Pool as ThreadPool | |
from sys import stderr | |
class HashFile(object):
    """Callable that hashes one file with a configurable hashlib algorithm.

    Implemented as a callable class (rather than a closure or bound method)
    so instances can be pickled and handed to multiprocessing.Pool workers.
    """

    def __init__(self, algo='sha1'):
        # Name of a hashlib algorithm, e.g. 'sha1', 'md5', 'sha256'.
        self.algo = algo

    def __call__(self, file_path):
        return self.hash_file(file_path)

    def hash_file(self, file_path):
        """Hash a single file.

        Returns a (hexdigest, file_path) tuple. hexdigest is None when the
        file cannot be read; the error is reported on stderr.
        """
        file_hash = new_hash(self.algo)
        one_mb = 1024 * 1024
        # Protect against zero or no block_size
        block_size = max(file_hash.block_size, 1)
        # Feed ~1 MiB at a time to the hash algo, as a multiple of block_size.
        # Floor at block_size: if block_size ever exceeded 1 MiB the original
        # expression yielded 0, making read() return b'' and silently hashing
        # every file as empty.
        buf_size = max(block_size * (one_mb // block_size), block_size)
        try:
            # A freshly opened file is already at offset 0; no seek() needed.
            with open(file_path, 'rb') as f:
                buf = f.read(buf_size)
                while buf:
                    file_hash.update(buf)
                    buf = f.read(buf_size)
        except IOError as e:
            print(e, file=stderr)
            return None, file_path
        return file_hash.hexdigest(), file_path
class FileHashes(object):
    """Hash a list of files concurrently, yielding (hexdigest, path) pairs."""

    def __init__(
        self, files=None, thread_count=None, use_real_procs=False,
        use_openssl=False, algo='sha1'
    ):
        # Paths to hash; defaults to an empty list.
        self.files = files if files else []
        # Default to half the CPUs, but always at least one worker.
        self.thread_count = thread_count if thread_count else max(cpu_count() // 2, 1)
        # True: full processes (CPU-bound parallelism); False: threads.
        self.use_real_procs = use_real_procs
        # NOTE(review): use_openssl is stored but never consulted anywhere in
        # this class -- confirm whether it can be dropped or should be wired up.
        self.use_openssl = use_openssl
        # hashlib algorithm name passed through to HashFile.
        self.algo = algo

    def hashes(self):
        """Generator yielding (hexdigest, file_path) for each readable file.

        Unreadable files (where HashFile returned a None digest) are skipped.
        """
        if self.use_real_procs:
            pool = Pool(self.thread_count)
        else:
            pool = ThreadPool(self.thread_count)
        # Pool.imap can't pickle plain functions with non-dummy subprocesses,
        # but it can pickle class instances -- hence the callable HashFile
        # class instead of a module-level function or method.
        hasher = HashFile(self.algo)
        try:
            for file_hash, file_path in pool.imap(hasher, self.files):
                if file_hash:
                    yield file_hash, file_path
        finally:
            # The original leaked the pool; shut the workers down
            # deterministically even if the consumer abandons the generator.
            pool.close()
            pool.join()
class CommandLine(object):
    """Argument parsing and output formatting for the fhash command line."""

    def __call__(self):
        """Parse arguments, hash the requested files, print the results."""
        self.args = self._args()
        hasher = FileHashes(
            files=self.args.files,
            thread_count=self.args.thread_count,
            use_real_procs=self.args.use_real_procs,
            algo=self.args.algo,
        )
        for digest, path in hasher.hashes():
            print(path, '=', digest)

    def _args(self):
        """Build the argument parser and return the parsed namespace."""
        parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
        parser.add_argument('files', nargs='+')
        parser.add_argument(
            '-a', '--algorithm', dest='algo',
            default='sha1', choices=sorted(algorithms_available),
            help='Hash algorithm to use'
        )
        parser.add_argument(
            '-t', '--threads', dest='thread_count',
            type=int, default=max(cpu_count() // 2, 1),
            help='Number of threads or processes to use'
        )
        parser.add_argument(
            '-p', '--real-processes', dest='use_real_procs', action='store_true',
            help='Use full processes instead of threads'
        )
        return parser.parse_args()
if __name__ == '__main__':
    # Instantiate and immediately invoke the CLI entry point.
    CommandLine()()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment