antoncohen/fhash.py

## fhash.py
#!/usr/bin/env python

"""Hash files, like md5sum or shasum, only faster. Works with all
hash functions Python's hashlib supports."""

from __future__ import print_function
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from hashlib import algorithms_available
from hashlib import new as new_hash
from io import open
from multiprocessing import Pool, cpu_count
from multiprocessing.dummy import Pool as ThreadPool
from sys import stderr


class HashFile(object):
    """Callable that hashes a single file with a named hashlib algorithm.

    Instances hold only the algorithm name, so they can be handed to a
    worker pool's ``imap`` as the function to apply to each path.
    """

    # Number of digest bytes requested from variable-length (XOF) algorithms
    # such as shake_128/shake_256, whose hexdigest() requires a length.
    xof_digest_bytes = 32

    def __init__(self, algo='sha1'):
        """Remember the name of the hash algorithm to use."""
        self.algo = algo

    def __call__(self, file_path):
        """Hash ``file_path``; equivalent to ``hash_file(file_path)``."""
        return self.hash_file(file_path)

    def hash_file(self, file_path):
        """Return ``(hexdigest, file_path)`` for the file at ``file_path``.

        If the file cannot be opened or read (``IOError``), the error is
        printed to stderr and ``(None, file_path)`` is returned instead.
        """
        file_hash = new_hash(self.algo)
        one_mb = 1024 * 1024
        # Protect against zero or no block_size
        block_size = max(file_hash.block_size, 1)
        # Feed ~1 MiB at a time to the hash algo, as a multiple of block_size.
        # Always read at least one block so the read size can never be zero.
        buf_size = block_size * max(one_mb // block_size, 1)
        try:
            with open(file_path, 'rb') as f:
                # read() returns b'' at end of file, which ends the iteration.
                for buf in iter(lambda: f.read(buf_size), b''):
                    file_hash.update(buf)
        except IOError as e:
            print(e, file=stderr)
            return None, file_path

        if file_hash.digest_size:
            return file_hash.hexdigest(), file_path
        # A digest_size of 0 marks a variable-length algorithm (SHAKE);
        # calling hexdigest() without a length would raise TypeError.
        return file_hash.hexdigest(self.xof_digest_bytes), file_path


class FileHashes(object):
    """Hash a list of files concurrently with a thread or process pool."""

    def __init__(
        self, files=None, thread_count=None, use_real_procs=False,
        use_openssl=False, algo='sha1'
    ):
        """Store the hashing configuration.

        files          -- paths to hash; a falsy value means no files.
        thread_count   -- number of workers; a falsy value (None or 0)
                          selects half the CPU count, with a minimum of 1.
        use_real_procs -- use a process pool instead of a thread pool.
        use_openssl    -- stored on the instance; not read by this class.
        algo           -- hashlib algorithm name given to each HashFile.
        """
        self.files = files if files else []
        self.thread_count = thread_count if thread_count else max(cpu_count() // 2, 1)
        self.use_real_procs = use_real_procs
        self.use_openssl = use_openssl
        self.algo = algo

    def hashes(self):
        """Yield ``(hexdigest, file_path)`` pairs in the order of ``files``.

        Files whose hash came back falsy (HashFile reports failures as
        None) are skipped. The worker pool is shut down when iteration
        finishes, fails, or the generator is closed early.
        """
        # Pool.imap can't pickle functions with non-dummy subprocesses,
        # but it can pickle instances of classes.
        # That is why the hash function is implemented as a callable class
        # instead of class methods.
        hasher = HashFile(self.algo)

        pool_factory = Pool if self.use_real_procs else ThreadPool
        pool = pool_factory(self.thread_count)
        try:
            for file_hash, file_path in pool.imap(hasher, self.files):
                if file_hash:
                    yield file_hash, file_path
        finally:
            # Release the workers; terminate() also drops any work still
            # queued if the consumer stopped iterating early.
            pool.terminate()
            pool.join()


class CommandLine(object):
    """Command-line front end: parse arguments, hash files, print results."""

    def __call__(self):
        """Parse the command line and print one line per hashed file.

        Each line has the form ``<path> = <hexdigest>``.
        """
        self.args = self._args()
        options = self.args
        runner = FileHashes(
            files=options.files,
            thread_count=options.thread_count,
            use_real_procs=options.use_real_procs,
            algo=options.algo,
        )

        for digest, path in runner.hashes():
            print(path, '=', digest)

    def _args(self):
        """Build the argument parser and return the parsed arguments."""
        cli = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)

        # One or more file paths are required.
        cli.add_argument('files', nargs='+')

        # Only algorithms that hashlib reports as available are accepted.
        cli.add_argument(
            '-a', '--algorithm',
            dest='algo',
            default='sha1',
            choices=sorted(algorithms_available),
            help='Hash algorithm to use',
        )

        # Default worker count: half the CPUs, but never fewer than one.
        cli.add_argument(
            '-t', '--threads',
            dest='thread_count',
            type=int,
            default=max(cpu_count() // 2, 1),
            help='Number of threads or processes to use',
        )

        cli.add_argument(
            '-p', '--real-processes',
            dest='use_real_procs',
            action='store_true',
            help='Use full processes instead of threads',
        )

        return cli.parse_args()


if __name__ == '__main__':
    # Run the command-line front end only when executed as a script.
    CommandLine()()
	#!/usr/bin/env python

	"""Hash files, like md5sum or shasum, only faster. Works with all
	hash functions Python's hashlib supports."""

	from __future__ import print_function
	from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
	from hashlib import algorithms_available
	from hashlib import new as new_hash
	from io import open
	from multiprocessing import Pool, cpu_count
	from multiprocessing.dummy import Pool as ThreadPool
	from sys import stderr


	class HashFile(object):
	    """Callable that hashes a single file with a named hashlib algorithm.

	    Instances hold only the algorithm name, so they can be handed to a
	    worker pool's ``imap`` as the function to apply to each path.
	    """

	    # Number of digest bytes requested from variable-length (XOF) algorithms
	    # such as shake_128/shake_256, whose hexdigest() requires a length.
	    xof_digest_bytes = 32

	    def __init__(self, algo='sha1'):
	        """Remember the name of the hash algorithm to use."""
	        self.algo = algo

	    def __call__(self, file_path):
	        """Hash ``file_path``; equivalent to ``hash_file(file_path)``."""
	        return self.hash_file(file_path)

	    def hash_file(self, file_path):
	        """Return ``(hexdigest, file_path)`` for the file at ``file_path``.

	        If the file cannot be opened or read (``IOError``), the error is
	        printed to stderr and ``(None, file_path)`` is returned instead.
	        """
	        file_hash = new_hash(self.algo)
	        one_mb = 1024 * 1024
	        # Protect against zero or no block_size
	        block_size = max(file_hash.block_size, 1)
	        # Feed ~1 MiB at a time to the hash algo, as a multiple of block_size.
	        # Always read at least one block so the read size can never be zero.
	        buf_size = block_size * max(one_mb // block_size, 1)
	        try:
	            with open(file_path, 'rb') as f:
	                # read() returns b'' at end of file, which ends the iteration.
	                for buf in iter(lambda: f.read(buf_size), b''):
	                    file_hash.update(buf)
	        except IOError as e:
	            print(e, file=stderr)
	            return None, file_path

	        if file_hash.digest_size:
	            return file_hash.hexdigest(), file_path
	        # A digest_size of 0 marks a variable-length algorithm (SHAKE);
	        # calling hexdigest() without a length would raise TypeError.
	        return file_hash.hexdigest(self.xof_digest_bytes), file_path


	class FileHashes(object):
	    """Hash a list of files concurrently with a thread or process pool."""

	    def __init__(
	        self, files=None, thread_count=None, use_real_procs=False,
	        use_openssl=False, algo='sha1'
	    ):
	        """Store the hashing configuration.

	        files          -- paths to hash; a falsy value means no files.
	        thread_count   -- number of workers; a falsy value (None or 0)
	                          selects half the CPU count, with a minimum of 1.
	        use_real_procs -- use a process pool instead of a thread pool.
	        use_openssl    -- stored on the instance; not read by this class.
	        algo           -- hashlib algorithm name given to each HashFile.
	        """
	        self.files = files if files else []
	        self.thread_count = thread_count if thread_count else max(cpu_count() // 2, 1)
	        self.use_real_procs = use_real_procs
	        self.use_openssl = use_openssl
	        self.algo = algo

	    def hashes(self):
	        """Yield ``(hexdigest, file_path)`` pairs in the order of ``files``.

	        Files whose hash came back falsy (HashFile reports failures as
	        None) are skipped. The worker pool is shut down when iteration
	        finishes, fails, or the generator is closed early.
	        """
	        # Pool.imap can't pickle functions with non-dummy subprocesses,
	        # but it can pickle instances of classes.
	        # That is why the hash function is implemented as a callable class
	        # instead of class methods.
	        hasher = HashFile(self.algo)

	        pool_factory = Pool if self.use_real_procs else ThreadPool
	        pool = pool_factory(self.thread_count)
	        try:
	            for file_hash, file_path in pool.imap(hasher, self.files):
	                if file_hash:
	                    yield file_hash, file_path
	        finally:
	            # Release the workers; terminate() also drops any work still
	            # queued if the consumer stopped iterating early.
	            pool.terminate()
	            pool.join()


	class CommandLine(object):
	    """Command-line front end: parse arguments, hash files, print results."""

	    def __call__(self):
	        """Parse the command line and print one line per hashed file.

	        Each line has the form ``<path> = <hexdigest>``.
	        """
	        self.args = self._args()
	        file_hashes = FileHashes(
	            files=self.args.files,
	            thread_count=self.args.thread_count,
	            use_real_procs=self.args.use_real_procs,
	            algo=self.args.algo,
	        )

	        for file_hash, file_path in file_hashes.hashes():
	            print(file_path, '=', file_hash)

	    def _args(self):
	        """Build the argument parser and return the parsed arguments."""
	        parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)

	        # One or more file paths are required.
	        parser.add_argument('files', nargs='+')
	        # Only algorithms that hashlib reports as available are accepted.
	        parser.add_argument(
	            '-a', '--algorithm', dest='algo',
	            default='sha1', choices=sorted(algorithms_available),
	            help='Hash algorithm to use'
	        )
	        # Default worker count: half the CPUs, but never fewer than one.
	        parser.add_argument(
	            '-t', '--threads', dest='thread_count',
	            type=int, default=max(cpu_count() // 2, 1),
	            help='Number of threads or processes to use'
	        )
	        parser.add_argument(
	            '-p', '--real-processes', dest='use_real_procs', action='store_true',
	            help='Use full processes instead of threads'
	        )

	        args = parser.parse_args()
	        return args


	if __name__ == '__main__':
	    # Run the command-line front end only when executed as a script.
	    cmd = CommandLine()
	    cmd()