Skip to content

Instantly share code, notes, and snippets.

@APadierna
Created December 24, 2015 11:34
Show Gist options
  • Save APadierna/a9f77130e0d913f30d76 to your computer and use it in GitHub Desktop.
Save APadierna/a9f77130e0d913f30d76 to your computer and use it in GitHub Desktop.
Script to crawl into a directory and detect (by hash) duplicated files and (optionally) remove them
#!/usr/bin/env python
"""
Script to crawl into a directory and detect (by hash) duplicated files and (optionally)
remove them
Kudos to http://stackoverflow.com/a/748908
"""
import argparse
import hashlib
import os
import sys
def main():
    """Parse command-line arguments and run the duplicate scan.

    Flags:
        --folder  Base search folder (default: current directory).
        --purge   When set, remove each duplicate as it is found.
    """
    parser = argparse.ArgumentParser(description='Recursively seek for duplicated files.')
    parser.add_argument('--folder',
                        default='.',
                        help='Base search folder')
    parser.add_argument('--purge',
                        # store_true is the idiomatic equivalent of
                        # store_const/const=True/default=False.
                        action='store_true',
                        help='Remove duplicated files')
    args = parser.parse_args()
    # Wrap the folder in a list: check_for_duplicates iterates over its
    # `paths` argument, and iterating a bare string would treat every
    # character of the folder name as a separate path.
    check_for_duplicates([args.folder], purge=args.purge)
def check_for_duplicates(paths, hash=hashlib.sha1, purge=False):
    """Walk *paths* recursively, report duplicate files, optionally delete them.

    Files are considered duplicates when both their content hash and their
    size match. For each duplicate pair the previously seen copy is the one
    reported (and removed when *purge* is true).

    Args:
        paths: Iterable of directory paths to walk. A single path string is
            also accepted and treated as one directory.
        hash: Hash constructor used to fingerprint file contents
            (default: hashlib.sha1).
        purge: When True, remove the earlier copy of each duplicate found.
    """
    if isinstance(paths, str):
        # Guard against a bare string, which would otherwise be iterated
        # character by character.
        paths = [paths]
    hashes = {}
    for path in paths:
        for dirpath, _dirnames, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                hashobj = hash()
                # Read in fixed-size chunks inside a context manager so the
                # file handle is always closed (the original leaked it).
                with open(full_path, 'rb') as fobj:
                    for chunk in iter(lambda: fobj.read(65536), b''):
                        hashobj.update(chunk)
                # Key on (digest, size) so a hash collision alone is not
                # enough to flag a duplicate.
                file_id = (hashobj.digest(), os.path.getsize(full_path))
                duplicate = hashes.get(file_id)
                if duplicate:
                    print("Duplicate found: \n\t%s and \n\t%s" % (full_path, duplicate))
                    if purge:
                        print("Removing duplicated file: \n\t%s" % (duplicate))
                        os.remove(duplicate)
                        # Record the surviving copy; otherwise a third
                        # identical file would try to remove the
                        # already-deleted path again.
                        hashes[file_id] = full_path
                else:
                    hashes[file_id] = full_path
def chunk_reader(fobj, chunk_size=1024):
    """Yield successive blocks of at most *chunk_size* bytes read from *fobj*.

    Stops as soon as a read returns an empty (falsy) result, i.e. at EOF.
    """
    chunk = fobj.read(chunk_size)
    while chunk:
        yield chunk
        chunk = fobj.read(chunk_size)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment