@voyeg3r
Forked from anonymous/finduplicated.py
Last active December 10, 2015 03:18
In my research on duplicate-file solutions, this is a cool piece of code :)
import sys
import os
import hashlib

# source: http://stackoverflow.com/questions/748675/finding-duplicate-files-and-removing-them
# This version uses the file size and a hash of the contents to find duplicates. You can pass
# it multiple paths; it will scan all paths recursively and report all duplicates found.


def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes."""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk


def check_for_duplicates(paths, hash=hashlib.sha1):
    hashes = {}
    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                # Hash the file contents chunk by chunk so large files
                # never have to be loaded into memory all at once.
                hashobj = hash()
                with open(full_path, 'rb') as fobj:
                    for chunk in chunk_reader(fobj):
                        hashobj.update(chunk)
                # Key on (digest, size): two files only count as duplicates
                # when both the content hash and the file size match.
                file_id = (hashobj.digest(), os.path.getsize(full_path))
                duplicate = hashes.get(file_id, None)
                if duplicate:
                    print("Duplicate found: %s and %s" % (full_path, duplicate))
                else:
                    hashes[file_id] = full_path


if __name__ == '__main__':
    if sys.argv[1:]:
        check_for_duplicates(sys.argv[1:])
    else:
        print("Please pass the paths to check as parameters to the script")