@philmae
Forked from ntjess/duplicates.py
Last active November 11, 2022 22:40
Fast duplicate file finder written in Python
#!/usr/bin/env python3
"""
Fast duplicate file finder.
Usage: duplicates.py <folder> [<folder>...]
Based on https://stackoverflow.com/a/36113168/300783
Modified for Python3 with some small code improvements.
## Problem Statement
A file system storing hundreds of terabytes of data, with a mix of data types and
file sizes, where duplicates occur very often. The script needs a fast runtime to
crawl the file storage efficiently and output the duplicates along with their file sizes.

## Description
This script is based on https://stackoverflow.com/a/36113168/300783
with additions included from https://gist.github.com/ntjess/1663d25d09bd762af2f0c60f600191f5
and further code improvements of my own.

The solution takes an iterative approach to the file scan:
* Hash table of files keyed by file size alone - only files that share a size move on to the next check
* Hash table of those same-size files keyed by the hash of their first 1024 bytes - non-colliding entries are unique and dropped
* Hash table of files with the same first 1k bytes keyed by their full hash - files with matching full hashes are duplicates
"""
import os
import sys
import hashlib
from collections import defaultdict

def chunk_reader(fobj, chunk_size=1024):
    """ Generator that reads a file in chunks of bytes """
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk

def get_hash(filename, first_chunk_only=False, hash_algo=hashlib.sha1):
    hashobj = hash_algo()
    with open(filename, "rb") as f:
        if first_chunk_only:
            hashobj.update(f.read(1024))
        else:
            for chunk in chunk_reader(f):
                hashobj.update(chunk)
    return hashobj.digest()
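
# Illustrative example only - '/tmp/sample.bin' is a placeholder path:
#   get_hash('/tmp/sample.bin', first_chunk_only=True)  # digest of the first 1024 bytes
#   get_hash('/tmp/sample.bin')                          # digest of the whole file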

# Converts a byte count into a human-readable string
def convert_bytes(num):
    for unit in ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB']:
        if abs(num) < 1024.0:
            return "%3.1f %s" % (num, unit)
        num /= 1024.0
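
# Illustrative examples: convert_bytes(1536) returns '1.5 KB',
# convert_bytes(3 * 1024 ** 3) returns '3.0 GB'.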

# Returns the file size as a human-readable string
def file_size_check(file_path):
    if os.path.isfile(file_path):
        file_info = os.stat(file_path)
        return convert_bytes(file_info.st_size)

def check_for_duplicates(paths):
    files_by_size = defaultdict(list)
    files_by_small_hash = defaultdict(list)
    files_by_full_hash = defaultdict(list)

    for path in paths:
        for dirpath, _, filenames in os.walk(path):
            # Show which folder is being scanned
            print('Scanning %s...' % dirpath)
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                try:
                    # if the target is a symlink (soft one), this will
                    # dereference it - change the value to the actual target file
                    full_path = os.path.realpath(full_path)
                    file_size = os.path.getsize(full_path)
                except OSError:
                    # not accessible (permissions, etc.) - skip it
                    continue
                files_by_size[file_size].append(full_path)

    c = 0
    # For all files with the same file size, get the hash of their first 1024 bytes
    for file_size, files in files_by_size.items():
        if len(files) < 2:
            c += 1
            continue  # this file size is unique, no need to spend cpu cycles on it
        for filename in files:
            try:
                small_hash = get_hash(filename, first_chunk_only=True)
            except OSError:
                # the file may have become inaccessible since it was listed
                continue
            files_by_small_hash[(file_size, small_hash)].append(filename)

    if len(files_by_size) == c:
        print('No duplicate files found.')
    else:
        print('\n --- Duplicates detected --- \n')

        # For all files sharing the hash of the first 1024 bytes, get the hash of the
        # full file - collisions will be duplicates
        for files in files_by_small_hash.values():
            if len(files) < 2:
                # the hash of the first 1k bytes is unique -> skip this file
                continue
            for filename in files:
                try:
                    full_hash = get_hash(filename, first_chunk_only=False)
                except OSError:
                    # the file may have become inaccessible since it was listed
                    continue
                # Add this file to the list of others sharing the same full hash
                files_by_full_hash[full_hash].append(filename)

        # Now, print a summary of all files that share a full hash
        for file_list in files_by_full_hash.values():
            if len(file_list) < 2:
                # Only one file - it's unique
                continue
            # More than one file shares the same full hash
            # Turn [filea, fileb, filec] into
            # - filea
            # - fileb
            # - filec
            files_str = "\n".join("- %s" % file for file in file_list)
            print('Duplicates:')
            # Also show the file size to check whether deduplicating is worth it
            for result in file_list[:1]:
                print("Size of file :", file_size_check(result))
            print('___________________')
            print(" \n%s\n" % files_str)

    print('File Scan Complete')

if __name__ == "__main__":
    if sys.argv[1:]:
        check_for_duplicates(sys.argv[1:])
    else:
        print("Usage: %s <folder> [<folder>...]" % sys.argv[0])