@philmae
Forked from ntjess/duplicates.py
Last active November 11, 2022 22:40
Fast duplicate file finder written in Python
#!/usr/bin/env python3
"""
Fast duplicate file finder.
Usage: duplicates.py <folder> [<folder>...]
Based on https://stackoverflow.com/a/36113168/300783
Modified for Python3 with some small code improvements.
## Problem Statement
A file system storing hundreds of terabytes of data, with a mix of data types and
file sizes, where duplicates occur very often. The script needs a fast runtime to
crawl the file storage efficiently and output the duplicates along with their file sizes.

## Description
This script is based on https://stackoverflow.com/a/36113168/300783
with additions included from https://gist.github.com/ntjess/1663d25d09bd762af2f0c60f600191f5
and further code improvements of my own.

The solution takes an iterative approach to the file scan:
* Hash table of files keyed by file size alone - only files that share a size move on to the next check
* Hash table of those same-size files keyed by the hash of their first 1024 bytes - non-colliding entries are unique and dropped
* Hash table of files with the same first 1k bytes keyed by their full hash - files with matching full hashes are duplicates
"""
import os
import sys
import hashlib
from collections import defaultdict

def chunk_reader(fobj, chunk_size=1024):
    """ Generator that reads a file in chunks of bytes """
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk

def get_hash(filename, first_chunk_only=False, hash_algo=hashlib.sha1):
    hashobj = hash_algo()
    with open(filename, "rb") as f:
        if first_chunk_only:
            hashobj.update(f.read(1024))
        else:
            for chunk in chunk_reader(f):
                hashobj.update(chunk)
    return hashobj.digest()
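
# Illustrative example only - '/tmp/sample.bin' is a placeholder path:
#   get_hash('/tmp/sample.bin', first_chunk_only=True)  # digest of the first 1024 bytes
#   get_hash('/tmp/sample.bin')                          # digest of the whole file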

# Converts a byte count into a human-readable string
def convert_bytes(num):
    for unit in ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB']:
        if abs(num) < 1024.0:
            return "%3.1f %s" % (num, unit)
        num /= 1024.0
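
# Illustrative examples: convert_bytes(1536) returns '1.5 KB',
# convert_bytes(3 * 1024 ** 3) returns '3.0 GB'.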

# Returns the file size as a human-readable string
def file_size_check(file_path):
    if os.path.isfile(file_path):
        file_info = os.stat(file_path)
        return convert_bytes(file_info.st_size)

def check_for_duplicates(paths):
    files_by_size = defaultdict(list)
    files_by_small_hash = defaultdict(list)
    files_by_full_hash = defaultdict(list)

    for path in paths:
        for dirpath, _, filenames in os.walk(path):
            # Show which folder is being scanned
            print('Scanning %s...' % dirpath)
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                try:
                    # if the target is a symlink (soft one), this will
                    # dereference it - change the value to the actual target file
                    full_path = os.path.realpath(full_path)
                    file_size = os.path.getsize(full_path)
                except OSError:
                    # not accessible (permissions, etc.) - skip it
                    continue
                files_by_size[file_size].append(full_path)

    c = 0
    # For all files with the same file size, get the hash of their first 1024 bytes
    for file_size, files in files_by_size.items():
        if len(files) < 2:
            c += 1
            continue  # this file size is unique, no need to spend cpu cycles on it
        for filename in files:
            try:
                small_hash = get_hash(filename, first_chunk_only=True)
            except OSError:
                # the file may have become inaccessible since it was listed
                continue
            files_by_small_hash[(file_size, small_hash)].append(filename)

    if len(files_by_size) == c:
        print('No duplicate files found.')
    else:
        print('\n --- Duplicates detected --- \n')

        # For all files sharing the hash of the first 1024 bytes, get the hash of the
        # full file - collisions will be duplicates
        for files in files_by_small_hash.values():
            if len(files) < 2:
                # the hash of the first 1k bytes is unique -> skip this file
                continue
            for filename in files:
                try:
                    full_hash = get_hash(filename, first_chunk_only=False)
                except OSError:
                    # the file may have become inaccessible since it was listed
                    continue
                # Add this file to the list of others sharing the same full hash
                files_by_full_hash[full_hash].append(filename)

        # Now, print a summary of all files that share a full hash
        for file_list in files_by_full_hash.values():
            if len(file_list) < 2:
                # Only one file - it's unique
                continue
            # More than one file shares the same full hash
            # Turn [filea, fileb, filec] into
            # - filea
            # - fileb
            # - filec
            files_str = "\n".join("- %s" % file for file in file_list)
            print('Duplicates:')
            # Also show the file size to check whether deduplicating is worth it
            for result in file_list[:1]:
                print("Size of file :", file_size_check(result))
            print('___________________')
            print(" \n%s\n" % files_str)

    print('File Scan Complete')

if __name__ == "__main__":
    if sys.argv[1:]:
        check_for_duplicates(sys.argv[1:])
    else:
        print("Usage: %s <folder> [<folder>...]" % sys.argv[0])