@ntjess
Forked from tfeldmann/duplicates.py
Last active April 16, 2023 03:11
Fast duplicate file finder written in Python
#!/usr/bin/env python
"""
Fast duplicate file finder.
Usage: duplicates.py <folder> [<folder>...]
Based on https://stackoverflow.com/a/36113168/300783
Modified for Python3 with some small code improvements.
"""
import os
import sys
import hashlib
from collections import defaultdict


def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes."""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk


def get_hash(filename, first_chunk_only=False, hash_algo=hashlib.sha1):
    hashobj = hash_algo()
    with open(filename, "rb") as f:
        if first_chunk_only:
            hashobj.update(f.read(1024))
        else:
            for chunk in chunk_reader(f):
                hashobj.update(chunk)
    return hashobj.digest()


def check_for_duplicates(paths):
    files_by_size = defaultdict(list)
    files_by_small_hash = defaultdict(list)
    files_by_full_hash = defaultdict(list)

    for path in paths:
        for dirpath, _, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                try:
                    # if the target is a symlink (soft one), this will
                    # dereference it - change the value to the actual target file
                    full_path = os.path.realpath(full_path)
                    file_size = os.path.getsize(full_path)
                except OSError:
                    # not accessible (permissions, etc.) - skip it
                    continue
                files_by_size[file_size].append(full_path)

    # For all files with the same size, hash their first 1024 bytes
    for file_size, files in files_by_size.items():
        if len(files) < 2:
            continue  # this file size is unique, no need to spend cpu cycles on it

        for filename in files:
            try:
                small_hash = get_hash(filename, first_chunk_only=True)
            except OSError:
                # the file may have become inaccessible since it was listed
                continue
            files_by_small_hash[(file_size, small_hash)].append(filename)

    # For all files sharing the same first-1024-byte hash, hash the full
    # file - collisions here are duplicates
    for files in files_by_small_hash.values():
        if len(files) < 2:
            # the hash of the first 1024 bytes is unique -> skip this file
            continue

        for filename in files:
            try:
                full_hash = get_hash(filename, first_chunk_only=False)
            except OSError:
                # the file may have become inaccessible since it was listed
                continue
            # Add this file to the list of others sharing the same full hash
            files_by_full_hash[full_hash].append(filename)

    # Finally, print a summary of all files that share a full hash
    for file_list in files_by_full_hash.values():
        if len(file_list) < 2:
            # Only one file, it's unique
            continue
        # More than one file shares the same full hash
        # Turn [filea, fileb, filec] into
        #   - filea
        #   - fileb
        #   - filec
        files_str = "\n".join("- %s" % file for file in file_list)
        print("Duplicate found:\n%s\n" % files_str)


if __name__ == "__main__":
    if sys.argv[1:]:
        check_for_duplicates(sys.argv[1:])
    else:
        print("Usage: %s <folder> [<folder>...]" % sys.argv[0])
@philmae

philmae commented Nov 11, 2022

Like your improvements on the original script. Way cleaner.
I did a few minor additions to fit my purpose.
