Skip to content

Instantly share code, notes, and snippets.

@tfeldmann
Last active August 1, 2024 07:45
Show Gist options
  • Save tfeldmann/fc875e6630d11f2256e746f67a09c1ae to your computer and use it in GitHub Desktop.
Save tfeldmann/fc875e6630d11f2256e746f67a09c1ae to your computer and use it in GitHub Desktop.
Fast duplicate file finder written in python
#!/usr/bin/env python
"""
Fast duplicate file finder.
Usage: duplicates.py <folder> [<folder>...]
Based on https://stackoverflow.com/a/36113168/300783
Modified for Python3 with some small code improvements.
"""
import os
import sys
import hashlib
from collections import defaultdict
def chunk_reader(fobj, chunk_size=1024):
    """Yield successive pieces of ``fobj``, each read with ``chunk_size``.

    Iteration ends as soon as ``fobj.read`` returns an empty (falsy) result.
    """
    piece = fobj.read(chunk_size)
    while piece:
        yield piece
        piece = fobj.read(chunk_size)
def get_hash(
    filename,
    first_chunk_only=False,
    hash_algo=hashlib.sha1,
    *,
    first_chunk_size=1024,
    chunk_size=65536
):
    """Return the raw digest of a file, or of just its leading bytes.

    Args:
        filename: Path of the file to hash; it is opened in binary mode.
        first_chunk_only: If true, hash only the first ``first_chunk_size``
            bytes instead of the whole file.
        hash_algo: Zero-argument callable returning a hash object that
            provides ``update()`` and ``digest()`` (e.g. ``hashlib.sha1``).
        first_chunk_size: Number of bytes hashed when ``first_chunk_only``
            is true. Defaults to 1024.
        chunk_size: Read size used while streaming the whole file. It only
            affects how much is read per call, never the resulting digest.

    Returns:
        The digest as ``bytes`` (the result of ``digest()``).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    hashobj = hash_algo()
    with open(filename, "rb") as f:
        if first_chunk_only:
            hashobj.update(f.read(first_chunk_size))
        else:
            # Feeding the data piecewise keeps memory use bounded for large
            # files; the digest equals that of the concatenated content.
            for chunk in chunk_reader(f, chunk_size):
                hashobj.update(chunk)
    return hashobj.digest()
def check_for_duplicates(paths):
    """Print every pair of files under ``paths`` whose contents are identical.

    Candidates are narrowed in three passes so that whole files are only
    read when necessary:

    1. group files by size;
    2. within each size group, group by the hash of the first chunk;
    3. within each of those groups, compare hashes of the full content.

    Symlinks are resolved to their targets. Each resolved path is examined
    only once, so a file reached several times (through a symlink, or because
    the given folders overlap or repeat) is not reported as a duplicate of
    itself.

    Args:
        paths: Iterable of directory paths to walk recursively.

    Returns:
        None. Matches are written to standard output.
    """
    files_by_size = defaultdict(list)
    files_by_small_hash = defaultdict(list)
    files_by_full_hash = {}
    seen_paths = set()

    for path in paths:
        for dirpath, _, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                try:
                    # if the target is a symlink (soft one), this will
                    # dereference it - change the value to the actual target file
                    full_path = os.path.realpath(full_path)
                    file_size = os.path.getsize(full_path)
                except OSError:
                    # not accessible (permissions, etc) - pass on
                    continue
                if full_path in seen_paths:
                    # already queued via another route (symlink, overlapping
                    # or repeated folder) - a file is not its own duplicate
                    continue
                seen_paths.add(full_path)
                files_by_size[file_size].append(full_path)

    # For all files with the same file size, get the hash of their first chunk
    for file_size, files in files_by_size.items():
        if len(files) < 2:
            continue  # this file size is unique, no need to spend cpu cycles on it
        for filename in files:
            try:
                small_hash = get_hash(filename, first_chunk_only=True)
            except OSError:
                # the file access might've changed till the exec point got here
                continue
            files_by_small_hash[(file_size, small_hash)].append(filename)

    # For all files sharing size and first-chunk hash, get their hash on the
    # full file - collisions will be duplicates
    for files in files_by_small_hash.values():
        if len(files) < 2:
            # the first-chunk hash is unique -> skip this file
            continue
        for filename in files:
            try:
                full_hash = get_hash(filename, first_chunk_only=False)
            except OSError:
                # the file access might've changed till the exec point got here
                continue
            if full_hash in files_by_full_hash:
                duplicate = files_by_full_hash[full_hash]
                print("Duplicate found:\n - %s\n - %s\n" % (filename, duplicate))
            else:
                # first file seen with this content; later matches are
                # reported against it
                files_by_full_hash[full_hash] = filename
if __name__ == "__main__":
    # Everything after the script name is treated as a folder to scan.
    folders = sys.argv[1:]
    if not folders:
        print("Usage: %s <folder> [<folder>...]" % sys.argv[0])
    else:
        check_for_duplicates(folders)
@jcjveraa
Copy link

Note: when I say that hardlinking can be undone, I mean "... by an expert user, and then still only to a certain extent". Nothing comes for free, and what you will lose/need to restore via other means (if required) is file metadata, including file access permissions. If, as in my case, this is not an issue, then my method works, but your mileage may vary :-)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment