Fast duplicate file finder written in Python
#!/usr/bin/env python3
"""
Fast duplicate file finder.
Usage: duplicates.py <folder> [<folder>...]

Based on https://stackoverflow.com/a/36113168/300783
Modified for Python 3 with some small code improvements.
"""
import os
import sys
import hashlib
from collections import defaultdict


def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes."""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk


def get_hash(filename, first_chunk_only=False, hash_algo=hashlib.sha1):
    """Return the digest of a file, or of just its first 1024 bytes."""
    hashobj = hash_algo()
    with open(filename, "rb") as f:
        if first_chunk_only:
            hashobj.update(f.read(1024))
        else:
            for chunk in chunk_reader(f):
                hashobj.update(chunk)
    return hashobj.digest()


def check_for_duplicates(paths):
    """Walk the given paths and print groups of files with identical content."""
    files_by_size = defaultdict(list)
    files_by_small_hash = defaultdict(list)
    files_by_full_hash = defaultdict(list)

    for path in paths:
        for dirpath, _, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                try:
                    # if the target is a symlink (soft one), dereference it and
                    # work with the actual target file instead
                    full_path = os.path.realpath(full_path)
                    file_size = os.path.getsize(full_path)
                except OSError:
                    # not accessible (permissions, etc.) - skip it
                    continue
                files_by_size[file_size].append(full_path)

    # For all files with the same size, hash their first 1024 bytes
    for file_size, files in files_by_size.items():
        if len(files) < 2:
            continue  # this file size is unique, no need to spend CPU cycles on it

        for filename in files:
            try:
                small_hash = get_hash(filename, first_chunk_only=True)
            except OSError:
                # the file may have been removed or become unreadable since it was scanned
                continue
            files_by_small_hash[(file_size, small_hash)].append(filename)

    # For all files whose first 1024 bytes hash the same, hash the full file -
    # collisions here are duplicates
    for files in files_by_small_hash.values():
        if len(files) < 2:
            # the hash of the first 1k bytes is unique -> skip this file
            continue

        for filename in files:
            try:
                full_hash = get_hash(filename, first_chunk_only=False)
            except OSError:
                # the file may have been removed or become unreadable since it was scanned
                continue
            # Add this file to the list of others sharing the same full hash
            files_by_full_hash[full_hash].append(filename)

    # Now, print a summary of all files that share a full hash
    for file_list in files_by_full_hash.values():
        if len(file_list) < 2:
            # Only one file - it's unique
            continue
        # More than one file shares the same full hash:
        # turn [filea, fileb, filec] into
        #   - filea
        #   - fileb
        #   - filec
        files_str = "\n".join("- %s" % file for file in file_list)
        print("Duplicate found:\n%s\n" % files_str)


if __name__ == "__main__":
    if sys.argv[1:]:
        check_for_duplicates(sys.argv[1:])
    else:
        print("Usage: %s <folder> [<folder>...]" % sys.argv[0])
I like your improvements on the original script. Way cleaner.
I made a few minor additions to fit my purpose.