@j-chimienti
Created January 10, 2024 18:29
delete duplicate files: a small Python script that finds duplicates under a directory by first grouping files by size and then comparing MD5 hashes within each group. By default it only reports them (dry run); pass --delete to remove the duplicate copies.
import os
import hashlib
import argparse


def file_hash(filepath):
    """Return the MD5 hash of the file at the given path."""
    hasher = hashlib.md5()
    with open(filepath, 'rb') as f:
        # Read in 64 KiB chunks so large files never have to fit in memory.
        buf = f.read(65536)
        while len(buf) > 0:
            hasher.update(buf)
            buf = f.read(65536)
    return hasher.hexdigest()
def find_duplicate_files(directory_path, max_depth):
    """Return a list of (original, duplicate) path pairs found under the given directory."""
    sizes = {}
    hashes = {}
    duplicates = []
    root_level = directory_path.rstrip(os.path.sep).count(os.path.sep)
    for dirpath, dirnames, filenames in os.walk(directory_path):
        # Stop descending once the walk goes past max_depth.
        depth = dirpath.count(os.path.sep) - root_level
        if depth > max_depth:
            dirnames[:] = []
            continue
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            # Skip anything that is not a regular file (e.g. a broken symlink),
            # which would otherwise make getsize() or hashing raise.
            if not os.path.isfile(file_path):
                continue
            file_size = os.path.getsize(file_path)
            sizes.setdefault(file_size, []).append(file_path)
    # Only files that share a size can be identical, so hash just those groups.
    for size, paths in sizes.items():
        if len(paths) > 1:
            for path in paths:
                file_hash_value = file_hash(path)
                if file_hash_value in hashes:
                    duplicates.append((hashes[file_hash_value], path))
                else:
                    hashes[file_hash_value] = path
    return duplicates
def main():
    parser = argparse.ArgumentParser(description='Find and optionally delete duplicate files.')
    parser.add_argument('directory', type=str, help='Directory path to search for duplicate files')
    parser.add_argument('--max-depth', type=int, default=1,
                        help='Maximum depth to traverse in directory (default: 1)')
    parser.add_argument('--delete', dest='delete', action='store_true',
                        help='Delete duplicate files (default is a dry run without deletion)')
    args = parser.parse_args()
    if not os.path.isdir(args.directory):
        print("Error: The provided directory does not exist.")
        return
    duplicates = find_duplicate_files(args.directory, args.max_depth)
    if duplicates:
        print("Found duplicate files:")
        for original, duplicate in duplicates:
            print(f"Original: {original}\nDuplicate: {duplicate}\n")
            if args.delete:
                os.remove(duplicate)
                print(f"Deleted: {duplicate}\n")
    else:
        print("No duplicate files found.")


if __name__ == "__main__":
    main()
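Example usage (the script filename below is illustrative; save the gist under any name you like):

    python delete_duplicates.py ~/Downloads                 # dry run, top level only
    python delete_duplicates.py ~/Downloads --max-depth 3   # search three levels deep
    python delete_duplicates.py ~/Downloads --delete        # actually remove duplicates

Note that the first path seen for each hash is treated as the "original", so which copy survives a --delete run depends on os.walk traversal order. Grouping by size before hashing keeps the expensive MD5 work limited to files that could actually be duplicates.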