Created
January 10, 2024 18:29
-
-
Save j-chimienti/ef9ebd674dffb7c9982205c2be2e13d6 to your computer and use it in GitHub Desktop.
Delete duplicate files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import hashlib | |
import argparse | |
def file_hash(filepath):
    """Return the MD5 hex digest of the file at *filepath*.

    The file is read in 64 KiB chunks so arbitrarily large files can be
    hashed without loading them entirely into memory.

    Raises OSError if the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(filepath, 'rb') as stream:
        while chunk := stream.read(65536):
            digest.update(chunk)
    return digest.hexdigest()


def find_duplicate_files(directory_path, max_depth):
    """Return a list of ``(original, duplicate)`` path pairs under *directory_path*.

    Files are first grouped by size (cheap) and only same-size candidates
    are hashed (expensive), so files with a unique size are never read.
    The walk is limited to *max_depth* directory levels below
    *directory_path*.  Files that vanish mid-walk or cannot be read
    (broken symlinks, permission errors) are skipped instead of aborting
    the whole scan.
    """
    sizes = {}  # file size -> list of paths with that size
    root_level = directory_path.rstrip(os.path.sep).count(os.path.sep)
    for dirpath, dirnames, filenames in os.walk(directory_path):
        depth = dirpath.count(os.path.sep) - root_level
        if depth >= max_depth:
            # Prune in-place so os.walk never descends below max_depth.
            # (Pruning only at depth > max_depth would walk one level too
            # deep before giving up.)
            dirnames[:] = []
        if depth > max_depth:
            # Only reachable with a negative max_depth; preserves the
            # original "skip everything" behavior in that case.
            continue
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            try:
                sizes.setdefault(os.path.getsize(file_path), []).append(file_path)
            except OSError:
                # Broken symlink, permission error, or file removed
                # between listing and stat — skip it, keep scanning.
                continue

    duplicates = []
    hashes = {}  # content hash -> first path seen with that content
    for paths in sizes.values():
        if len(paths) < 2:
            continue  # a unique size cannot have a duplicate
        for path in paths:
            try:
                digest = file_hash(path)
            except OSError:
                continue  # unreadable since the size pass — skip
            if digest in hashes:
                duplicates.append((hashes[digest], path))
            else:
                hashes[digest] = path
    return duplicates
def main():
    """Command-line entry point: report and optionally delete duplicate files.

    Without ``--delete`` this is a dry run that only prints the pairs found.
    """
    parser = argparse.ArgumentParser(description='Find and optionally delete duplicate files.')
    parser.add_argument('directory', type=str, help='Directory path to search for duplicate files')
    parser.add_argument('--max-depth', type=int, default=1, help='Maximum depth to traverse in directory (default: 1)')
    parser.add_argument('--delete', dest='delete', action='store_true',
                        help='Delete duplicate files (default is a dry run without deletion)')
    args = parser.parse_args()

    # Guard clause: bail out early on a bad directory argument.
    if not os.path.isdir(args.directory):
        print("Error: The provided directory does not exist.")
        return

    duplicates = find_duplicate_files(args.directory, args.max_depth)
    if not duplicates:
        print("No duplicate files found.")
        return

    print("Found duplicate files:")
    for original, duplicate in duplicates:
        print(f"Original: {original}\nDuplicate: {duplicate}\n")
        if args.delete:
            try:
                os.remove(duplicate)
            except OSError as err:
                # One undeletable file (permissions, already gone) should
                # not abort the remaining deletions with a traceback.
                print(f"Could not delete {duplicate}: {err}\n")
            else:
                print(f"Deleted: {duplicate}\n")
# Standard script guard: run the CLI only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment