Skip to content

Instantly share code, notes, and snippets.

@prog893
Last active July 12, 2023 19:18
Show Gist options
  • Save prog893/30407cfe017af4107d2fa531567bb5df to your computer and use it in GitHub Desktop.
Save prog893/30407cfe017af4107d2fa531567bb5df to your computer and use it in GitHub Desktop.
Duplicate file cleaner

Duplicate file cleaner

This Python script helps you to organize your files in a unique and efficient way. It performs three main tasks:

  1. (optional, off by default) Rename Files: The script helps you organize your files by slightly adjusting the way your folders are named. Imagine you have a file in a subfolder like this: X/A/B/file.txt. The script will change this to: X/A B/file.txt. This means that the deepest level of your folder structure is brought up one level. This can make it easier to see the overall structure of your files and folders at a glance.

  2. Delete Duplicate Files: The script identifies duplicate files based on their SHA-256 hashes. For each group of duplicates, it asks the user to choose which file to keep and deletes the rest. The user is presented with an alphabetically sorted list of file paths and can either enter the number of the file to keep or press enter to choose the shortest file path by default.

  3. Delete Empty Directories: After moving and deleting files, some directories might become empty. The script deletes these directories to keep your file system clean.

The script also includes a preview mode, which shows what changes would be made but doesn't actually make them. This allows you to see the effects of the script without committing to them.

Usage

You can run the script from the command line with the path to the directory you want to organize as an argument:

python prune_dupes.py ./

The script will first run in preview mode. It will print the changes it would make and ask you to confirm before making them. If you're happy with the changes, enter yes when prompted to apply them. If you're not happy with the changes, enter no to exit the script without making changes.

Requirements

The script requires Python 3.6 or later. It doesn't require any external libraries.

import argparse
import hashlib
import os
import sys
from typing import List
def calculate_file_hash(file_path: str, algorithm: str = 'sha256', chunk_size: int = 8192) -> str:
    """
    Calculate the hash of a file, reading it in fixed-size chunks.

    Args:
        file_path (str): The path to the file.
        algorithm (str, optional): Name of an algorithm accepted by hashlib.new().
            Defaults to 'sha256'.
        chunk_size (int, optional): Number of bytes to read per iteration.
            Defaults to 8192.

    Returns:
        str: The hexadecimal digest of the file contents.

    Raises:
        ValueError: If chunk_size is not positive or the algorithm is unsupported.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size <= 0:
        # read(0) returns b'' at once (yielding the digest of nothing) and
        # read(-1) loads the whole file into memory; neither is intended.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    hash_func = hashlib.new(algorithm)
    with open(file_path, 'rb') as f:
        # iter() with a sentinel keeps calling read() until it returns b'' (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b''):
            hash_func.update(chunk)
    return hash_func.hexdigest()
def replace_last_substring(s: str, old: str, new: str) -> str:
    """
    Swap the right-most occurrence of a substring for another string.

    Args:
        s (str): The original string.
        old (str): The substring to look for (must be non-empty).
        new (str): The text that takes the place of the last match.

    Returns:
        str: A copy of s with the final occurrence of old replaced, or s
        unchanged when old does not occur in it.
    """
    # rpartition cuts the string around the right-most match of `old`
    before, matched, after = s.rpartition(old)
    if not matched:
        # no occurrence: rpartition leaves the whole input in the last slot
        return after
    return before + new + after
def rename_files_in_directory(target_path: str, preview: bool = False) -> None:
    """
    Move files so that each directory name is merged into its parent's level.

    For every directory below target_path, the last path separator of the
    directory path is replaced by a space, and the directory's files are moved
    to that new location (e.g. ``X/A/B/file.txt`` becomes ``X/A B/file.txt``).
    Hidden files (names starting with '.') and non-regular files are skipped.

    Files located directly in target_path are never moved: rewriting the
    separator of the target itself would relocate them outside the tree.
    Existing destination files are never overwritten.

    Args:
        target_path (str): The path to the directory.
        preview (bool, optional): Whether to only preview the changes. Defaults to False.
    """
    top = os.path.normpath(target_path)
    for root, _dirs, files in os.walk(target_path):
        if os.path.normpath(root) == top:
            # file is in root so rename would break path; compare normalized
            # paths so this holds for './', 'X/', absolute paths, etc.
            continue
        # os.path.split handles the platform's separator(s), unlike a literal "/".
        parent, leaf = os.path.split(root)
        new_root = f"{parent} {leaf}"
        for file in files:
            old_path = os.path.join(root, file)
            if file.startswith('.') or not os.path.isfile(old_path):
                continue
            new_path = os.path.join(new_root, file)
            if os.path.lexists(new_path):
                # os.rename silently replaces an existing file on POSIX; refuse instead.
                print(f"Skipped {old_path}: {new_path} already exists")
                continue
            if preview:
                print(f"Preview: Rename {old_path} to {new_path}")
                continue
            try:
                os.makedirs(new_root, exist_ok=True)  # create the new directory if it doesn't exist
                os.rename(old_path, new_path)
                print(f"Renamed {old_path} to {new_path}")
            except OSError as e:
                print(f"Error renaming {old_path} to {new_path}: {e}")
def delete_empty_directories(target_path: str, preview: bool = False) -> None:
    """
    Delete empty directories in a directory and its subdirectories.

    A directory counts as empty when it has no entries, or when its only entry
    is a '.DS_Store' file (which is removed together with the directory).
    The tree is walked bottom-up, so a directory that only contains empty
    directories is removed as well. target_path itself is never removed.
    Symbolic links to directories are left untouched.

    Args:
        target_path (str): The path to the directory.
        preview (bool, optional): Whether to only preview the changes. Defaults to False.
    """
    # Directories the preview has already reported as deletable. They still
    # exist on disk during a preview, so they are discounted when judging
    # their parent; this keeps the preview in line with the real run.
    pending = set()
    for root, dirs, _files in os.walk(target_path, topdown=False):
        for directory in dirs:
            dir_path = os.path.join(root, directory)
            if os.path.islink(dir_path):
                # rmdir cannot remove a symlink, and cleaning '.DS_Store' through
                # the link would modify a directory that may live outside the tree.
                continue
            try:
                entries = os.listdir(dir_path)
            except OSError as e:
                print(f"Error deleting {dir_path}: {e}")
                continue
            remaining = [
                entry for entry in entries
                if os.path.join(dir_path, entry) not in pending
            ]
            if remaining and remaining != ['.DS_Store']:
                continue
            if preview:
                pending.add(dir_path)
                print(f"Preview: Delete empty directory: {dir_path}")
                continue
            try:
                # If .DS_Store file exists, delete it
                if remaining:
                    ds_store_path = os.path.join(dir_path, '.DS_Store')
                    os.remove(ds_store_path)
                    print(f"Deleted: {ds_store_path}")
                os.rmdir(dir_path)
                print(f"Deleted empty directory: {dir_path}")
            except OSError as e:
                print(f"Error deleting {dir_path}: {e}")
def delete_duplicate_files(target_path: str, preview: bool = False) -> None:
    """
    Find and delete duplicate files in a directory and its subdirectories.

    Files are grouped by the digest returned by calculate_file_hash(). For each
    group with more than one member the user is asked (via
    ask_user_to_choose_file) which path to keep; the others are deleted, or
    only listed when preview is True. Note that the user is prompted in
    preview mode as well.

    Symbolic links and non-regular files are ignored, and files that cannot be
    read or removed are reported and skipped instead of aborting the run.

    Args:
        target_path (str): The path to the directory.
        preview (bool, optional): Whether to only preview the changes. Defaults to False.
    """
    paths_by_hash = {}
    for path, _names, filenames in os.walk(target_path):
        for filename in filenames:
            file_path = os.path.join(path, filename)
            # A symlink hashes the same as its target; offering it as a
            # "duplicate" could lead to deleting the real file and keeping
            # only the link. Non-regular files (FIFOs, sockets) cannot be hashed.
            if os.path.islink(file_path) or not os.path.isfile(file_path):
                continue
            try:
                file_hash = calculate_file_hash(file_path)
            except OSError as e:
                print(f"Error reading {file_path}: {e}")
                continue
            paths_by_hash.setdefault(file_hash, []).append(file_path)

    for duplicates in paths_by_hash.values():
        if len(duplicates) < 2:
            # hash seen only once: not a duplicate
            continue
        # pass a copy so the chooser cannot reorder our list
        file_to_keep = ask_user_to_choose_file(list(duplicates))
        for file_to_delete in duplicates:
            if file_to_delete == file_to_keep:
                continue
            if preview:
                print(f"Would delete: {file_to_delete}")
                continue
            try:
                os.remove(file_to_delete)
                print(f"Deleted: {file_to_delete}")
            except OSError as e:
                print(f"Error deleting {file_to_delete}: {e}")
def ask_user_to_choose_file(files: List[str]) -> str:
    """
    Ask the user to choose a file to keep.

    The paths are listed in alphabetical order, numbered from 1. The user
    either enters one of the numbers or presses enter to accept the default,
    which is the shortest path. The prompt repeats until the input is valid.
    The list passed in is not modified.

    Args:
        files (List[str]): A list of file paths (must not be empty).

    Returns:
        str: The file path chosen by the user.

    Raises:
        ValueError: If files is empty.
    """
    ordered = sorted(files)  # alphabetical order, without touching the caller's list
    shortest_file = min(ordered, key=len)  # find the shortest file path
    default_choice = ordered.index(shortest_file) + 1  # get the number of the shortest file path
    while True:
        for i, file in enumerate(ordered, start=1):
            print(f"{i}: {file}")
        choice = input(
            f"Which file do you want to keep? (1-{len(ordered)}, default is {default_choice}): ").strip()
        if not choice:  # if the user presses enter with no input
            return shortest_file
        # isdecimal() (unlike isdigit()) only accepts characters int() can parse;
        # e.g. '²'.isdigit() is True but int('²') raises ValueError.
        if choice.isdecimal() and 1 <= int(choice) <= len(ordered):
            return ordered[int(choice) - 1]
        print(
            f"Invalid input. Please enter a number between 1 and {len(ordered)} or press enter to choose the default.")
# Pre-flight argument check. It only runs when the file is executed as a
# script, so importing the module never reads sys.argv or exits the interpreter.
if __name__ == '__main__':
    # Check if the folder path is provided as a command line argument
    if len(sys.argv) < 2:
        print("Usage: python script.py folder_path")
        sys.exit(1)
    # Get the folder path from the command line argument
    folder_path = sys.argv[1]
    # Check if the specified directory exists. Option flags such as -h/--help
    # are let through so that argparse in main() can handle them.
    if not folder_path.startswith('-') and not os.path.isdir(folder_path):
        print("Error: The specified directory does not exist.")
        sys.exit(1)
def main():
    """
    Command-line entry point: preview the clean-up, then apply it on confirmation.

    Parses the folder path from the command line, runs the duplicate and
    empty-directory passes in preview mode, and asks whether to apply the
    changes. Answering 'yes' (or 'y') re-runs both passes for real; 'no'
    (or 'n'), or end of input, leaves the folder untouched. Any other answer
    repeats the question.

    Exits with status 1 if the given path is not an existing directory.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Rename files, prune duplicates, and delete empty directories.')
    parser.add_argument('folder_path', help='the path to the folder to process')
    args = parser.parse_args()

    # Get the folder path from the command line argument
    target_path = args.folder_path

    # Check if the specified directory exists
    if not os.path.isdir(target_path):
        print("Error: The specified directory does not exist.")
        sys.exit(1)

    # First pass: preview only. The rename step is optional and off by default;
    # uncomment it here and in the apply branch below to enable it.
    # rename_files_in_directory(target_path, preview=True)
    delete_duplicate_files(target_path, preview=True)
    delete_empty_directories(target_path, preview=True)

    while True:
        # Prompt for confirmation before making changes
        try:
            confirm_changes = input("Do you want to apply the changes? (yes/no): ").strip().lower()
        except EOFError:
            # No more input (e.g. stdin closed or piped): take the safe option.
            confirm_changes = 'no'
        if confirm_changes in ('yes', 'y'):
            # Second pass: apply for real.
            # rename_files_in_directory(target_path, preview=False)
            delete_duplicate_files(target_path, preview=False)
            delete_empty_directories(target_path, preview=False)
            break
        if confirm_changes in ('no', 'n'):
            print("No changes will be applied.")
            break


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment