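This Python script finds similarly named files in a specified directory and removes the near-duplicates. Files are grouped by the Levenshtein distance between their base names (the file name without its extension): a file joins a group when its base name is within 5 edits of the base name of that group's first file. Only the first file in each group is kept; the rest are deleted. Subdirectories are skipped, so only regular files are processed.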
- Python 3.x
- `colorama` library for colored console output
Install the `colorama` library using pip:
`pip install colorama`
import os
from collections import defaultdict
import colorama
from colorama import Fore, Style
def levenshtein(s1: str, s2: str) -> int:
    """Return the Levenshtein (edit) distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn ``s1`` into ``s2``.

    Only two rows of the edit matrix are kept at any time (the previous one
    and the one being filled in), and the rows are sized by the shorter
    string.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        The edit distance; ``0`` when the strings are equal.
    """
    # The distance is symmetric, so make s2 the shorter string: the rows
    # below have len(s2) + 1 entries.
    if len(s1) < len(s2):
        s1, s2 = s2, s1
    if not s2:
        # Turning s1 into the empty string takes one deletion per character.
        return len(s1)

    # Row 0: distance from the empty prefix of s1 to each prefix of s2.
    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1, start=1):
        # First cell: distance from s1[:i] to the empty prefix of s2.
        current_row = [i]
        for j, c2 in enumerate(s2):
            insertion = previous_row[j + 1] + 1
            deletion = current_row[j] + 1
            # (c1 != c2) is 0 for a match and 1 for a substitution.
            substitution = previous_row[j] + (c1 != c2)
            current_row.append(min(insertion, deletion, substitution))
        previous_row = current_row
    return previous_row[-1]
def _group_by_similar_name(file_names, threshold):
    """Bucket file names whose base names are within ``threshold`` edits.

    Each group is keyed by the base name (extension stripped) of its first
    member. A later file joins the first existing group whose key is within
    ``threshold`` Levenshtein edits of its own base name; otherwise it starts
    a new group. Groups and their members keep the order of ``file_names``.
    """
    groups = {}
    for file_name in file_names:
        base_name = os.path.splitext(file_name)[0]
        for key in groups:
            # The edit distance is never smaller than the length difference,
            # so skip the full computation when that alone exceeds the limit.
            if abs(len(base_name) - len(key)) > threshold:
                continue
            if levenshtein(base_name, key) <= threshold:
                groups[key].append(file_name)
                break
        else:
            groups.setdefault(base_name, []).append(file_name)
    return groups


def main(folder_path: str = 'D:\\RetroBat\\roms\\mame', threshold: int = 5,
         dry_run: bool = False) -> None:
    """Delete similarly named files in ``folder_path``, keeping one per group.

    Regular files directly inside ``folder_path`` are grouped by the
    Levenshtein distance between their base names; in every group with more
    than one member the first file is kept and the others are deleted.

    NOTE: ``threshold`` is an absolute edit count. Any two names that are both
    at most ``threshold`` characters long are always within ``threshold``
    edits of each other, so short, unrelated names will be grouped and
    deleted. Use ``dry_run=True`` to inspect the groups first.

    Args:
        folder_path: Directory to scan (subdirectories are not entered).
        threshold: Maximum edit distance for two base names to be treated as
            duplicates of each other.
        dry_run: When true, report what would be deleted without removing
            anything.

    Raises:
        OSError: If ``folder_path`` cannot be listed (for example, it does
            not exist). Failures to delete an individual file are reported
            and do not stop the run.
    """
    colorama.init()
    try:
        # os.listdir returns entries in arbitrary order; sort so that the
        # file kept from each group is the same on every run.
        files = sorted(
            entry for entry in os.listdir(folder_path)
            if os.path.isfile(os.path.join(folder_path, entry))
        )

        print(Fore.CYAN + 'Analyzing files for grouping...' + Style.RESET_ALL)
        grouped_files = _group_by_similar_name(files, threshold)

        print(Fore.GREEN + 'Processing groups and removing duplicates...' + Style.RESET_ALL)
        for group, members in grouped_files.items():
            if len(members) < 2:
                continue
            print(Fore.YELLOW + f"Group based on '{group}': {members}" + Style.RESET_ALL)
            keeper, *duplicates = members
            print(Fore.GREEN + f"Keeping: {keeper}" + Style.RESET_ALL)
            for file_to_delete in duplicates:
                file_path = os.path.join(folder_path, file_to_delete)
                if dry_run:
                    print(Fore.RED + f"Would delete: {file_to_delete}" + Style.RESET_ALL)
                    continue
                print(Fore.RED + f"Deleting: {file_to_delete}" + Style.RESET_ALL)
                try:
                    os.remove(file_path)
                except OSError as exc:
                    # One locked or vanished file should not abort the rest.
                    print(Fore.RED + f"Could not delete {file_to_delete}: {exc}" + Style.RESET_ALL)

        print(Fore.GREEN + 'Duplication removal process completed.' + Style.RESET_ALL)
    finally:
        # Restore the original stdout/stderr even if listing or grouping failed.
        colorama.deinit()
if __name__ == "__main__":
    # Run the clean-up only when executed as a script, not when imported.
    main()
- Save the script as `manage_files.py`.
- Open a terminal or command prompt.
- Navigate to the directory where the script is saved.
- Run the script with Python: `python manage_files.py`
Make sure to have backups of your files before running the script, especially if running it on important data. Test the script on a sample directory to ensure it behaves as expected.