@Jeremiah-England
Last active January 10, 2024 16:07
Delete large numbers of duplicate files from Duplicate Files Finder output
"""
A script for deleting large numbers of duplicates from the output .txt file of
the open source Duplciate Files Finder application (https://sourceforge.net/projects/doubles/).
Given a list of duplicate files, the one with the shortest path (by character count)
is kept and all the rest are deleted. If several have the same length and there are
none shorter, then the least "alphabetically" is kept.
I used this to reduce a heavily duplicated picture archive from 121Gb to 57Gb. There
wasn't really a best way to decide which to delete so the "least path" logic above was
fine.
"""
import os

dff_output_path = "./duplicate-files-list.txt"

# Read the whole Duplicate Files Finder report into memory.
with open(dff_output_path, "r") as dup_file:
    dups_str = dup_file.read()

# Each duplicate set starts with a "- " header line; the paths follow it.
dup_set_strs = dups_str.split("\n- ")
dup_sets = []
for dss in dup_set_strs:
    # Drop the header line and skip blank lines (e.g. a trailing newline).
    paths = [line for line in dss.split("\n")[1:] if line.strip()]
    if paths:
        dup_sets.append(paths)
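# At this point dup_sets is a list of duplicate sets, each a list of quoted
# path strings, e.g. (illustrative, quoting inferred from the strip('"') below):
#
#   [['"/pics/2019/beach.jpg"', '"/pics/old/backup/beach.jpg"'], ...]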
for ds in dup_sets:
    # Paths are quoted in the report; strip whitespace and the quotes.
    ds = [s.strip().strip('"') for s in ds]
    # Keep the shortest path; break length ties alphabetically.
    min_len = len(min(ds, key=len))
    keep = min(s for s in ds if len(s) == min_len)
    print(keep)
    for file_path in ds:
        if file_path != keep:
            os.remove(file_path)
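# Note: the two-step selection above (shortest length, then alphabetically
# least among ties) is equivalent to a single min() with a tuple key:
#
#     keep = min(ds, key=lambda s: (len(s), s))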