Skip to content

Instantly share code, notes, and snippets.

@Roxxers
Last active December 28, 2019 21:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Roxxers/0728b4959d2973c01323d804e26a6699 to your computer and use it in GitHub Desktop.
Save Roxxers/0728b4959d2973c01323d804e26a6699 to your computer and use it in GitHub Desktop.
Simple file to delete duplicates from ssdeep CSV output run on a Google Photos export
import re
import os
import sys
import csv
import datetime
# Message printed for each removed file: (removed file name, kept file name).
DELETED_TEXT = "deleted {}: dupe of {}"
# Matches a parenthesised copy counter such as "(1)" or "(12)" in a file name.
# Written as a raw string so "\(" is not an invalid string escape, and the
# quantifier applies to the digits (not the closing parenthesis) so that a
# name like "trip (2019 summer).jpg" is not mistaken for a numbered copy.
dupe_brackets = re.compile(r"\([0-9]+\)")
# Matches a "~" followed by zero or more digits, so a bare "~" also matches.
dupe_tilda = re.compile(r"\~[0-9]*")
def googledate_to_datetime(time: str) -> datetime.datetime:
    """Parse the leading ``YYYY-MM-DD`` date out of a date-like name.

    Anything after the first space is dropped, and only the first three
    dash-separated fields are kept, before parsing.

    Args:
        time: Text beginning with a date, e.g. ``"2019-06-19"``,
            ``"2019-06-19 #2"`` or ``"2019-06-19-20"``.

    Returns:
        A naive ``datetime`` at midnight of the parsed date.

    Raises:
        ValueError: If the remaining text is not in ``%Y-%m-%d`` form.
    """
    # Get rid of any patterns like 2019-06-19 #2
    date_part = time.split(" ")[0]
    # Get rid of any patterns like 2019-06-19-20
    date_part = "-".join(date_part.split("-")[0:3])
    return datetime.datetime.strptime(date_part, "%Y-%m-%d")
def regex_check(filename: str) -> bool:
    """Report whether *filename* carries a duplicate marker.

    The name is searched with the module-level ``dupe_brackets`` and
    ``dupe_tilda`` patterns; a hit on either one counts.

    Args:
        filename: File name to inspect.

    Returns:
        True if either pattern is found anywhere in the name, else False.
    """
    return bool(dupe_brackets.search(filename) or dupe_tilda.search(filename))
def delete(file):
    """Remove *file* from disk, treating a missing file as already deleted.

    Args:
        file: Path of the file to remove.

    Only ``FileNotFoundError`` is suppressed; any other error raised by
    ``os.remove`` (for example a permission error) propagates to the caller.
    """
    try:
        os.remove(file)
    except FileNotFoundError:
        # Nothing left to remove; deliberately ignored.
        pass
# Script body: read the CSV named by the first command-line argument. The
# first two columns of each row are taken as the paths of a matched pair, and
# exactly one file of every pair is removed.
with open(sys.argv[1], newline='') as fp:
    files_deleted = 0
    reader = csv.reader(fp)
    for row in reader:
        # Blank or truncated rows hold no pair to compare; skip them instead
        # of failing with IndexError.
        if len(row) < 2:
            continue
        orig_path, dupe_path = row[0], row[1]
        orig_filename = orig_path.split("/")[-1]
        dupe_filename = dupe_path.split("/")[-1]
        if regex_check(dupe_filename):
            # Second file has a duplicate marker in its name: remove it.
            remove_dupe = True
        elif regex_check(orig_filename):
            # Only the first file has a duplicate marker: remove that one.
            remove_dupe = False
        else:
            # Neither name is marked, so compare the dates parsed from the
            # parent folder names. The file in the earlier-dated folder is
            # kept; on a tie the first column's file is kept.
            orig_folder_date = googledate_to_datetime(orig_path.split("/")[-2])
            dupe_folder_date = googledate_to_datetime(dupe_path.split("/")[-2])
            remove_dupe = not (dupe_folder_date < orig_folder_date)
        if remove_dupe:
            delete(dupe_path)
            print(DELETED_TEXT.format(dupe_filename, orig_filename))
        else:
            delete(orig_path)
            print(DELETED_TEXT.format(orig_filename, dupe_filename))
        # One file is removed for every processed pair.
        files_deleted += 1
    print("Deleted {} dupe files".format(files_deleted))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment