Skip to content

Instantly share code, notes, and snippets.

@daleysoftware
Created January 2, 2018 22:06
Show Gist options
  • Save daleysoftware/5891cecadc068e7b966ba065110278e7 to your computer and use it in GitHub Desktop.
Save daleysoftware/5891cecadc068e7b966ba065110278e7 to your computer and use it in GitHub Desktop.
Delete duplicate photos in a large collection.
import glob
import sys
import os
import hashlib
import collections
def log(message):
    """Emit ``message`` on standard output using ``print``."""
    print(message)
def get_image_paths_under(directory):
    """Recursively collect the paths of all files found below ``directory``.

    No filtering by file type is performed: every entry matched by the ``*``
    glob that ``os.path.isfile`` accepts is returned. Because the listing is
    produced by ``glob``, entries whose names start with a dot are not
    matched by ``*`` and are therefore not returned.

    Args:
        directory: Path of the directory to scan.

    Returns:
        A list of file paths. Files found in subdirectories come first,
        followed by the files located directly in ``directory``.
    """
    log("Getting directory listing for \"%s\"" % directory)

    # Escape the directory part so glob metacharacters in its name
    # ("[", "]", "*", "?") are matched literally; otherwise a folder such as
    # "trip [2017]" would be read as a pattern and its contents missed.
    listing = glob.glob(os.path.join(glob.escape(directory), '*'))
    files = [entry for entry in listing if os.path.isfile(entry)]
    directories = [entry for entry in listing if os.path.isdir(entry)]

    result = []
    for subdirectory in directories:
        result.extend(get_image_paths_under(subdirectory))

    # The listing produced neither files nor subdirectories.
    if not files and not directories:
        log("Warning: empty directory \"%s\"" % directory)

    result.extend(files)
    return result
def _md5_of_file(path, chunk_size=1024 * 1024):
    """Return the hexadecimal MD5 digest of the file at ``path``.

    The file is read in ``chunk_size``-byte pieces so that large files are
    never loaded into memory in one go.
    """
    hasher = hashlib.md5()
    with open(path, 'rb') as afile:
        for chunk in iter(lambda: afile.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()


def remove_duplicates(images_directory):
    """Interactively delete files with identical content under a directory.

    Every file returned by ``get_image_paths_under`` is hashed with MD5 and
    grouped by digest. For each group holding more than one path the user is
    asked whether to delete all but the first path of the group; answering
    ``D`` (case-insensitive, surrounding whitespace ignored) deletes them,
    any other answer skips the group.

    Args:
        images_directory: Root directory to scan for duplicate files.
    """
    hashes = collections.defaultdict(list)
    for image_path in get_image_paths_under(images_directory):
        log("Hashing \"%s\"" % image_path)
        hashes[_md5_of_file(image_path)].append(image_path)

    for files in hashes.values():
        if len(files) < 2:
            # A digest seen only once has no duplicates.
            continue

        log("Duplicates: %s" % files)
        msg = input("Delete all but first or skip? [D/S]: ")
        # Normalise the answer so "d" or "D " is not silently treated as skip.
        if msg.strip().upper() == "D":
            for f in files[1:]:
                log("Removing %s" % f)
                os.unlink(f)
        else:
            log("Skipping.")
if __name__ == '__main__':
    # Exactly one command-line argument is expected: the directory to scan.
    if len(sys.argv) != 2:
        # Interpolate the script name; without the format argument the
        # message printed a literal "%s".
        print("Usage: %s <images_directory>" % sys.argv[0])
        sys.exit(1)
    remove_duplicates(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment