Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
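# Iterates over the files in a directory tree, finds photos, and compares them for similarity.
# If two photos are similar, the pair is reported as likely duplicates (the code below only prints matches; it does not delete any file).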
import itertools
import os
import time

import imagehash
from PIL import Image
# Wall-clock reference point; the timing reports printed later in the script
# are measured relative to this moment.
start = time.time()

# Top of the directory tree that is walked in search of photos.
RECURSIVE_ROOT = '/mnt/c/xxx/yyy/zzz'

# One table per image format, filled in by the directory walk with
# file path -> hash entries.
png_photos, jpg_photos = {}, {}
def hash_it(photo_path):
    """Return the average hash of the image stored at ``photo_path``.

    Args:
        photo_path: Path of the image file to open and hash.

    Returns:
        The result of ``imagehash.average_hash`` for the opened image, or the
        integer ``0`` when the file could not be opened or hashed.
    """
    try:
        # The context manager closes the image file as soon as hashing is done.
        with Image.open(photo_path) as img:
            return imagehash.average_hash(img)
    except Exception:
        # Deliberately best effort: callers pass in every file selected by its
        # name alone, so a file that is not a readable image must not abort
        # the scan. Failure is signalled with the sentinel 0.
        return 0
# Recursively walk every folder below RECURSIVE_ROOT and hash each photo found.
for current_dir_path, current_subdirs, current_files in os.walk(RECURSIVE_ROOT):
    for aFile in current_files:
        txt_file_path = os.path.join(current_dir_path, aFile)
        # Match on the lower-cased name with the dot included, so that
        # "IMG_1.JPG" and "a.jpeg" are picked up while a name that merely
        # ends in the letters "jpg" or "png" is not.
        lowered_name = aFile.lower()
        # Store the file path in the dictionary for its format, with the value
        # being whatever hash_it returned for that file.
        if lowered_name.endswith(('.jpg', '.jpeg')):
            jpg_photos[txt_file_path] = hash_it(txt_file_path)
        elif lowered_name.endswith('.png'):
            png_photos[txt_file_path] = hash_it(txt_file_path)

# Report the time elapsed since `start` once hashing has finished.
print('\n\n\nHashing out', time.time()-start, '\n\n\n')
# Threshold on the number of differing hash bits: two photos are reported as
# similar when their hashes differ by strictly fewer than this many bits.
cutoff = 5

photos_dicts = [png_photos, jpg_photos]
for photo_dic in photos_dicts:
    # Keep only entries that hold a real hash. hash_it stores the integer 0
    # for files it could not hash; mixing that with an ImageHash in a
    # subtraction raises, and 0 - 0 would report two unreadable files as
    # similar to each other.
    hashed_photos = [
        (fname, photo_hash)
        for fname, photo_hash in photo_dic.items()
        if isinstance(photo_hash, imagehash.ImageHash)
    ]
    # Visit every unordered pair exactly once, in dictionary order, without
    # having to delete keys from the dictionary while looping over it.
    for (fname, hash0), (remaining_fname, hash1) in itertools.combinations(
        hashed_photos, 2
    ):
        # Subtracting two image hashes yields the number of differing bits.
        if hash0 - hash1 < cutoff:
            print('images are similar', fname, remaining_fname)

# Report the total time elapsed since `start`, hashing included.
print('\n\n\nComparing photos', time.time()-start, '\n\n\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment