Skip to content

Instantly share code, notes, and snippets.

@edison12a
Created July 3, 2021 01:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save edison12a/eb97cd75fc1b84d2a177048aa3007328 to your computer and use it in GitHub Desktop.
Save edison12a/eb97cd75fc1b84d2a177048aa3007328 to your computer and use it in GitHub Desktop.
'''
Recursively walks a directory tree, hashes the JPG and PNG photos it finds
and compares the hashes for similarity.
If two photos are similar enough, the duplicate is deleted from disk.
'''
from PIL import Image
import imagehash
import os
import time
# Wall-clock timestamp taken at startup; the progress messages further down
# report elapsed seconds relative to it.
start = time.time()

# Root of the directory tree that is scanned for photos.
RECURSIVE_ROOT = '/mnt/c/xxx/yyy/zzz'

# One dictionary per file type, each mapping a photo's path to its hash.
jpg_photos = {}
png_photos = {}
def hash_it(photo_path):
    '''
    Return the average hash of the image stored at photo_path.

    Any exception raised while opening or hashing the file is swallowed and
    0 is returned instead, so callers must be prepared for a plain int.
    '''
    try:
        with Image.open(photo_path) as image:
            photo_hash = imagehash.average_hash(image)
    except Exception:
        # Best effort: an unreadable or non-image file yields the sentinel 0.
        photo_hash = 0
    return photo_hash
# Recursively walk RECURSIVE_ROOT and hash every JPG and PNG file found.
# Each path is stored in the dictionary for its file type, with the value
# being whatever hash_it() returns for that path.
for current_dir_path, current_subdirs, current_files in os.walk(RECURSIVE_ROOT):
    for aFile in current_files:
        txt_file_path = os.path.join(current_dir_path, aFile)
        # Compare against the dotted extension, ignoring case, so that
        # 'IMG_1.JPG' and 'scan.jpeg' are picked up while a name that merely
        # ends in the letters 'jpg' or 'png' (no dot) is not.
        lowered_name = aFile.lower()
        if lowered_name.endswith(('.jpg', '.jpeg')):
            jpg_photos[txt_file_path] = hash_it(txt_file_path)
        elif lowered_name.endswith('.png'):
            png_photos[txt_file_path] = hash_it(txt_file_path)
# record time taken
print('\n\n\nHashing out', time.time() - start, '\n\n\n')
# Two photos count as duplicates when fewer than `cutoff` bits differ between
# their hashes.
cutoff = 5
photos_dicts = [png_photos, jpg_photos]
for photo_dic in photos_dicts:
    # hash_it() returns the int 0 for files it could not read. Keep only real
    # hashes: 0 - 0 is below the cutoff, so two unreadable files would
    # otherwise be reported as duplicates and one of them deleted.
    candidates = [
        (fname, photo_hash)
        for fname, photo_hash in photo_dic.items()
        if isinstance(photo_hash, imagehash.ImageHash)
    ]
    # Paths already deleted in this pass. They are never used as the reference
    # image again and never passed to os.remove() a second time.
    removed = set()
    for index, (fname, hash0) in enumerate(candidates):
        if fname in removed:
            continue
        # Only look at later entries so every pair is compared exactly once.
        for remaining_fname, hash1 in candidates[index + 1:]:
            if remaining_fname in removed:
                continue
            if hash0 - hash1 >= cutoff:
                continue
            print('images are similar', fname, remaining_fname)
            try:
                # Delete the duplicate file from disk; the first file of the
                # pair is the one that is kept.
                os.remove(remaining_fname)
            except OSError as err:
                # Report the failure and carry on with the remaining files
                # instead of silently hiding it.
                print('could not remove', remaining_fname, err)
            else:
                removed.add(remaining_fname)
# record total time taken
print('\n\n\nComparing photos', time.time() - start, '\n\n\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment