Created
July 3, 2021 01:07
-
-
Save edison12a/eb97cd75fc1b84d2a177048aa3007328 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Iterates over the files in a directory tree, finds photos and compares them for similarity.
If two photos are similar, the duplicate is deleted from disk.
'''
from PIL import Image | |
import imagehash | |
import os | |
import time | |
# Root folder of the directory tree that is scanned for photos.
RECURSIVE_ROOT = '/mnt/c/xxx/yyy/zzz'

# One dictionary per image format, filled with {file path: hash} entries.
png_photos, jpg_photos = {}, {}

# Reference timestamp for the elapsed-time reports printed by the script.
start = time.time()
def hash_it(photo_path):
    """Return the average hash of the image stored at ``photo_path``.

    Args:
        photo_path: Path of the image file to open and hash.

    Returns:
        The value produced by ``imagehash.average_hash`` for the image, or
        ``None`` when the file cannot be opened or hashed.

    ``None`` is used instead of a numeric placeholder such as ``0`` on
    purpose: two numeric placeholders subtract to ``0`` and would therefore
    look like a perfect match to code that compares hashes by subtraction.
    """
    try:
        with Image.open(photo_path) as img:
            return imagehash.average_hash(img)
    except Exception:
        # Best effort: a single unreadable or corrupt file must not abort the
        # whole scan, so the failure is reported to the caller as None.
        return None
# Recursively walk RECURSIVE_ROOT and store the hash of every photo, keyed by
# the file's full path, in the dictionary that matches its format.
for current_dir_path, _current_subdirs, current_files in os.walk(RECURSIVE_ROOT):
    for file_name in current_files:
        file_path = os.path.join(current_dir_path, file_name)
        # Look at the real extension, case-insensitively, so that "IMG.JPG"
        # and "scan.jpeg" are picked up while a name that merely ends in the
        # letters "jpg" is not.
        extension = os.path.splitext(file_name)[1].lower()
        if extension in ('.jpg', '.jpeg'):
            jpg_photos[file_path] = hash_it(file_path)
        elif extension == '.png':
            png_photos[file_path] = hash_it(file_path)

# record time taken
print('\n\n\nHashing out', time.time()-start, '\n\n\n')
# Two photos are treated as duplicates when their hashes differ by fewer than
# this many bits.
cutoff = 5
photos_dicts = [png_photos, jpg_photos]

# Photos are only compared with photos of the same format.
for photo_dic in photos_dicts:
    # Iterate over a snapshot of the file names: entries are removed from the
    # dictionary while the loop runs.
    for fname in list(photo_dic):
        # A file deleted as a duplicate earlier on has been dropped from the
        # dictionary; it must not serve as the reference for more deletions,
        # otherwise photos could be removed without any surviving counterpart.
        if fname not in photo_dic:
            continue

        # Take the reference photo out of the dictionary so it is never
        # compared with itself and every pair is examined only once.
        hash0 = photo_dic.pop(fname)

        # Files that could not be hashed carry a placeholder instead of a real
        # hash; they can be neither a reference nor a duplicate.
        if not isinstance(hash0, imagehash.ImageHash):
            continue

        # Copy the items because duplicates are dropped inside the loop.
        for remaining_fname, hash1 in list(photo_dic.items()):
            if not isinstance(hash1, imagehash.ImageHash):
                continue

            # Subtracting two hashes gives the number of differing bits.
            if hash0 - hash1 >= cutoff:
                continue

            print('images are similar', fname, remaining_fname)
            try:
                os.remove(remaining_fname)
            except OSError as err:
                # Keep going, but do not hide the failure; the file is still
                # on disk, so it stays in the dictionary.
                print('could not delete', remaining_fname, err)
                continue

            # The duplicate is gone from disk: forget it so that it is not
            # used as a reference or deleted a second time.
            del photo_dic[remaining_fname]

# record total time taken
print('\n\n\nComparing photos', time.time()-start, '\n\n\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment