Last active
October 13, 2021 14:44
-
-
Save ryanholbrook/959b1e85656d1f01f4aae21e991a674f to your computer and use it in GitHub Desktop.
Detect Duplicate Images
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from joblib import delayed, Parallel | |
from pathlib import Path | |
from tqdm import tqdm | |
import hashlib | |
import imagehash | |
import pandas as pd | |
import PIL | |
import pybktree | |
from PIL import Image | |
# Maximum Hamming distance (in differing bits) between two perceptual hashes
# for the images to be treated as the same; used as the search radius `n`
# when querying the BK-tree.
HASH_DIST = 2
def compute_hashes(path):
    """Compute one exact and two perceptual hashes for a single image file.

    Args:
        path: ``pathlib.Path``-like location of the image. Its ``stem`` is
            used as the record's ``'PetID'``.

    Returns:
        A dict with the keys ``'PetID'``, ``'hash'`` (MD5 hex digest of the
        decoded pixel bytes), ``'dhash'`` and ``'phash'`` (the imagehash
        difference/perceptual hashes converted from hex to ``int``), or
        ``None`` when Pillow cannot identify the file as an image.
    """
    try:
        img = Image.open(path)
    except PIL.UnidentifiedImageError:
        print(f"Can't open {path.stem}.")
        # Previously execution fell through to the return below and crashed
        # on the unbound `img`; signal the failure explicitly instead.
        return None
    # Using the image as a context manager closes the underlying file once
    # all hashes have been computed.
    with img:
        return {
            'PetID': path.stem,
            'hash': hashlib.md5(img.tobytes()).hexdigest(),
            'dhash': int(str(imagehash.dhash(img)), 16),
            'phash': int(str(imagehash.phash(img)), 16),
        }


def make_hashes(df_image):
    """Hash every image listed in ``df_image`` in parallel.

    Args:
        df_image: DataFrame with a ``'Path'`` column holding the image paths
            handed to ``compute_hashes``.

    Returns:
        A DataFrame indexed by ``'PetID'`` (sorted) with the columns
        ``'hash'``, ``'dhash'`` and ``'phash'``. Images that could not be
        opened are omitted.
    """
    photo_paths = df_image.loc[:, 'Path']
    records = Parallel(n_jobs=-1)(
        delayed(compute_hashes)(path) for path in tqdm(photo_paths)
    )
    # Drop the placeholders returned for unreadable images.
    records = [record for record in records if record is not None]
    # Explicit columns keep set_index('PetID') working even when no image
    # could be hashed and `records` is empty.
    df_stats = pd.DataFrame(
        records, columns=['PetID', 'hash', 'dhash', 'phash']
    )
    return df_stats.set_index('PetID').sort_index()
# Load the image table (image info / filepaths). make_image_df() is defined
# outside this snippet.
df_image = make_image_df()
# Hash every listed image, then attach the hash columns to the image table
# by index. NOTE(review): presumably df_image is indexed by PetID so the join
# lines up with df_stats -- confirm against make_image_df().
df_stats = make_hashes(df_image)
df_image = df_image.join(df_stats, how='left')
def dist(a, b):
    """Return the Hamming distance between the ``phash`` values of two records.

    Both arguments must expose a ``phash`` attribute (here: the namedtuples
    produced by ``DataFrame.itertuples()``).
    """
    left_hash, right_hash = a.phash, b.phash
    return pybktree.hamming_distance(left_hash, right_hash)
# Index every image's perceptual hash in a BK-tree so that near-identical
# hashes can be looked up without comparing all pairs.
phash_rows = list(df_stats[['phash']].itertuples())
bktree_phash = pybktree.BKTree(dist, phash_rows)

# For each image, collect the PetIDs of all images whose phash lies within
# HASH_DIST bits of its own (the image itself is always among them).
# BKTree.find returns (distance, item) pairs; only the item's index is kept.
similarities = pd.Series(
    [bktree_phash.find(row, n=HASH_DIST) for row in phash_rows]
)
similarities = similarities.apply(
    lambda matches: tuple(item.Index for _, item in matches)
)

# Groups with more than one member are duplicates. Matches come back ordered
# by distance from the query image, so the same group would otherwise appear
# once per member in a different order; sorting canonicalises each group so
# the set holds it exactly once.
duplicates = {
    tuple(sorted(group)) for group in similarities if len(group) > 1
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment