Skip to content

Instantly share code, notes, and snippets.

@ryanholbrook
Last active October 13, 2021 14:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ryanholbrook/959b1e85656d1f01f4aae21e991a674f to your computer and use it in GitHub Desktop.
Save ryanholbrook/959b1e85656d1f01f4aae21e991a674f to your computer and use it in GitHub Desktop.
Detect Duplicate Images
from joblib import delayed, Parallel
from pathlib import Path
from tqdm import tqdm
import hashlib
import imagehash
import pandas as pd
import PIL
import pybktree
from PIL import Image
# Largest number of differing hash bits (Hamming distance) at which two
# images are still considered the same image.
HASH_DIST = 2
def compute_hashes(path):
    """Compute an exact and two perceptual hashes for one image file.

    Args:
        path: Location of the image. Must expose ``.stem`` (e.g. a
            ``pathlib.Path``); the stem is used as the record's ``PetID``.

    Returns:
        A dict with the keys ``'PetID'``, ``'hash'`` (MD5 hex digest of the
        decoded pixel bytes), ``'dhash'`` and ``'phash'`` (the imagehash
        difference / perceptual hashes, converted from hex to ``int``), or
        ``None`` when PIL cannot identify the file as an image. Callers
        must skip ``None`` results.
    """
    try:
        img = Image.open(path)
    except PIL.UnidentifiedImageError:
        # Best effort: report the unreadable file and skip it. Without the
        # explicit return, execution would fall through to code that uses
        # the never-assigned ``img`` and die with UnboundLocalError.
        print(f"Can't open {path.stem}.")
        return None

    # Use the image as a context manager so its file handle is released as
    # soon as the hashes are computed instead of lingering until GC.
    with img:
        return {
            'PetID': path.stem,
            'hash': hashlib.md5(img.tobytes()).hexdigest(),
            'dhash': int(str(imagehash.dhash(img)), 16),
            'phash': int(str(imagehash.phash(img)), 16),
        }


def make_hashes(df_image):
    """Hash every image listed in ``df_image`` in parallel.

    Args:
        df_image: DataFrame with a ``'Path'`` column holding one image path
            per row (each path is handed to ``compute_hashes``).

    Returns:
        DataFrame indexed by ``PetID`` and sorted by that index, with the
        columns ``'hash'``, ``'dhash'`` and ``'phash'``. Images that could
        not be opened are omitted; the frame is empty (but still has these
        columns) when no image could be hashed.
    """
    photo_paths = df_image.loc[:, 'Path']
    records = Parallel(n_jobs=-1)(
        delayed(compute_hashes)(path) for path in tqdm(photo_paths)
    )
    # compute_hashes returns None for files PIL could not identify.
    records = [record for record in records if record is not None]
    # Naming the columns keeps set_index('PetID') working even when there
    # are no records at all.
    df_stats = pd.DataFrame(
        records, columns=['PetID', 'hash', 'dhash', 'phash']
    )
    return df_stats.set_index('PetID').sort_index()
# df_image contains image info / filepaths; make_hashes reads its 'Path'
# column.
df_image = make_image_df()

# One row of hashes per image, indexed by PetID.
df_stats = make_hashes(df_image)

# Attach the hash columns to the image table, matching df_stats' index
# against df_image's index and keeping every df_image row.
df_image = df_image.join(df_stats, how='left')
def dist(a, b):
    """Return the Hamming distance between the ``phash`` fields of two records.

    Both arguments must expose an integer ``phash`` attribute.
    """
    left_hash, right_hash = a.phash, b.phash
    return pybktree.hamming_distance(left_hash, right_hash)
# Materialise the (Index, phash) records once; the same records are used
# both to build the tree and to query it.
phash_items = list(df_stats[['phash']].itertuples())

# BK-tree over the perceptual hashes, using Hamming distance as the metric.
bktree_phash = pybktree.BKTree(dist, phash_items)

# For every image, collect all images whose phash differs by at most
# HASH_DIST bits. find() returns (distance, item) pairs ordered by distance,
# so each result starts with the query image itself at distance 0.
similarities = pd.Series(
    [bktree_phash.find(item, n=HASH_DIST) for item in phash_items]
)
similarities = similarities.apply(
    lambda matches: tuple(match[1].Index for match in matches)
)

# A group with more than one member holds at least one other near-identical
# image. Each group is found once per member and in a query-dependent order
# ((A, B) from A, (B, A) from B), so sort the members before de-duplicating;
# otherwise the same group would appear in the set several times.
duplicates = {
    tuple(sorted(group)) for group in similarities if len(group) > 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment