Skip to content

Instantly share code, notes, and snippets.

@cjdd3b
Last active July 27, 2023 08:16
Show Gist options
  • Save cjdd3b/ca9b0ebef0395a5f7793 to your computer and use it in GitHub Desktop.
Save cjdd3b/ca9b0ebef0395a5f7793 to your computer and use it in GitHub Desktop.
Example of perceptual hashing for near-duplicate image detection
'''
cluster.py
Uses the Hamming distance between perceptual hashes to surface near-duplicate
images.
To install and run:
1. pip install imagehash
2. Put some .dat files (each holding a base64-encoded image) in a folder
someplace (script assumes ./data/imgs/*.dat)
3. python cluster.py
The HAMMING_THRESH global defaults to 0.0, which only matches images whose
hashes are identical. Increase it (up to 1.0) if you also want near, but not
identical, dupes.
More information:
Hamming distance: https://en.wikipedia.org/wiki/Hamming_distance
dhash: http://blog.iconfinder.com/detecting-duplicate-images-using-python/
'''
import base64, glob, cStringIO, imagehash, itertools, time
from PIL import Image
########## GLOBALS ##########
# Glob pattern matching the image files (your ads) to search for dupes
IMAGE_DIR = './data/imgs/*.dat'
# The image you want to test for dupes
TEST_IMAGE = './data/test/noname-espn.go.com-2016-02-09T17.05.13-05.00-965774372.dat'
# Largest normalized Hamming distance (0.0 - 1.0) that still counts as a match;
# 0.0 matches only images whose hashes are identical
HAMMING_THRESH = 0.0
########## DISTANCE METRICS ##########
def hamming(s1, s2):
    '''
    Calculate the normalized Hamming distance between two strings.

    The result is the fraction of positions at which s1 and s2 differ, as a
    float between 0.0 (identical) and 1.0 (different at every position).
    Two empty strings are treated as identical and give 0.0.

    Raises ValueError if the two strings do not have the same length.
    '''
    length = len(s1)
    # Explicit check instead of `assert`: asserts are stripped under
    # `python -O`, after which zip() would silently truncate the longer input.
    if length != len(s2):
        raise ValueError(
            'hamming() requires equal-length inputs (got %d and %d)'
            % (length, len(s2)))
    if length == 0:
        # Nothing to compare; avoid dividing by zero.
        return 0.0
    mismatches = sum(c1 != c2 for c1, c2 in zip(s1, s2))
    return float(mismatches) / length
########## IMAGE PROCESSING ##########
def convert_image(imgpath):
    '''
    Convert image from base64-encoded string to PIL image object.

    Reads the file at imgpath, removes the 'data:image/jpeg;base64,' data-URI
    prefix if present, base64-decodes the remainder and opens the resulting
    bytes with PIL.

    Returns the PIL image object, or None (after printing a message) when the
    contents cannot be decoded as an image. Errors opening or reading the file
    itself are not caught here.
    '''
    # Use a context manager so the file handle is closed as soon as it is read.
    with open(imgpath, 'r') as imgfile:
        imgstring = imgfile.read()
    try:
        raw = base64.b64decode(imgstring.replace('data:image/jpeg;base64,', ''))
        return Image.open(cStringIO.StringIO(raw))
    except (IOError, TypeError, ValueError):
        # IOError: PIL could not identify the image data.
        # TypeError / ValueError: malformed base64 (which one is raised depends
        # on the Python version). Either way, skip this file rather than abort.
        print('Could not decode %s' % imgpath)
        return None
def get_image_hashes(path):
    '''
    Generator that yields (image, hash) pairs for the images matching a glob
    pattern.

    path is a glob pattern (e.g. './data/imgs/*.dat'). Each matching file is
    loaded with convert_image(); files that fail to decode are skipped. For the
    rest, yields a tuple of the PIL image object and its dhash as a string.
    '''
    for filename in glob.iglob(path):
        image = convert_image(filename)
        # convert_image() signals failure with None; test for that explicitly
        # rather than relying on the truthiness of an image object.
        if image is None:
            continue
        yield (image, str(imagehash.dhash(image)))
########## MAIN ###########
if __name__ == '__main__':
    # Load up a test image to look for dupes against
    test_image = convert_image(TEST_IMAGE)
    if test_image is None:
        # Without a reference image there is nothing to compare against; stop
        # with a clear message instead of failing inside imagehash.dhash(None).
        raise SystemExit('Could not load test image %s' % TEST_IMAGE)
    test_hash = str(imagehash.dhash(test_image))

    # Loop over all the ads to look for dupes, based on HAMMING_THRESH
    for im, h in get_image_hashes(IMAGE_DIR):
        # A Hamming distance of 0 means the two hashes are identical, so with
        # HAMMING_THRESH at 0.0 this only shows images whose hash exactly
        # matches the test image (e.g. the other copies of the same ad).
        if hamming(h, test_hash) <= HAMMING_THRESH:
            im.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment