Skip to content

Instantly share code, notes, and snippets.

@ezirmusitua
Last active November 30, 2022 21:41
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ezirmusitua/1aa47567ad4ebd5679f9e3df09585e17 to your computer and use it in GitHub Desktop.
Save ezirmusitua/1aa47567ad4ebd5679f9e3df09585e17 to your computer and use it in GitHub Desktop.
[Group similar images] Group similar images using phash algorithm #python #image #algorithm #tools
"""
This script groups similar images from a source folder
and saves each group to the same output folder, based on the perceptual hash algorithm.
Works on Python 3.6 and win10-build-17074
Prerequisite: Pillow
Author: jferroal@gmail.com
"""
import hashlib
import mimetypes
import os
import shutil
from PIL import Image
# Default width (in pixels) images are resized to before hashing.
Default_Resize_Width = 80
# Default height (in pixels) images are resized to before hashing.
Default_Resize_Height = 80
# Two images belong to the same group when the number of differing bits
# between their hashes is strictly below this threshold.
Same_Image_Value = 600
def get_images_name(dir):
    """
    Get the filename and file path of every image in the given directory.

    Only regular files located directly inside ``dir`` whose MIME type, as
    guessed from the file name, starts with ``image/`` are returned.
    Sub-directories and files whose type cannot be guessed are skipped.

    :param dir: input directory to get images
    :return: filenames
    :rtype: a list of tuple(filename, file_path)
    """
    filenames = []
    for name in os.listdir(dir):
        mime_type, _encoding = mimetypes.guess_type(name)
        # guess_type() yields None for unknown extensions; such files must be
        # rejected too, otherwise they reach Image.open() and crash the run.
        if mime_type is None or not mime_type.startswith('image/'):
            continue
        file_path = os.path.join(dir, name)
        if os.path.isfile(file_path):
            filenames.append((name, file_path))
    return filenames
def get_gray_scale_image_data(path, resize_width=Default_Resize_Width,
                              resize_heith=Default_Resize_Height):
    """
    Get gray scale image data in given path

    The image is opened, resized to ``resize_width`` x ``resize_heith`` and
    converted to mode ``'L'`` before its pixel data is returned.

    :param path: The image source path
    :param resize_width: After read image resize width
    :param resize_heith: After read image resize height (the parameter name
        is kept as-is so existing keyword callers keep working)
    :return: image data
    :rtype: http://pillow.readthedocs.io/en/latest/reference/Image.html#PIL.Image.Image.getdata
    """
    # Close the source image as soon as the resized copy exists so the
    # underlying file handle is not left open while many images are hashed.
    with Image.open(path) as im:
        smaller_image = im.resize((resize_width, resize_heith))
    grayscale_image = smaller_image.convert('L')
    return grayscale_image.getdata()
def hash_image(path, resize_width=Default_Resize_Width,
               resize_heith=Default_Resize_Height):
    """
    Hash given image in path

    The gray scale pixels are read as one flat sequence. For every pixel
    that is not the first of its row, one bit is produced: ``1`` when the
    pixel to its left is brighter, ``0`` otherwise. The bits are read as a
    binary number, most significant bit first.

    :param path: The image source path
    :param resize_width: After read image resize width
    :param resize_heith: After read image resize height (the parameter name
        is kept as-is so existing keyword callers keep working)
    :return: integer hash value; 0 when no comparison could be made
        (for example when ``resize_width`` is 1)
    """
    pixels = list(get_gray_scale_image_data(path, resize_width, resize_heith))
    bits = [
        '1' if pixels[index - 1] > pixels[index] else '0'
        for index in range(1, len(pixels))
        # index % resize_width == 0 is the first pixel of a row: its left
        # neighbour belongs to the previous row, so it is not compared.
        if index % resize_width
    ]
    # int('', 2) raises ValueError, so an empty bit list maps to 0 instead.
    return int(''.join(bits), 2) if bits else 0
def difference_hash(dhash1, dhash2):
    """
    Count the bits in which two image hashes differ.

    :param: dhash1: image hash value 1
    :param: dhash2: image hash value 2
    :return: number of '1' digits in the binary form of dhash1 XOR dhash2
    """
    differing_bits = format(dhash1 ^ dhash2, 'b')
    return sum(1 for digit in differing_bits if digit == '1')
class ImageToGroup(object):
    """
    A single image taking part in the grouping.

    Each instance stores the image hash, where the image comes from, and a
    reference to the image chosen as the root of its group (``None`` while
    the image has no root, i.e. it is its own group root).
    """

    def __init__(self, filename, path):
        """
        :param filename: image filename, reused for the copied file
        :param path: path of the original image
        """
        # image hash value
        self.hash = hash_image(path)
        # image original path
        self.path = path
        # image filename
        self.filename = filename
        # directory name (for saving grouped image): md5 of the hash value
        self.hash_path = hashlib.md5(str(self.hash).encode()).hexdigest()
        # parent image ref
        self.root = None

    def is_same_group(self, i2):
        """Return True when the hash distance to ``i2`` is below the threshold."""
        return difference_hash(self.hash, i2.hash) < Same_Image_Value

    @property
    def root_rpath(self):
        """Directory name of the group: the root's hash_path, or our own."""
        if self.root is None:
            return self.hash_path
        return self.root.hash_path

    def create_root_dir(self, base):
        """Create the group directory below ``base`` if it does not exist."""
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(os.path.join(base, self.root_rpath), exist_ok=True)

    def copy_to_root_dir(self, base):
        """Copy the image from its source path into the group directory."""
        target = os.path.join(base, self.root_rpath, self.filename)
        print(target)
        shutil.copyfile(self.path, target)

    def copy(self, base):
        """Create the group directory below ``base`` and copy the image there."""
        self.create_root_dir(base)
        self.copy_to_root_dir(base)
class Groupter(object):
    """
    Do the group work

    Holds the list of images and assigns each one a root image so that
    similar images share the same root.
    """

    def __init__(self, images_to_group):
        """
        :param images_to_group: list of images; each needs ``root``,
            ``path``, ``filename``, ``is_same_group()`` and ``copy()``
        """
        self.images_to_group = images_to_group

    def group(self):
        """
        Assign a root to every image that is similar to an earlier image.

        Only images without a root act as roots. An image that matches
        several roots ends up attached to the last matching one.

        :return: self, so calls can be chained
        """
        images = self.images_to_group
        for index, candidate in enumerate(images):
            # An image that already has a root never becomes a root itself,
            # so there is nothing to compare it against.
            if candidate.root:
                continue
            for other in images[index + 1:]:
                if candidate.is_same_group(other):
                    other.root = candidate
        return self

    def dump_group(self, path):
        """Copy every image into the directory of its group below ``path``."""
        for image in self.images_to_group:
            image.copy(path)

    def remove(self, path):
        """
        Copy only the images that have no root (one per group) into ``path``.

        The output directory is created when missing, like dump_group does
        for its group directories.
        """
        os.makedirs(path, exist_ok=True)
        for image in self.images_to_group:
            if not image.root:
                shutil.copyfile(image.path, os.path.join(path, image.filename))
def main(indir, odir, action='group'):
    """
    Main entrypoint: group the images of ``indir`` and write results to ``odir``.

    :param indir: directory holding the images to group
    :param odir: directory receiving the output
    :param action: 'group' dumps every image into its group directory;
        any other value copies only the images that have no root
    """
    # collect the images and wrap each one for grouping
    images = [ImageToGroup(name, image_path)
              for name, image_path in get_images_name(indir)]
    worker = Groupter(images)
    # compute the groups, then write the output for the requested action
    worker.group()
    if action != 'group':
        worker.remove(odir)
        return
    worker.dump_group(odir)
if __name__ == '__main__':
    import sys
    # Command line: <images_dir> <output_dir> [action]
    if len(sys.argv) < 3:
        sys.exit('usage: python %s <images_dir> <output_dir> [action]' % sys.argv[0])
    images_dir = sys.argv[1]
    output_dir = sys.argv[2]
    # The action is optional; reading sys.argv[3] unconditionally raised
    # IndexError when only the two directories were given.
    action = sys.argv[3] if len(sys.argv) > 3 else 'group'
    main(images_dir, output_dir, action)
@ezirmusitua
Copy link
Author

How to use:

python group-same-image.py "/images/dir" "/grouped/to/save"  

@Saafan
Copy link

Saafan commented Nov 30, 2022

How to use:

python group-same-image.py "/images/dir" "/grouped/to/save"  

python group-same-image.py "/images/dir" "/grouped/to/save" group

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment