Skip to content

Instantly share code, notes, and snippets.

@NiklasRosenstein
Created December 15, 2015 02:26
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save NiklasRosenstein/d85a78ab45d79a431c93 to your computer and use it in GitHub Desktop.
Save NiklasRosenstein/d85a78ab45d79a431c93 to your computer and use it in GitHub Desktop.
# This script creates an image hash database (--update-db) and can then
# be used to copy image files (--copy-from) into a directory, skipping
# every image whose hash is already recorded in that database.
from __future__ import print_function
from itertools import islice, izip
from six.moves import range
import argparse
import cPickle
import errno
import hashlib
import os
import PIL.Image
import shutil
def dhash(image, hash_size=8):
    """Compute a difference hash ("dHash") from a PIL *image*.

    The image is converted to grayscale and shrunk to
    ``(hash_size + 1) x hash_size`` pixels. Every pixel is then compared
    with its right-hand neighbour, which yields ``hash_size ** 2`` bits.
    The bits are packed into bytes, least significant bit first, and
    returned as a lowercase hexadecimal string (two digits per byte).

    Trailing bits that do not fill a whole byte are dropped, so a
    *hash_size* whose square is not a multiple of 8 produces a hash that
    ignores the last few comparisons.

    Thanks to
    http://blog.iconfinder.com/detecting-duplicate-images-using-python/
    """
    # ``ANTIALIAS`` was removed in Pillow 10; ``LANCZOS`` is the same filter
    # under its current name. Fall back to the old name for old releases.
    resample = getattr(PIL.Image, 'LANCZOS', None)
    if resample is None:
        resample = PIL.Image.ANTIALIAS

    # Grayscale and shrink the image in one step.
    image = image.convert('L').resize((hash_size + 1, hash_size), resample)

    # Compare adjacent pixels, row by row. ``range`` is the six.moves
    # version imported at the top of the file, so this works on 2 and 3.
    difference = []
    for row in range(hash_size):
        for col in range(hash_size):
            pixel_left = image.getpixel((col, row))
            pixel_right = image.getpixel((col + 1, row))
            difference.append(pixel_left > pixel_right)

    # Pack every complete group of 8 bits into one byte (LSB first) and
    # render it as two hex digits.
    hex_string = []
    for start in range(0, len(difference) - 7, 8):
        byte = 0
        for bit, value in enumerate(difference[start:start + 8]):
            if value:
                byte |= 1 << bit
        hex_string.append('{:02x}'.format(byte))
    return ''.join(hex_string)
def dhash_file(filename, hash_size=8):
    """Open the image file *filename* and return its :func:`dhash`.

    *hash_size* is forwarded to :func:`dhash` (it used to be silently
    ignored, so every hash was computed with the default size).

    Any exception raised by ``PIL.Image.open`` or by the hashing itself
    propagates to the caller.
    """
    # The context manager releases the underlying file handle as soon as
    # the hash has been computed.
    with PIL.Image.open(filename) as image:
        return dhash(image, hash_size)
def _build_database(directory, hash_size):
    """Return a dict mapping image hash -> file name for *directory*."""
    database = {}
    for fn in os.listdir(directory):
        fnfull = os.path.join(directory, fn)
        try:
            hash_value = dhash_file(fnfull, hash_size)
        except IOError:
            # Deliberate best effort: entries that cannot be opened as an
            # image are simply left out of the database.
            continue
        database[hash_value] = fn
    return database


def _unique_destination(dest_base):
    """Return *dest_base*, or a numbered variant, that is not an existing file."""
    # Insert the counter before the extension so the copy keeps a usable
    # file type ("a_00000.jpg" rather than "a.jpg_00000").
    root, ext = os.path.splitext(dest_base)
    index = 0
    dest = dest_base
    while os.path.isfile(dest):
        print(" warning: {0} already exists".format(os.path.basename(dest)))
        dest = '{0}_{1:0>5}{2}'.format(root, index, ext)
        index += 1
    return dest


def main():
    """Command-line entry point.

    With ``--update-db`` the hash database for ``dest_dir`` is rebuilt and
    written to ``--db-file``; otherwise the existing database is loaded.
    With ``--copy-from DIR`` every image in DIR whose hash is not yet in
    the database is copied to ``--override-dest`` (if given) or to
    ``dest_dir``. ``--dry-copy`` only prints what would be copied.

    Returns ``errno.ENOENT`` when the database file is missing,
    ``errno.EINVAL`` when it does not contain a dict, and ``None`` on
    success.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('dest_dir')
    parser.add_argument('--hash-size', type=int, default=8)
    parser.add_argument('--dry-copy', action='store_true')
    parser.add_argument('--update-db', action='store_true')
    parser.add_argument('--db-file', default='_image_hash_db')
    parser.add_argument('--copy-from')
    parser.add_argument('--override-dest')
    args = parser.parse_args()

    if args.update_db:
        print("Updating Image Hash Database...")
        database = _build_database(args.dest_dir, args.hash_size)
        with open(args.db_file, 'wb') as fp:
            cPickle.dump(database, fp)
    else:
        if not os.path.isfile(args.db_file):
            print("error: Image Hash Database does not exist, use --update-db")
            return errno.ENOENT
        # NOTE: unpickling can execute arbitrary code. Only point --db-file
        # at a file that was written by this script.
        with open(args.db_file, 'rb') as fp:
            database = cPickle.load(fp)
        # Explicit check instead of ``assert``, which vanishes under -O.
        if not isinstance(database, dict):
            print("error: Image Hash Database is invalid, "
                  "recreate it with --update-db")
            return errno.EINVAL

    if args.copy_from:
        target_dir = args.override_dest or args.dest_dir
        for fn in os.listdir(args.copy_from):
            fnfull = os.path.join(args.copy_from, fn)
            try:
                hash_value = dhash_file(fnfull, args.hash_size)
            except IOError:
                # Not readable as an image: nothing to copy.
                continue

            if hash_value in database:
                print("{0}: image already exists: {1}".format(
                    fn, database[hash_value]))
                continue

            print("{0}: copying ...".format(fn))
            if args.dry_copy:
                copied_as = fn
            else:
                # Make sure the destination filename does not exist.
                dest = _unique_destination(os.path.join(target_dir, fn))
                shutil.copyfile(fnfull, dest)
                copied_as = os.path.basename(dest)

            # Remember the hash for the rest of this run so duplicates
            # inside the source directory are copied only once. This is
            # in-memory only; run --update-db to refresh the database file.
            database[hash_value] = copied_as
if __name__ == '__main__':
    # main() returns an errno code on failure and None on success; pass it
    # on so the process exit status reflects the outcome instead of
    # always being 0.
    import sys
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment