Skip to content

Instantly share code, notes, and snippets.

@bo858585
Forked from isagalaev/img.py
Last active December 11, 2015 09:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bo858585/4579268 to your computer and use it in GitHub Desktop.
Save bo858585/4579268 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# coding: utf-8
import os
import random
import glob
from PIL import Image
import numpy
import math
# Images are scaled down to BLOCK_SIZE x BLOCK_SIZE pixels before comparison.
BLOCK_SIZE = 20
# Two pixels count as different when the Euclidean distance between their
# colour values is greater than this.
THRESHOLD = 60
# Value of the width attribute on every <img> tag in the generated HTML.
WIDTH = 200
# Images whose distance() to each other is below this end up in one group.
MAX_DISTANCE = 220
def image_data(filename):
    """
    Load an image and reduce it to a small pixel array ready for comparison.

    The image is converted to RGB and scaled down to
    BLOCK_SIZE x BLOCK_SIZE pixels with bilinear resampling.

    Args:
        filename: Path of the image file to read.

    Returns:
        A signed-integer numpy array of shape (BLOCK_SIZE * BLOCK_SIZE, 3)
        holding one (R, G, B) row per pixel, in row-major order.
    """
    # The context manager closes the underlying file as soon as the pixels
    # have been read instead of leaving that to the garbage collector.
    with Image.open(filename) as img:
        # JPEG files may be grayscale or CMYK. Without normalising to RGB
        # such files yield arrays of a different shape (or meaning) than
        # RGB ones and cannot be compared pixel by pixel.
        thumb = img.convert('RGB').resize(
            (BLOCK_SIZE, BLOCK_SIZE), Image.BILINEAR
        )
    # Pillow exposes the pixels as uint8; widen to a signed integer type so
    # that subtracting two arrays cannot wrap around.
    return numpy.asarray(thumb).astype(int).reshape(-1, 3)
def distance(data1, data2, threshold=None):
    """
    Logical distance between two images: the number of differing pixels.

    A pixel counts as different when the Euclidean distance between its
    first three channel values in the two images is greater than
    ``threshold``. For two BLOCK_SIZE x BLOCK_SIZE images the result is on
    a scale 0..BLOCK_SIZE**2 (0..400 with BLOCK_SIZE = 20).

    Args:
        data1: Pixel array of the first image, one row per pixel.
        data2: Pixel array of the second image, same shape as ``data1``.
        threshold: Per-pixel colour difference above which a pixel is
            counted. Defaults to the module-level THRESHOLD.

    Returns:
        The number of differing pixels as a plain ``int``.
    """
    if threshold is None:
        threshold = THRESHOLD
    # Convert before subtracting: with unsigned inputs (e.g. uint8 straight
    # from an image) a negative difference would wrap around to a large
    # positive value. float64 represents all 8-bit channel values exactly.
    delta = numpy.asarray(data1, dtype=float) - numpy.asarray(data2, dtype=float)
    if delta.size == 0:
        return 0
    # Only the first three channels take part, so a trailing alpha channel
    # is ignored.
    rgb = delta[:, :3]
    magnitude = numpy.sqrt((rgb * rgb).sum(axis=1))
    return int(numpy.count_nonzero(magnitude > threshold))
def duplicates(dirname, pattern='*.jpg', max_distance=None):
    """
    Finds groups of similar-looking images in a directory.

    Every file matching ``pattern`` is loaded with image_data() and
    compared against every matching file (itself included) with
    distance(). The images are visited in random order and one group is
    yielded per image.

    Args:
        dirname: Directory to scan (not recursive).
        pattern: Glob pattern selecting the image files. Defaults to
            ``'*.jpg'``.
        max_distance: Only images whose distance is strictly below this
            value are put in a group. Defaults to the module-level
            MAX_DISTANCE.

    Returns:
        An iterator of groups. Each group is a list of
        ``(distance, filename)`` tuples sorted in ascending order, holding
        the images that are closer than ``max_distance`` to the image the
        group was built for.
    """
    if max_distance is None:
        max_distance = MAX_DISTANCE
    files = glob.glob(os.path.join(dirname, pattern))
    images = [(f, image_data(f)) for f in files]
    random.shuffle(images)
    for _, data in images:
        group = []
        for other_name, other_data in images:
            dist = distance(data, other_data)
            if dist < max_distance:
                group.append((dist, other_name))
        group.sort()
        yield group
def html_group(group, width=None):
    """
    Render one group of images as a run of HTML <img> tags.

    Each image is referenced by its base file name only (the directory
    part is dropped) and is followed by its distance value.

    Args:
        group: Iterable of ``(distance, filename)`` tuples.
        width: Value of the ``width`` attribute of every tag. Defaults to
            the module-level WIDTH.

    Returns:
        The concatenated HTML fragment; an empty string for an empty group.
    """
    # Imported locally under an alias: this module defines its own html()
    # function, which would shadow a module-level "import html".
    from html import escape as _escape

    if width is None:
        width = WIDTH
    # File names may contain quotes, ampersands or angle brackets; escape
    # them so they cannot break out of the src attribute.
    return ''.join(
        '<img src="%s" width="%s"/>%s'
        % (_escape(os.path.basename(f), quote=True), width, dist)
        for dist, f in group
    )
def html(groups):
    """
    Build a complete HTML page from groups of image duplicates.

    Every group is rendered with html_group(); the rendered groups are
    separated by horizontal rules and one more rule closes the page body.
    """
    fragments = [html_group(group) for group in groups]
    separator = '<hr/>'
    return '<html><body>%s<hr/></body></html>' % separator.join(fragments)
if __name__ == '__main__':
    import sys

    # The directory to scan may be given as the first command-line
    # argument; without one the original hard-coded location is used.
    target_dir = sys.argv[1] if len(sys.argv) > 1 else '/home/bo858585/Изображения'
    print(html(duplicates(target_dir)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment