Skip to content

Instantly share code, notes, and snippets.

@bo858585
Forked from isagalaev/img.py
Last active December 16, 2015 04:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bo858585/5377492 to your computer and use it in GitHub Desktop.
Save bo858585/5377492 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# coding: utf-8
import os
import random
import glob
from PIL import Image
import numpy
import math
# Side length, in pixels, of the square thumbnail each image is resized to
# before comparison.
BLOCK_SIZE = 20
# Not referenced by any function visible in this file; presumably left over
# from an earlier thresholded comparison -- confirm before removing.
THRESHOLD = 60
# Value written into the ``width`` attribute of every generated <img> tag.
WIDTH = 200
# An image joins a reference image's group when their distance is strictly
# below this value.
MAX_DISTANCE = 80
# Largest possible Euclidean distance between two RGB pixels with 8-bit
# channels (all three channels differ by 255).
PIXELS_MAX_DISTANCE = math.sqrt(3 * 255 ** 2)
def image_data(filename):
    """
    Load an image and reduce it to a small RGB thumbnail for comparison.

    The image is converted to RGB so that every pixel is a 3-tuple; without
    this a greyscale JPEG yields scalar pixels and a CMYK JPEG yields
    4-tuples, neither of which matches the per-channel arithmetic done in
    ``distance``.

    :param filename: path of the image file to open.
    :return: numpy integer array with one ``(R, G, B)`` row per pixel of the
        ``BLOCK_SIZE`` x ``BLOCK_SIZE`` thumbnail.
    """
    # The context manager releases the underlying file handle; ``convert``
    # produces an independent in-memory image, so using ``thumb`` after the
    # ``with`` block is safe.
    with Image.open(filename) as img:
        thumb = img.convert('RGB').resize(
            (BLOCK_SIZE, BLOCK_SIZE), Image.BILINEAR)
    # Building the array from Python ints gives a signed integer dtype, so
    # subtracting two such arrays cannot wrap around.
    return numpy.array(list(thumb.getdata()))
def distance(data1, data2):
    """
    Logical distance between two images.

    For every pair of corresponding pixels the Euclidean distance over the
    first three channels is computed; the distances are summed and divided
    by ``PIXELS_MAX_DISTANCE``. For 8-bit channel values the result
    therefore ranges from 0 (identical) up to the number of pixels compared,
    not 0..100.

    :param data1: array of per-pixel channel rows, shape ``(pixels, >=3)``.
    :param data2: array with the same shape as ``data1``.
    :return: the normalised sum of per-pixel distances as a Python float.
    """
    # Cast to float before subtracting so unsigned inputs (e.g. uint8)
    # cannot wrap around on negative differences.
    diff = numpy.asarray(data1, dtype=float) - numpy.asarray(data2, dtype=float)
    # Only the first three channels take part, as in the per-row formula
    # x[0]^2 + x[1]^2 + x[2]^2.
    rgb = diff[:, :3]
    per_pixel = numpy.sqrt((rgb * rgb).sum(axis=1))
    return float(per_pixel.sum() / PIXELS_MAX_DISTANCE)
def duplicates(dirname):
    """
    Group visually similar images found in a directory.

    Only files matching ``*.jpg`` directly inside ``dirname`` are examined.
    The images are processed in random order, and one group is yielded per
    image: a sorted list of ``(distance, filename)`` tuples for every image
    whose distance to it is below ``MAX_DISTANCE``. Each group also contains
    the reference image itself.
    """
    pattern = os.path.join(dirname, '*.jpg')
    images = []
    for path in glob.glob(pattern):
        images.append((path, image_data(path)))
    random.shuffle(images)
    for _, reference in images:
        # Score every image (the reference included) against the reference.
        scored = []
        for path, candidate in images:
            scored.append((distance(reference, candidate), path))
        group = [(score, path) for score, path in scored if score < MAX_DISTANCE]
        group.sort()
        yield group
def html_group(group):
    """
    Render one group of images as a run of <img> tags.

    Each entry becomes an ``<img>`` whose ``src`` is the file's base name and
    whose ``width`` is ``WIDTH``, followed by the entry's distance value.
    The file name is HTML-escaped so that characters such as ``"``, ``&``
    or ``<`` in a name cannot break out of the attribute.

    :param group: iterable of ``(distance, filename)`` tuples.
    :return: the concatenated HTML fragment (empty string for an empty group).
    """
    # Imported locally: this module defines its own ``html`` function, which
    # would shadow a module-level ``import html``.
    from html import escape

    return ''.join(
        '<img src="%s" width="%s"/>%s' % (
            escape(os.path.basename(f), quote=True), WIDTH, dist)
        for dist, f in group
    )
def html(groups):
    """
    Build a complete HTML page from groups of image duplicates.

    Every group is rendered with ``html_group``; the rendered groups are
    separated by ``<hr/>`` and a final ``<hr/>`` closes the body.
    """
    sections = [html_group(group) for group in groups]
    return '<html><body>' + '<hr/>'.join(sections) + '<hr/></body></html>'
if __name__ == '__main__':
    import sys

    # Directory to scan: the first command-line argument when given,
    # otherwise the original hard-coded location, so running the script
    # without arguments behaves exactly as before.
    target_dir = sys.argv[1] if len(sys.argv) > 1 else '/home/bo858585/Изображения'
    print(html(duplicates(target_dir)))
@bo858585
Copy link
Author

Разность матриц теперь оценивается непрерывной функцией (суммой расстояний между соответствующими пикселями; THRESHOLD не используется) вместо дискретной (суммы единиц), которая была изначально. Такая оценка точнее.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment