isagalaev/img.py

## img.py
#!/usr/bin/env python
# coding: utf-8

import os
import random
import glob

from PIL import Image
import numpy


# Side length, in pixels, of the square thumbnail every image is shrunk to
# before comparison (see image_data).
BLOCK_SIZE = 20
# Two thumbnail pixels count as "different" when their summed band values
# differ by more than this (see distance).
THRESHOLD = 60
# Value written into the width attribute of each <img> tag in the HTML report.
WIDTH = 200
# Images are put in the same group only if their distance is below this value.
MAX_DISTANCE = 220


def image_data(filename):
    """
    Get data from image ready for comparison.

    The image is converted to RGB, shrunk to BLOCK_SIZE x BLOCK_SIZE pixels
    and reduced to one number per pixel: the sum of its three bands.

    Returns a flat, row-major numpy array of signed integers (one entry per
    thumbnail pixel), so that two results can be subtracted from each other
    without unsigned wrap-around.
    """
    # Context manager: close the underlying file as soon as the thumbnail
    # has been built instead of leaving it to the garbage collector.
    with Image.open(filename) as img:
        # Normalise the mode first. Single-band (grayscale) images yield a
        # plain int per pixel, which cannot be summed like a tuple, and
        # four-band images would not be comparable with RGB ones.
        thumb = img.convert('RGB').resize(
            (BLOCK_SIZE, BLOCK_SIZE), Image.BILINEAR
        )
    # Widen before summing: the bands are unsigned 8-bit values.
    pixels = numpy.asarray(thumb, dtype=numpy.int64)
    return pixels.sum(axis=2).ravel()

def distance(data1, data2, threshold=None):
    """
    Logical distance between two images on a scale 0..400.

    data1, data2 -- equally sized numpy arrays of per-pixel values.
    threshold -- absolute per-element difference that must be exceeded for
        a position to count as different; defaults to the module-level
        THRESHOLD.

    Returns, as a plain int, the number of positions whose values differ by
    more than the threshold. The upper bound is the array length.
    """
    if threshold is None:
        threshold = THRESHOLD
    # Vectorised count: same total as looping over the differences one by
    # one, without iterating the array in Python.
    return int(numpy.count_nonzero(numpy.abs(data1 - data2) > threshold))

def duplicates(dirname):
    """
    Group every image in a directory with the images that resemble it.

    Only files matching *.jpg directly inside `dirname` are considered.
    This is a generator: for each image, visited in shuffled order, it
    yields a sorted list of (distance, filename) pairs for all images whose
    distance to it is below MAX_DISTANCE. Every image is also compared with
    itself.
    """
    pattern = os.path.join(dirname, '*.jpg')
    fingerprints = []
    for path in glob.glob(pattern):
        fingerprints.append((path, image_data(path)))
    random.shuffle(fingerprints)

    for _, reference in fingerprints:
        # Measure against every image first, then keep the close ones.
        measured = [
            (distance(reference, candidate), path)
            for path, candidate in fingerprints
        ]
        group = [pair for pair in measured if pair[0] < MAX_DISTANCE]
        group.sort()
        yield group

def html_group(group, width=None):
    """
    Render one group of similar images as an HTML fragment.

    group -- iterable of (distance, filename) pairs.
    width -- value for the width attribute of every <img> tag; defaults to
        the module-level WIDTH.

    Returns the concatenation of one <img> tag per pair, each followed by
    its distance. Only the base name of each file is used as the image
    source, and it is percent-encoded so that characters such as quotes,
    '&', '#' or spaces can break neither the markup nor the link.
    """
    # Local import keeps the block self-contained; the fallback covers
    # interpreters where quote() still lives directly in urllib.
    try:
        from urllib.parse import quote
    except ImportError:
        from urllib import quote

    if width is None:
        width = WIDTH
    return ''.join(
        '<img src="%s" width="%s"/>%s'
        % (quote(os.path.basename(f)), width, dist)
        for dist, f in group
    )

def html(groups):
    """
    Build a complete HTML page from groups of image duplicates.

    Each group is rendered with html_group(); the rendered groups are
    separated by <hr/> and one more <hr/> closes the page body.
    """
    sections = [html_group(group) for group in groups]
    return '<html><body>' + '<hr/>'.join(sections) + '<hr/></body></html>'


if __name__ == '__main__':
    import sys

    # Directory to scan: the first command-line argument when given,
    # otherwise the location that used to be hard-coded, so existing
    # invocations keep working unchanged.
    if len(sys.argv) > 1:
        target = sys.argv[1]
    else:
        target = '/home/maniac/Desktop/4554182'
    # Print the HTML report for that directory to standard output.
    print(html(duplicates(target)))
	#!/usr/bin/env python
	# coding: utf-8

	import os
	import random
	import glob

	from PIL import Image
	import numpy


	# Side length, in pixels, of the square thumbnail every image is shrunk to
	# before comparison (see image_data).
	BLOCK_SIZE = 20
	# Two thumbnail pixels count as "different" when their summed band values
	# differ by more than this (see distance).
	THRESHOLD = 60
	# Value written into the width attribute of each <img> tag in the HTML report.
	WIDTH = 200
	# Images are put in the same group only if their distance is below this value.
	MAX_DISTANCE = 220


	def image_data(filename):
	    """
	    Get data from image ready for comparison.

	    The image is converted to RGB, shrunk to BLOCK_SIZE x BLOCK_SIZE pixels
	    and reduced to one number per pixel: the sum of its three bands.

	    Returns a flat, row-major numpy array of signed integers (one entry per
	    thumbnail pixel), so that two results can be subtracted from each other
	    without unsigned wrap-around.
	    """
	    # Context manager: close the underlying file as soon as the thumbnail
	    # has been built instead of leaving it to the garbage collector.
	    with Image.open(filename) as img:
	        # Normalise the mode first. Single-band (grayscale) images yield a
	        # plain int per pixel, which cannot be summed like a tuple, and
	        # four-band images would not be comparable with RGB ones.
	        thumb = img.convert('RGB').resize(
	            (BLOCK_SIZE, BLOCK_SIZE), Image.BILINEAR
	        )
	    # Widen before summing: the bands are unsigned 8-bit values.
	    pixels = numpy.asarray(thumb, dtype=numpy.int64)
	    return pixels.sum(axis=2).ravel()

	def distance(data1, data2, threshold=None):
	    """
	    Logical distance between two images on a scale 0..400.

	    data1, data2 -- equally sized numpy arrays of per-pixel values.
	    threshold -- absolute per-element difference that must be exceeded for
	        a position to count as different; defaults to the module-level
	        THRESHOLD.

	    Returns, as a plain int, the number of positions whose values differ by
	    more than the threshold. The upper bound is the array length.
	    """
	    if threshold is None:
	        threshold = THRESHOLD
	    # Vectorised count: same total as looping over the differences one by
	    # one, without iterating the array in Python.
	    return int(numpy.count_nonzero(numpy.abs(data1 - data2) > threshold))

	def duplicates(dirname):
	    """
	    Group every image in a directory with the images that resemble it.

	    Only files matching *.jpg directly inside `dirname` are considered.
	    This is a generator: for each image, visited in shuffled order, it
	    yields a sorted list of (distance, filename) pairs for all images whose
	    distance to it is below MAX_DISTANCE. Every image is also compared with
	    itself.
	    """
	    pattern = os.path.join(dirname, '*.jpg')
	    fingerprints = []
	    for path in glob.glob(pattern):
	        fingerprints.append((path, image_data(path)))
	    random.shuffle(fingerprints)

	    for _, reference in fingerprints:
	        # Measure against every image first, then keep the close ones.
	        measured = [
	            (distance(reference, candidate), path)
	            for path, candidate in fingerprints
	        ]
	        group = [pair for pair in measured if pair[0] < MAX_DISTANCE]
	        group.sort()
	        yield group

	def html_group(group, width=None):
	    """
	    Render one group of similar images as an HTML fragment.

	    group -- iterable of (distance, filename) pairs.
	    width -- value for the width attribute of every <img> tag; defaults to
	        the module-level WIDTH.

	    Returns the concatenation of one <img> tag per pair, each followed by
	    its distance. Only the base name of each file is used as the image
	    source, and it is percent-encoded so that characters such as quotes,
	    '&', '#' or spaces can break neither the markup nor the link.
	    """
	    # Local import keeps the block self-contained; the fallback covers
	    # interpreters where quote() still lives directly in urllib.
	    try:
	        from urllib.parse import quote
	    except ImportError:
	        from urllib import quote

	    if width is None:
	        width = WIDTH
	    return ''.join(
	        '<img src="%s" width="%s"/>%s'
	        % (quote(os.path.basename(f)), width, dist)
	        for dist, f in group
	    )

	def html(groups):
	    """
	    Build a complete HTML page from groups of image duplicates.

	    Each group is rendered with html_group(); the rendered groups are
	    separated by <hr/> and one more <hr/> closes the page body.
	    """
	    sections = [html_group(group) for group in groups]
	    return '<html><body>' + '<hr/>'.join(sections) + '<hr/></body></html>'


	if __name__ == '__main__':
	    import sys

	    # Directory to scan: the first command-line argument when given,
	    # otherwise the location that used to be hard-coded, so existing
	    # invocations keep working unchanged.
	    if len(sys.argv) > 1:
	        target = sys.argv[1]
	    else:
	        target = '/home/maniac/Desktop/4554182'
	    # Print the HTML report for that directory to standard output.
	    print(html(duplicates(target)))