Skip to content

Instantly share code, notes, and snippets.

@badjano
Last active November 25, 2020 18:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save badjano/0723e742bd94a68cc2479717f79653a0 to your computer and use it in GitHub Desktop.
Save badjano/0723e742bd94a68cc2479717f79653a0 to your computer and use it in GitHub Desktop.
Searches folders and subfolders for duplicate images, keeping the copy with the largest file size.
import hashlib
from glob import glob
import os
import cv2
import subprocess
def md5_for_file(path, chunk_size=1 << 20):
    """Return the raw MD5 digest (16 bytes) of the file at *path*.

    The file is hashed in fixed-size chunks so that large files are never
    loaded into memory in one piece.

    Args:
        path: Path of the file to hash.
        chunk_size: Number of bytes read per iteration (1 MiB by default).

    Returns:
        The digest as ``bytes``, as produced by ``hashlib.md5(...).digest()``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
    return md5.digest()
# Glob pattern of the images to scan: PNG files exactly three directory levels
# below "datasets". Built with os.path.join so it is not tied to "\\" separators.
folder_pattern = os.path.join("datasets", "*", "*", "*.png")

# Maps path -> (file size in bytes, flattened normalized histogram, MD5 digest).
size_map = {}
files = glob(folder_pattern)

read_last_index = len(files) - 1
# Refresh the progress line roughly 300 times; the step must never be 0,
# otherwise the modulo below fails when there are fewer than 301 files.
read_step = max(1, read_last_index // 300)
for index, path in enumerate(files):
    image = cv2.imread(path)
    if image is None:
        # cv2.imread returns None instead of raising when a file cannot be
        # decoded; such files are left alone rather than aborting the scan.
        print(f"\nSkipping unreadable image: {path}", flush=True)
    else:
        # 3-D histogram over the three image channels, 8 bins per channel.
        hist = cv2.calcHist([image], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
        hist = cv2.normalize(hist, hist).flatten()
        size_map[path] = (os.path.getsize(path), hist, md5_for_file(path))
    if index % read_step == 0 or index == read_last_index:
        # With a single file there is nothing to divide by: report 100%.
        percent = 100.0 if read_last_index == 0 else index * 100 / read_last_index
        msg = "\rReading files and caching data for comparison: %.02f%%" % percent
        print(msg, flush=True, end='' if index < read_last_index else "\n")

# Largest files first, so that the largest copy of an image is the one visited
# (and therefore kept) first. reverse=True keeps the order of equal sizes.
size_map = dict(sorted(size_map.items(), key=lambda item: item[1][0], reverse=True))

# Index of the last cached entry; used by the comparison progress display.
last_index = len(size_map) - 1

# State consumed by the comparison pass.
last = ()        # (path, size, histogram, md5) of the previously visited file
duplicate = []   # paths flagged as duplicates, in detection order
all_sims = {}    # cache of histogram similarities, keyed by the path pair
def show_image(path):
    """Open *path* with the operating system's default application.

    No shell command line is built, so paths containing spaces or shell
    metacharacters are handled correctly.

    Args:
        path: Path of the image file to open.

    Raises:
        OSError: If the file or the platform's opener program cannot be
            started.
    """
    if hasattr(os, "startfile"):
        # Windows: same effect as the `start` command, without going through
        # the shell.
        os.startfile(path)
        return

    import sys  # only needed to pick the opener on non-Windows platforms

    opener = "open" if sys.platform == "darwin" else "xdg-open"
    subprocess.call([opener, path])
# Histogram-correlation score above which two images count as duplicates.
similarity_threshold = .998
# Refresh the progress line roughly 300 times; the step must never be 0,
# otherwise the modulo below fails when there are fewer than 301 files.
compare_step = max(1, last_index // 300)
# Set mirror of `duplicate` for O(1) membership tests; the list itself keeps
# the detection order used for numbering the moved files.
duplicate_lookup = set(duplicate)

for i, key in enumerate(size_map):
    size, hist, md5 = size_map[key]
    if key not in duplicate_lookup:
        if last and size == last[1] and md5 == last[3]:
            # Same size and same digest as the previous file: exact copy.
            duplicate.append(key)
            duplicate_lookup.add(key)
        else:
            for j, k in enumerate(size_map):
                if i != j and k not in duplicate_lookup:
                    dif_key = "%s %s" % tuple(sorted([key, k]))
                    # Always read the score for THIS pair; on a cache hit the
                    # stored value is used, never one left over from another pair.
                    similarity = all_sims.get(dif_key)
                    if similarity is None:
                        similarity = all_sims[dif_key] = cv2.compareHist(
                            hist, size_map[k][1], cv2.HISTCMP_CORREL)
                    if similarity > similarity_threshold:
                        duplicate.append(k)
                        duplicate_lookup.add(k)
    last = (key, size, hist, md5)
    if i % compare_step == 0 or i == last_index:
        # With a single file there is nothing to divide by: report 100%.
        percent = 100.0 if last_index <= 0 else i * 100 / last_index
        msg = "\rComparing files: %.02f%%" % percent
        print(msg, flush=True, end='' if i < last_index else "\n")

# Move every flagged file into a separate folder instead of deleting it.
remove_folder = "duplicates"
os.makedirs(remove_folder, exist_ok=True)
for i, file in enumerate(duplicate):
    # splitext only looks at the final component, so dots in directory names
    # or earlier in the file name cannot corrupt the extension.
    extension = os.path.splitext(file)[1]
    os.rename(file, os.path.join(remove_folder, f"img_{i}{extension}"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment