Skip to content

Instantly share code, notes, and snippets.

@asdkant
Created January 19, 2021 01:45
Show Gist options
  • Save asdkant/e9bd2c2c1564bb01f98b40df8d572557 to your computer and use it in GitHub Desktop.
Save asdkant/e9bd2c2c1564bb01f98b40df8d572557 to your computer and use it in GitHub Desktop.
Find duplicate raw files (works on Linux)
#!/usr/bin/python
import sys
from sys import argv
import pyexiv2
import os
from progress.bar import IncrementalBar
# Usage text printed for -h/--help. A plain string literal is enough: the
# text has no placeholders, so an f-string prefix adds nothing.
help_message = """Find duplicate images in a list provided (using metadata)
Usage:
find-raw-dupes list_of_files.txt results.txt
The list of files should have an item per line. Works with absolute paths.
"""

# Answer a help request before any positional argument is looked at.
if '-h' in argv or '--help' in argv:
    print(help_message)
    # sys.exit() instead of the bare exit(): the latter is a helper injected
    # by the `site` module for interactive sessions and is not defined when
    # the interpreter runs with -S or is embedded.
    sys.exit(0)
# Positional arguments: the list file is required, the results file is
# optional (fname_out stays None when it is not given).
if len(argv) < 2:
    # Without this guard a missing argument surfaced as a bare IndexError
    # traceback; show the usage text and fail with a non-zero status instead.
    print(help_message, file=sys.stderr)
    sys.exit(2)
fname_list = argv[1]
fname_out = argv[2] if len(argv) > 2 else None

# One candidate path per line. Blank lines (e.g. a trailing empty line) are
# skipped so they are not later reported as unreadable files.
with open(fname_list, 'r') as list_file:
    paths = [line for line in list_file.read().splitlines() if line.strip()]
# di maps a metadata fingerprint to the list of paths that produced it;
# err collects every path whose metadata could not be read or fingerprinted.
di, err = {}, []
with IncrementalBar('checking...', max=len(paths)) as bar:
    for path in paths:
        try:
            md = pyexiv2.metadata.ImageMetadata(path)
            md.read()
            fdate = md['Exif.Photo.DateTimeOriginal'].value
            # Fingerprint: hash of every "key<TAB>tag" pair, iterated in
            # sorted key order so tag ordering cannot affect the result.
            # The value is only compared against others from this same run.
            fhash1 = hash("\n".join(f"{k}\t{str(md[k])}" for k in sorted(md.keys())))
            fhash = f"{fdate}:{fhash1}"
        except Exception:
            # Best effort: any per-file failure is recorded and the scan
            # continues. Catching Exception (not a bare except) lets
            # KeyboardInterrupt and SystemExit still stop the program.
            err.append(path)
        else:
            di.setdefault(fhash, []).append(path)
        # Advance the progress bar once per path, readable or not.
        bar.next()
# Report every path that could not be processed: appended to
# "<results>_err.log" when a results file was given, otherwise to stderr.
if err:
    report = "".join(f"ERROR reading {bad_path}\n" for bad_path in err)
    if fname_out:
        fname_err = f"{fname_out}_err.log"
        print(f'there were some problematic files, will log to {fname_err}', file=sys.stderr)
        # The with-block closes the log even if the write fails.
        with open(fname_err, 'a') as ferr:
            ferr.write(report)
    else:
        sys.stderr.write(report)
# Emit one line per group of duplicates: the group's paths, sorted and
# tab-separated. Fingerprints seen only once are not duplicates and are
# skipped.
dupe_lines = ["\t".join(sorted(r)) + "\n" for r in di.values() if len(r) > 1]
if fname_out:
    # Write to the opened handle. The original passed the path string
    # (fname_out) as print()'s file argument, which fails because a str has
    # no write() method. Append mode is kept so earlier results are preserved.
    with open(fname_out, 'a') as fout:
        fout.writelines(dupe_lines)
else:
    sys.stdout.writelines(dupe_lines)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment