Skip to content

Instantly share code, notes, and snippets.

@gromgull
Last active December 19, 2015 16:58
Show Gist options
  • Save gromgull/5987234 to your computer and use it in GitHub Desktop.
Save gromgull/5987234 to your computer and use it in GitHub Desktop.
Finding duplicate images by name and EXIF data. The EXIF parser is from https://github.com/ianare/exif-py
def getdate(f):
    """Return the EXIF ``DateTimeOriginal`` value of image file *f*.

    The file is opened in binary mode and handed to ``EXIF.process_file``;
    the ``values`` attribute of the ``"EXIF DateTimeOriginal"`` tag is
    returned.

    This is deliberately best-effort: if the file cannot be opened or
    parsed, or the tag is missing, ``None`` is returned instead of raising.
    """
    try:
        # Binary mode: image data must not go through text decoding or
        # newline translation. ``open`` also works where the Python 2-only
        # ``file`` builtin does not exist.
        with open(f, "rb") as x:
            exif = EXIF.process_file(x, stop_tag="DateTimeOriginal")
        return exif["EXIF DateTimeOriginal"].values
    except Exception:
        # Still swallow ordinary failures, but let KeyboardInterrupt and
        # SystemExit propagate (a bare ``except:`` would hide them).
        return None
def noext(f):
    """Remove ALL extensions from a filename.

    Everything from the first ``'.'`` onwards is dropped
    (``os.path.splitext`` only removes the last extension). A name that
    contains no dot is returned unchanged.
    """
    # partition() yields the whole string as the head when there is no '.',
    # whereas f.index('.') would raise ValueError for such names.
    return f.partition('.')[0]
# Input lists for the duplicate search below. The "...." / "..." parts are
# elisions in this snippet, not valid Python: replace both lists with the real
# paths before running.
# newfiles: paths that are checked for an existing duplicate.
newfiles = [ '1/IMGP1234.JPG', .... ]
# oldfiles: paths that the new files are compared against.
oldfiles = [ 'archive/2009/ESWC/... ' ... ]
def _group_by_base(paths):
    """Group *paths* by lower-cased basename with all extensions removed."""
    groups = collections.defaultdict(list)
    for path in paths:
        groups[noext(os.path.basename(path)).lower()].append(path)
    return groups


# Index both sets of files by their case-insensitive, extension-less basename
# so that only files sharing a name are ever compared.
basenew = _group_by_base(newfiles)
baseold = _group_by_base(oldfiles)

# (new, old) pairs that share a base name AND report the same EXIF date.
# NOTE(review): getdate() returns None when no date can be read, so two files
# that both lack a readable date also compare equal and are matched on name
# alone -- confirm this is intended.
match = []
# Dates of old files already looked up; avoids re-reading the same old file
# for every new file that shares its base name.
olddates = {}
for base, files in basenew.items():
    if base not in baseold:
        continue
    for f in files:
        try:
            newdate = getdate(f)
            pairs = []
            for old in baseold[base]:
                if old not in olddates:
                    olddates[old] = getdate(old)
                if olddates[old] == newdate:
                    pairs.append((f, old))
            # Extend only once the whole candidate list was processed, so an
            # error part-way through adds nothing for this file.
            match += pairs
        except IOError as e:
            # "as" form and print(...) are valid on Python 2.6+ and Python 3.
            print(e)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment