Skip to content

Instantly share code, notes, and snippets.

@nrrb
Last active August 29, 2015 14:04
Show Gist options
  • Save nrrb/9212e4e95485a2d533ba to your computer and use it in GitHub Desktop.
Save nrrb/9212e4e95485a2d533ba to your computer and use it in GitHub Desktop.
Image Metadata Extraction
# -*- coding: utf-8 -*-
# This doesn't use any Python libraries for extracting image metadata. Instead,
# it relies on the very handy exiftool utility and parsing the results.
# exiftool: http://www.sno.phy.queensu.ca/~phil/exiftool/
from __future__ import unicode_literals
import unicodedata
import subprocess
import csv
import os
import re
from pymongo import MongoClient
def normalize_string(my_string):
    """Reduce a string to plain ASCII text.

    ``None`` becomes an empty string. Byte strings are decoded as UTF-8,
    silently dropping bytes that cannot be decoded. The text is then
    NFKD-normalized, which decomposes accented characters into a base letter
    plus combining marks, and every remaining non-ASCII character is dropped.

    The result is always text (``unicode`` on Python 2, ``str`` on Python 3),
    the same type as the empty string returned for ``None``.
    """
    if my_string is None:
        return ''
    # ``bytes`` is an alias of ``str`` on Python 2, so this check works on both
    # major versions, unlike a comparison against the Python-2-only ``unicode``.
    if isinstance(my_string, bytes):
        my_string = my_string.decode('utf-8', 'ignore')
    decomposed = unicodedata.normalize('NFKD', my_string)
    # Round-trip through ASCII bytes to strip everything outside ASCII while
    # still handing text back to the caller.
    return decomposed.encode('ascii', 'ignore').decode('ascii')
def extension(filename):
    """Return the lower-cased extension of *filename*, without the dot.

    Only the last suffix counts ('a.tar.gz' -> 'gz'). A name with no
    extension yields '' rather than the name itself; os.path.splitext also
    treats a leading dot (e.g. '.hidden') as part of the name, not as an
    extension separator.
    """
    suffix = os.path.splitext(filename)[1]
    # Drop the leading '.' that splitext keeps on the suffix.
    return suffix[1:].lower()
def is_image(filename):
    """Tell whether *filename* has one of the recognised image extensions."""
    image_extensions = ['jpg', 'jpeg', 'png']
    file_extension = extension(filename)
    return file_extension in image_extensions
def is_original_wp_image(filename):
    """Return True unless *filename* ends in a '-<width>x<height>' size suffix.

    Names such as 'photo-150x150.jpg' are presumably the resized copies that
    WordPress generates next to an upload (verify against your uploads
    folder); anything without that suffix is treated as an original.

    The check is case-insensitive and covers the same extensions that
    is_image() accepts (jpg, jpeg, png), so resized PNG/JPEG copies are
    filtered out as well.
    """
    resized_suffix = r'-\d+x\d+\.(?:jpe?g|png)$'
    return re.search(resized_suffix, filename, re.IGNORECASE) is None
def get_metadata_by_type(filename, metadata_type='EXIF'):
    """Run exiftool on *filename* and collect the tags of one metadata group.

    exiftool is invoked with ``-<metadata_type>:*`` and its standard output
    is parsed line by line as ``name : value``.

    Returns a dict whose keys are ``'<metadata_type> <tag name>'`` and whose
    values are the ASCII-normalized tag values. A line without a colon is
    stored with an empty value. Blank lines are skipped.
    """
    # NOTE(review): '-TAG' is carried over unchanged from the original command
    # line; confirm that exiftool needs it.
    command = ['exiftool', '-TAG', "-{0}:*".format(metadata_type), filename]
    p = subprocess.Popen(command, stdout=subprocess.PIPE)
    # Only stdout is piped, so the stderr slot of communicate() is None and is
    # ignored here.
    out = normalize_string(p.communicate()[0])

    metadata = {}
    for line in out.split('\n'):
        if not line:
            continue
        # Split on the FIRST colon only: values such as timestamps
        # ('2014:07:24 10:00:00') contain colons that must be preserved.
        name, _, value = line.partition(':')
        metadata[metadata_type + ' ' + name.strip()] = value.strip()
    return metadata
def get_all_metadata(filename):
    """Merge the EXIF, IPTC and XMP tags of *filename* into a single dict."""
    combined = {}
    # Same order as before: EXIF first, then IPTC, then XMP.
    for group in ('EXIF', 'IPTC', 'XMP'):
        combined.update(get_metadata_by_type(filename, group))
    return combined
if __name__ == "__main__":
    # Walk the current directory tree and store the metadata of every original
    # (non-resized) image as one document in image_metadata.wp_uploads.
    client = MongoClient()  # no arguments: pymongo's default host and port
    db = client.image_metadata
    wp_uploads = db.wp_uploads
    for rootdir, dirs, filenames in os.walk('.'):
        for filename in filenames:
            if not (is_image(filename) and is_original_wp_image(filename)):
                continue
            full_path = os.path.abspath(os.path.join(rootdir, filename))
            print(full_path)
            this_metadata = {'path': full_path}
            this_metadata.update(get_all_metadata(full_path))
            # NOTE(review): Collection.insert() is deprecated since PyMongo 3.0
            # and removed in 4.0; switch to insert_one() when upgrading.
            wp_uploads.insert(this_metadata)
@nrrb
Copy link
Author

nrrb commented Jul 24, 2014

Put summarize.py in the root folder that contains your images and run it from there. By default it connects to a MongoDB instance on localhost.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment