Last active
August 29, 2015 14:04
-
-
Save nrrb/9212e4e95485a2d533ba to your computer and use it in GitHub Desktop.
Image Metadata Extraction
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# This doesn't use any Python libraries for extracting image metadata. Instead,
# it runs the very handy exiftool utility and parses its output.
# exiftool: http://www.sno.phy.queensu.ca/~phil/exiftool/
from __future__ import unicode_literals

import csv
import os
import re
import subprocess
import unicodedata

from pymongo import MongoClient


def normalize_string(my_string):
    """Reduce a byte or text string to its closest plain-ASCII form.

    ``None`` becomes the empty string. Anything that is not already text is
    decoded as UTF-8 (undecodable bytes are dropped). The text is then
    NFKD-normalized, which splits accented characters into a base letter
    plus combining marks, and every non-ASCII code point is discarded.

    Returns the interpreter's native ``str`` type: a byte string on
    Python 2 and a text string on Python 3.
    """
    if my_string is None:
        return ''
    # type(u'') is `unicode` on Python 2 and `str` on Python 3, so this check
    # works on both without naming the Python-2-only builtin `unicode`.
    text_type = type(u'')
    if not isinstance(my_string, text_type):
        my_string = my_string.decode('utf-8', 'ignore')
    ascii_bytes = unicodedata.normalize('NFKD', my_string).encode('ascii', 'ignore')
    if isinstance(ascii_bytes, str):
        # Python 2: bytes *is* str, so return it unchanged.
        return ascii_bytes
    # Python 3: return text so callers can keep using str methods such as
    # split('\n') on the result.
    return ascii_bytes.decode('ascii')
def extension(filename):
    """Return the lower-cased extension of *filename*, without the dot.

    A name with no extension yields ``''``. Splitting on '.' by hand would
    return the entire name in that case, so a file literally called ``jpg``
    would look like it had a ``jpg`` extension. Only the last suffix is
    returned (``a.tar.gz`` -> ``gz``), and a leading dot on a hidden file
    is not treated as an extension separator.
    """
    return os.path.splitext(filename)[1][1:].lower()
def is_image(filename):
    """Tell whether *filename* has one of the recognised image extensions."""
    image_extensions = ['jpg', 'jpeg', 'png']
    file_extension = extension(filename)
    return file_extension in image_extensions
def is_original_wp_image(filename):
    """Return True unless *filename* looks like a WordPress resized copy.

    Resized copies are recognised by a ``-<width>x<height>`` suffix placed
    directly before the extension, e.g. ``photo-150x150.jpg``. The check
    covers the same extensions ``is_image`` accepts (jpg, jpeg, png) and
    ignores case, so ``photo-150x150.PNG`` is also rejected. The pattern is
    anchored to the end of the name so that a size-like fragment elsewhere
    in the name does not count.
    """
    resized_suffix = r'-\d+x\d+\.(?:jpe?g|png)$'
    return not re.search(resized_suffix, filename, re.IGNORECASE)
def get_metadata_by_type(filename, metadata_type='EXIF'):
    """Run exiftool on *filename* and return one metadata group as a dict.

    Each non-empty line of exiftool's standard output is split at its first
    colon into a tag name and a value. Keys are the tag name prefixed with
    ``metadata_type`` and a space (e.g. ``'EXIF Create Date'``); values are
    the stripped remainder of the line. A line without a colon is stored
    with an empty value.

    Raises whatever ``subprocess.Popen`` raises when the ``exiftool``
    executable cannot be started.
    """
    # NOTE(review): '-TAG' looks like the placeholder from exiftool's usage
    # text rather than a real tag name; kept unchanged -- confirm whether it
    # is needed.
    command = ['exiftool', '-TAG', "-{0}:*".format(metadata_type), filename]
    process = subprocess.Popen(command, stdout=subprocess.PIPE)
    # stderr is not piped, so communicate() returns None for it.
    out, _ = process.communicate()

    metadata = {}
    for line in normalize_string(out).split('\n'):
        if not line:
            continue
        # Split on the FIRST colon only: values such as timestamps
        # ("2015:08:29 14:04:00") and URLs contain colons of their own that
        # must be preserved.
        tag_name, _, value = line.partition(':')
        metadata[metadata_type + ' ' + tag_name.strip()] = value.strip()
    return metadata
def get_all_metadata(filename):
    """Collect the EXIF, IPTC and XMP metadata of *filename* into one dict.

    The groups are queried in that order and merged with ``dict.update``,
    so a later group would overwrite an earlier one on a key collision.
    """
    combined = {}
    for group in ('EXIF', 'IPTC', 'XMP'):
        group_metadata = get_metadata_by_type(filename, group)
        combined.update(group_metadata)
    return combined
def main():
    """Store the metadata of every original image under the current directory.

    Walks '.' recursively. For each file that is an image and not a
    WordPress resized copy, prints its absolute path and inserts one
    document into the ``wp_uploads`` collection of the ``image_metadata``
    database. The document holds the path plus all extracted metadata.
    ``MongoClient()`` is called without arguments, i.e. with the driver's
    default connection settings.
    """
    client = MongoClient()
    wp_uploads = client.image_metadata.wp_uploads
    for rootdir, _dirs, filenames in os.walk('.'):
        for filename in filenames:
            if not (is_image(filename) and is_original_wp_image(filename)):
                continue
            full_path = os.path.abspath(os.path.join(rootdir, filename))
            print(full_path)
            this_metadata = {'path': full_path}
            this_metadata.update(get_all_metadata(full_path))
            # NOTE(review): Collection.insert() is deprecated since
            # PyMongo 3.0 and removed in 4.0; switch to insert_one() once
            # PyMongo 2.x support is no longer needed.
            wp_uploads.insert(this_metadata)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Put summarize.py in the root folder that contains your images and run it from there. By default it connects to a MongoDB instance on localhost.