sprklinginfo/batchingester.py

## batchingester.py
import subprocess
import tempfile
import MySQLdb
import ConfigParser
import sys
import os
import logging
from fcrepo.utils import NS
from fcrepo.connection import Connection, FedoraConnectionException
from fcrepo.client import FedoraClient
from lxml.builder import ElementMaker

from . import xacml

# Absolute path of the ImageMagick `convert` executable.  The add_jp2,
# add_png, add_jpeg, add_scaled_jpeg and add_cropped_jpeg helpers below run
# it through subprocess.Popen to produce derivative images.
CONVERT = "/opt/ImageMagick-6.7/bin/convert"

class BatchIngester(object):

    def __init__(self, config, namespace, dry_run=False):
        """
            Create an instance of BatchIngester and establish
            a connection to the Fedora repository.

            config    -- path of an INI file read with ConfigParser; the
                         IngestLog, Logging and Fedora sections are used.
            namespace -- alphanumeric PID namespace used for new objects.
            dry_run   -- stored as self.dry_run and consulted by the
                         ingest methods.

            Raises Exception when the namespace is not alphanumeric and
            KeyError when the configured loglevel is not one of DEBUG,
            INFO, WARNING, ERROR or CRITICAL.
        """
        if not namespace.isalnum():
            raise Exception("Invalid namespace: {0}".format(namespace))
        self.namespace = namespace
        self.collections = []
        self.dry_run = dry_run

        # Read the configuration file
        conf = ConfigParser.ConfigParser()
        conf.read(config)

        # Connect to the logging database
        self.db = MySQLdb.connect(host=conf.get("IngestLog", "hostname"),
                               db=conf.get("IngestLog", "database"),
                               user=conf.get("IngestLog", "username"),
                               passwd=conf.get("IngestLog", "password"))
        self.db.set_character_set("UTF8")

        # Set up logging.  The format uses %(message)s (the fully
        # interpolated text) rather than %(msg)s, which is the raw format
        # string and drops any arguments passed to the logging call.
        log_levels = {'DEBUG': logging.DEBUG,
                      'INFO': logging.INFO,
                      'WARNING': logging.WARNING,
                      'ERROR': logging.ERROR,
                      'CRITICAL': logging.CRITICAL}
        logging.basicConfig(
            format="%(asctime)s (%(levelname)s): %(message)s",
            datefmt="%b %d %I:%M:%S %p",
            filename=conf.get('Logging', 'logfile'),
            level=log_levels[conf.get('Logging', 'loglevel')])

        # Connect to Fedora
        self.fedoraUser = unicode(conf.get('Fedora', 'username'))
        connection = Connection(conf.get('Fedora', 'url'),
                                username=self.fedoraUser,
                                password=conf.get('Fedora', 'password'))
        self.fedora = FedoraClient(connection)
        self.config = conf

    def add_collection(self, collection, label, parent="collection:root",
            tn="TN.png"):
        """
            Add a collection to fedora if it doesn't already exist.
            This also sets the current "parent" of all ingested items.

            collection -- either a full PID ("namespace:name") or a bare
                          alphanumeric name, which gets self.namespace
                          prepended.
            label      -- label for a newly created collection object.
            parent     -- PID of the parent collection.  It is fetched
                          first, so an error from that lookup propagates.
            tn         -- path of an image attached as the TN datastream
                          of a newly created collection, if it exists.

            Raises Exception when the collection name is invalid.
        """
        pid = None
        name_parts = collection.split(':')
        if len(name_parts) == 2:
            pid = collection
        elif len(name_parts) == 1 and collection.isalnum():
            pid = self.namespace + ':' + collection
        if not pid:
            raise Exception("Invalid collection name: '{0}'".format(collection))

        # NOTE(review): in a dry run the PID is cleared, so the existence
        # check below is made with pid=None -- confirm that is intended.
        if self.dry_run:
            pid = None

        logging.debug(
                "Checking existence of parent: '{0}'".format(parent))

        self.fedora.getObject(parent)

        logging.debug(
                "Checking existence of collection: '{0}'".format(pid))

        try:
            self.fedora.getObject(pid)
        except FedoraConnectionException as ex:
            if ex.httpcode == 404:
                # Collection does not exist, so create it now.
                logging.info("Creating object: '{0}'".format(pid))

                access = xacml.XACML()
                access.deny('deny-datastreams',
                        resources={'urn:fedora:names:fedora:2.1:resource:datastream:id': [
                            'POLICY', 'RELS-EXT']},
                        actions={'urn:fedora:names:fedora:2.1:action:id': [
                            'urn:fedora:names:fedora:2.1:action:id-getDatastreamDissemination']})
                access.deny('deny-apim',
                        actions={"urn:fedora:names:fedora:2.1:action:api": [
                            "urn:fedora:names:fedora:2.1:action:api-m"]},
                        groups=["librarian"])
                access.deny('deny-apia',
                        actions={
                            'urn:fedora:names:fedora:2.1:action:api': [
                                'urn:fedora:names:fedora:2.1:action:api-a'],
                            'urn:fedora:names:fedora:2.1:action:id': [
                                'urn:fedora:names:fedora:2.1:action:id-listObjectInResourceIndexResults']},
                        groups=['authenticated user', 'librarian', 'administrator'])
                access.permit('permit-everything-else')

                try:
                    if pid is not None:
                        obj = self.fedora.createObject(pid, label=label)
                        obj.ownerId = self.fedoraUser
                        obj.addDataStream('POLICY', access.policy(),
                                checksumType=u'DISABLED',
                                label=u'XACML Policy',
                                logMessage=u'Added POLICY datastream')

                        obj.addDataStream('RELS-EXT',
                                label=u'Fedora object-to-object relationships',
                                checksumType=u'DISABLED')
                        ds = obj['RELS-EXT']
                        ds[NS.fedora.isMemberOfCollection].append({
                                    'value': u'info:fedora/{0}'.format(parent),
                                    'type': u'uri'
                                })
                        ds[NS.fedora_model.hasModel].append({
                                    'value': u'info:fedora/my:collectionCModel',
                                    'type': u'uri'
                                })
                        ds.setContent()
                except Exception:
                    # Creation stays best effort (processing continues),
                    # but the failure is recorded instead of being
                    # silently discarded.
                    logging.exception(
                        "Could not create collection object: '{0}'".format(pid))

                if os.path.exists(tn):
                    self.add_png(pid, tn, "TN")
                else:
                    logging.warning(
                        "Missing TN datastream for collection object")

            else:
                logging.error(
                    "Could not access collection object: {0}".format(pid))

        # Set the current collection
        self.collections.append(pid)

    def processMETS(self, filename):
        """
            Process a METS file if it hasn't already been ingested.
            Update the database after processing.

            Returns 0 when the file is already recorded in the log table
            and None otherwise.  Rows are only written when self.ingest()
            returns a PID.
        """
        cursor = self.db.cursor()
        try:
            # Query parameters must be a sequence; "(filename)" without the
            # trailing comma is just a parenthesised string.
            cursor.execute("""
                SELECT pid, indexed
                FROM log
                WHERE filename=%s""", (filename,))
            if cursor.fetchone():
                logging.info("{0} has already been processed".format(filename))
                return 0

            self.derivatives = []
            pid = self.ingest(filename)
            if pid is not None:
                cursor.execute("""
                        INSERT INTO log (pid, filename, indexed)
                        VALUES (%s, %s, NOW())""", (pid, filename))
                for d in self.derivatives:
                    cursor.execute("""
                        INSERT INTO deriv (parent, pid)
                        VALUES (%s, %s)""", (pid, d))
                # DB-API connections start with autocommit off, so the
                # inserts must be committed to survive the connection.
                self.db.commit()
        finally:
            cursor.close()

    def add_rels_ext(self, pid, models=None):
        """
            Add the RELS_EXT datastream.

            pid    -- PID of the object to update; nothing is written when
                      it is None.
            models -- optional iterable of content model PIDs, each added
                      as a hasModel relationship.  Defaults to none.

            Every PID in self.collections is added as an
            isMemberOfCollection relationship.
        """
        logging.debug("Adding RELS_EXT datastream")
        if pid is None:
            return

        obj = self.fedora.getObject(pid)
        obj.addDataStream('RELS-EXT',
                checksumType=u'DISABLED',
                label=u"Fedora object-to-object relationships")
        ds = obj['RELS-EXT']
        for c in self.collections:
            ds[NS.fedora.isMemberOfCollection].append({
                    'value': u'info:fedora/{0}'.format(c),
                    'type': u'uri'
                })
        # None replaces the former mutable "[]" default argument.
        for m in models or ():
            ds[NS.fedora_model.hasModel].append({
                    'value': u'info:fedora/{0}'.format(m),
                    'type': u'uri'
                })
        ds.setContent()

    def add_master(self, pid, filename, mimeType):
        """
            Attach the master file to an object as its OBJ datastream.

            pid      -- PID of the target object; when None only the debug
                        message is logged.
            filename -- path of the local file whose bytes are uploaded.
            mimeType -- MIME type recorded on the datastream.
        """
        logging.debug("Adding Master datastream")
        if pid is None:
            return

        target = self.fedora.getObject(pid)
        # The datastream is created with placeholder content first and
        # then overwritten with the real file contents.
        target.addDataStream('OBJ', 'tempData',
                label=u'Archival Image',
                checksumType=u'MD5',
                mimeType=mimeType,
                controlGroup=u'M',
                logMessage=u'Added Master image')
        with open(filename, 'rb') as master_file:
            target['OBJ'].setContent(master_file)

    def add_exif(self, pid, filename):
        """
            Run exiftool on a file and store its XML output in Fedora.

            pid      -- PID of the object receiving the EXIF datastream;
                        when None the tool still runs but nothing is stored.
            filename -- path of the file handed to "exiftool -X".
        """
        logging.debug("Extracting EXIF metadata")
        exiftool = subprocess.Popen(["exiftool", "-X", filename],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        exif_xml, errors = exiftool.communicate()
        if errors:
            logging.debug(errors)

        if pid is None:
            return

        target = self.fedora.getObject(pid)
        target.addDataStream('EXIF', exif_xml,
                label=u'EXIF Metadata',
                checksumType=u'DISABLED',
                mimeType=u'text/xml',
                controlGroup=u'X',
                logMessage=u'Added EXIF Metadata')

    def add_jp2(self, pid, source):
        """
            Convert the file to JPEG 2000 format and add it
            to fedora as a datastream.

            pid    -- PID of the object receiving the JP2 datastream; when
                      None the conversion still runs but nothing is stored.
            source -- path of the image handed to ImageMagick.

            The temporary .jp2 file is removed even when the conversion
            or the upload raises.
        """
        with tempfile.NamedTemporaryFile(delete=False, suffix='.jp2') as fp:
            filename = fp.name
        try:
            logging.debug("Converting to jp2; adding to fedora")
            p = subprocess.Popen([CONVERT, source,
                    "-define", "jp2:tilewidth=256",
                    "-define", "jp2:tileheight=256", filename],
                    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            out, err = p.communicate()
            if err:
                logging.debug(err)

            if pid is not None:
                obj = self.fedora.getObject(pid)
                obj.addDataStream('JP2', 'tempData',
                        label=u'Pyramid JPEG2000 Image',
                        checksumType=u'MD5',
                        mimeType=u'image/jp2', controlGroup=u'M',
                        logMessage=u'Added JPEG 2000 Image')
                with open(filename, 'rb') as fp:
                    obj['JP2'].setContent(fp)
        finally:
            # Previously skipped on any exception, leaking the temp file.
            os.unlink(filename)

    def add_png(self, pid, source, dsid, delete=True):
        """
            Create an unscaled png image and add it to fedora as a datastream.

            pid    -- PID of the target object; when None the conversion
                      still runs but nothing is stored.
            source -- path of the image handed to ImageMagick.
            dsid   -- datastream ID to create.
            delete -- when true the temporary png is removed and None is
                      returned; otherwise its path is returned and the
                      caller must remove it.

            On any exception the temporary file is removed before the
            error propagates.
        """
        logging.debug("Creating {0} png".format(dsid))
        with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as fp:
            filename = fp.name

        keep_file = False
        try:
            p = subprocess.Popen([CONVERT, source, filename],
                        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            out, err = p.communicate()
            if err:
                logging.debug(err)

            if pid is not None:
                obj = self.fedora.getObject(pid)
                obj.addDataStream(dsid, 'tempData',
                        checksumType=u'MD5',
                        label=u"{0} png image".format(dsid),
                        mimeType=u'image/png', controlGroup=u'M')
                with open(filename, 'rb') as fp:
                    obj[dsid].setContent(fp)
            # Only hand the file to the caller once everything succeeded.
            keep_file = not delete
        finally:
            if not keep_file:
                os.unlink(filename)

        if keep_file:
            return filename

    def add_jpeg(self, pid, source, dsid, delete=True):
        """
            Create an unscaled jpeg image and add it to fedora as a datastream.

            pid    -- PID of the target object; when None the conversion
                      still runs but nothing is stored.
            source -- path of the image handed to ImageMagick.
            dsid   -- datastream ID to create.
            delete -- when true the temporary jpeg is removed and None is
                      returned; otherwise its path is returned and the
                      caller must remove it.

            On any exception the temporary file is removed before the
            error propagates.
        """
        logging.debug("Creating {0} jpeg".format(dsid))
        with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as fp:
            filename = fp.name

        keep_file = False
        try:
            p = subprocess.Popen([CONVERT, source, filename],
                        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            out, err = p.communicate()
            if err:
                logging.debug(err)

            if pid is not None:
                obj = self.fedora.getObject(pid)
                obj.addDataStream(dsid, 'tempData',
                        checksumType=u'MD5',
                        label=u"{0} jpeg image".format(dsid),
                        mimeType=u'image/jpeg', controlGroup=u'M')
                with open(filename, 'rb') as fp:
                    obj[dsid].setContent(fp)
            # Only hand the file to the caller once everything succeeded.
            keep_file = not delete
        finally:
            if not keep_file:
                os.unlink(filename)

        if keep_file:
            return filename

    def add_scaled_jpeg(self, pid, source, dsid, width, height, delete=True):
        """
            Create a scaled jpeg image and add it to fedora as a datastream.

            pid           -- PID of the target object; when None the
                             conversion still runs but nothing is stored.
            source        -- path of the image handed to ImageMagick.
            dsid          -- datastream ID to create.
            width, height -- bounding box passed to "-resize" as
                             ">WIDTHxHEIGHT".
            delete        -- when true the temporary jpeg is removed and
                             None is returned; otherwise its path is
                             returned and the caller must remove it.

            On any exception the temporary file is removed before the
            error propagates.
        """
        logging.debug("Creating {0} jpeg".format(dsid))
        with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as fp:
            filename = fp.name

        keep_file = False
        try:
            p = subprocess.Popen([CONVERT, source, "-resize",
                    ">{0}x{1}".format(width, height), filename],
                    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            out, err = p.communicate()
            if err:
                logging.debug(err)

            if pid is not None:
                obj = self.fedora.getObject(pid)
                obj.addDataStream(dsid, 'tempData',
                        checksumType=u'MD5',
                        label=u"{0} jpeg image".format(dsid),
                        mimeType=u'image/jpeg', controlGroup=u'M')
                with open(filename, 'rb') as fp:
                    obj[dsid].setContent(fp)
            # Only hand the file to the caller once everything succeeded.
            keep_file = not delete
        finally:
            if not keep_file:
                os.unlink(filename)

        if keep_file:
            return filename

    def add_cropped_jpeg(self, pid, source, dsid, width, height):
        """
            Create a scaled and cropped jpeg image and add it to fedora.

            pid           -- PID of the target object; when None the
                             conversion still runs but nothing is stored.
            source        -- path of the image handed to ImageMagick.
            dsid          -- datastream ID to create.
            width, height -- size used for both the "-resize" fill
                             geometry and the "-extent" crop.

            The temporary jpeg is removed even when the conversion or the
            upload raises.
        """
        logging.debug("Creating {0} jpeg".format(dsid))
        with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as fp:
            filename = fp.name
        try:
            p = subprocess.Popen([CONVERT, source, "-resize",
                    ">{0}x{1}^".format(width, height), "-gravity", "North",
                    "-extent", "{0}x{1}".format(width, height), filename],
                    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            out, err = p.communicate()
            if err:
                logging.debug(err)

            if pid is not None:
                obj = self.fedora.getObject(pid)
                obj.addDataStream(dsid, 'tempData',
                        checksumType=u'MD5',
                        label=u'{0} jpeg image'.format(dsid),
                        mimeType=u'image/jpeg', controlGroup=u'M',
                        logMessage=u'Added {0} image'.format(dsid))
                with open(filename, 'rb') as fp:
                    obj[dsid].setContent(fp)
        finally:
            # Previously skipped on any exception, leaking the temp file.
            os.unlink(filename)

    def ingest(self, filename):
        """
            Placeholder ingest step: does nothing and returns None.

            processMETS() calls this with the METS filename and records
            a log entry only when the result is not None.
        """
        return None

## image.py
import traceback
import time
import sys
import os
import urllib2
import re
import tempfile
import logging
from contextlib import contextmanager

from fcrepo.utils import NS
from lxml import etree

from . import batchingester
from . import xacml


class ImageIngester(batchingester.BatchIngester):

    vra2mods = None
    mods2dc = None

    def __init__(self, config, namespace, dry_run=False):
        """
            Initialise the base ingester, clear the per-item state and
            compile the XSLT stylesheets named in the [XSLT] config section.

            A stylesheet whose file does not exist is skipped, leaving the
            class-level default (None) for that attribute.
        """
        super(ImageIngester, self).__init__(config, namespace, dry_run)

        def install(attribute, stylesheet_path):
            # Compile and store the stylesheet only when the file exists.
            if os.path.exists(stylesheet_path):
                setattr(self, attribute,
                        etree.XSLT(etree.parse(stylesheet_path)))

        mods2dc_path = self.config.get("XSLT", "mods2dc")
        self.reset()
        install("mods2dc", mods2dc_path)
        install("vra2mods", self.config.get("XSLT", "vra2mods"))

    def extract_metadata(self):
        """
            Extract metadata from the METS record.

            Reads self.dom and fills in self.mods_xml, self.vra_xml,
            self.title, self.description, self.dc_xml and
            self.archive_file.  Title, description and DC are only set
            when MODS is present or could be derived from VRA.

            Returns 1 on success and 0 when the record does not describe
            exactly one file with a location element.
        """
        mods_ns = {'mods': 'http://www.loc.gov/mods/v3'}

        # Extract the MODS datastream
        mods = self.dom.xpath("//mods:mods", namespaces=mods_ns)
        self.mods_xml = etree.tostring(mods[0],
                pretty_print=True) if mods else None

        # Extract the VRA Core datastream if it exists.
        vra = self.dom.xpath("//vra:vra",
                namespaces={'vra': 'http://www.vraweb.org/vracore4.htm'})
        self.vra_xml = etree.tostring(vra[0],
                pretty_print=True) if vra else None

        # Convert VRA to MODS if it doesn't already exist.  This needs an
        # actual VRA record; without the "vra" check a record with neither
        # MODS nor VRA raised IndexError on vra[0].
        if self.mods_xml is None and self.vra2mods and vra:
            self.mods_xml = etree.tostring(self.vra2mods(vra[0]),
                    pretty_print=True)

        # Extract title and description from MODS
        if self.mods_xml:
            mods = etree.fromstring(self.mods_xml)
            node = mods.xpath("//mods:title/text()", namespaces=mods_ns)
            self.title = re.sub(r"\s+", " ", node[0].strip()) \
                    if node else u"Untitled"

            # Shorten over-long titles step by step: drop everything from
            # the first colon, then from the first semicolon, and finally
            # cut to 255 characters and drop the trailing partial word.
            if len(self.title) > 256:
                self.title = re.sub(r":.*", "", self.title)
            if len(self.title) > 256:
                self.title = re.sub(r";.*", "", self.title)
            if len(self.title) > 256:
                self.title = re.sub(r"\s[^\s]*$", "", self.title[:255])

            node = mods.xpath("//mods:abstract/text()", namespaces=mods_ns)
            self.description = re.sub(r"\s+", " ", node[0].strip()) \
                    if node else ""

            if self.mods2dc:
                self.dc_xml = etree.tostring(self.mods2dc(mods),
                        pretty_print=True)

        else:
            logging.warning("No MODS metadata available")

        # Extract file reference
        files = self.dom.xpath("//mets:fileSec//mets:file",
                namespaces={'mets': 'http://www.loc.gov/METS/'})
        if len(files) != 1:
            logging.error("Can only process METS record with a single file!")
            return 0

        file_node = files[0]
        if not len(file_node):
            # No child element to read LOCTYPE / xlink:href from.
            logging.error("METS file element has no location element!")
            return 0

        location = file_node[0]
        self.archive_file = {
                'mimeType': file_node.get("MIMETYPE"),
                'locType': location.get("LOCTYPE"),
                'uri': location.get(
                    "{http://www.w3.org/1999/xlink}href")
                }
        return 1

    def check_files(self):
        """
            Verify that the archive file is accessible.

            Returns 1 when the archive URI could be opened.  Returns 0
            when no archive file is set, or when the URI cannot be opened
            during a dry run.  Outside a dry run the error from opening
            the URI is re-raised after a warning is logged.
        """
        if not self.archive_file:
            return 0

        uri = self.archive_file["uri"]
        try:
            response = urllib2.urlopen(uri)
        except Exception:
            # "except Exception" rather than a bare except, so that
            # KeyboardInterrupt/SystemExit are never swallowed in a dry run.
            logging.warning("Could not access uri: {0}".format(uri))
            if self.dry_run:
                return 0
            raise

        # Only reachability is being tested; release the connection.
        response.close()
        return 1

    def reset(self):
        """
            Reset every per-item attribute to None so that no state from
            a previously processed record carries over.
        """
        per_item_attributes = (
            "title",
            "description",
            "archive_file",
            "dom",
            "mods_xml",
            "vra_xml",
            "dc_xml",
        )
        for attribute in per_item_attributes:
            setattr(self, attribute, None)

    def digital_object(self):
        """
            Create the main Fedora object for the current record.

            In a dry run nothing is created and None is returned.
            Otherwise a new PID is requested, the object is created with
            the record title as its label, and the RELS-EXT and POLICY
            datastreams are added.

            Returns the new PID, or None in a dry run.
        """
        # Create the fedora object
        pid = None

        # Add a POLICY datastream for this object
        policy = xacml.XACML()
        policy.deny('deny-datastreams',
                resources={
                    'urn:fedora:names:fedora:2.1:resource:datastream:id': [
                        'POLICY', 'RELS-EXT']},
                actions={
                    'urn:fedora:names:fedora:2.1:action:id': [
                        'urn:fedora:names:fedora:2.1:action:id-getDatastreamDissemination']})
        policy.deny('deny-apim',
                actions={
                    'urn:fedora:names:fedora:2.1:action:api': [
                        'urn:fedora:names:fedora:2.1:action:api-m']},
                groups=['permitted', 'group', 'names'])
        policy.deny('deny-apia',
                actions={
                    'urn:fedora:names:fedora:2.1:action:api': [
                        'urn:fedora:names:fedora:2.1:action:api-a'],
                    'urn:fedora:names:fedora:2.1:action:id': [
                        'urn:fedora:names:fedora:2.1:action:id-listObjectInResourceIndexResults']},
                groups=['permitted', 'group', 'names'])
        policy.permit('everything-else')

        if self.dry_run:
            logging.info("Processing new object")
        else:
            pid = self.fedora.getNextPID(unicode(self.namespace))
            logging.info("Creating object with PID: {0}".format(pid))

            # self.title stays None when the record carried no MODS
            # (extract_metadata only warns in that case), so fall back to
            # the same "Untitled" default it uses for a missing title
            # instead of failing on None.encode().
            title = self.title if self.title is not None else u"Untitled"
            obj = self.fedora.createObject(pid,
                    label=unicode(title.encode("ascii",
                            errors="xmlcharrefreplace")))
            obj.ownerId = self.fedoraUser

            self.add_rels_ext(pid, ['my:contentModel1',
                                    'my:contentModel2'])

            obj.addDataStream('POLICY', policy.policy(),
                    label=u'XACML Policy',
                    checksumType=u'DISABLED',
                    logMessage=u'Added POLICY datastream')
        return pid

    def vra_object(self, pid):
        """
            Create the object holding the VRA metadata for pid.

            When pid is None no Fedora object is created.  In either case
            self.add_vra() is called with the new PID (or None) and pid.

            Returns the PID of the new object, or None.
        """
        vra_pid = None
        if pid is not None:
            vra_pid = self.fedora.getNextPID(unicode(self.namespace))
            vra_obj = self.fedora.createObject(
                    vra_pid, label=u"VRA Metadata for {0}".format(pid))
            vra_obj.ownerId = self.fedoraUser
            vra_obj.addDataStream('RELS-EXT',
                    label=u"RDF Statements about this object",
                    checksumType=u'DISABLED')

            relations = vra_obj['RELS-EXT']
            described = {'type': u'uri',
                         'value': u'info:fedora/{0}'.format(pid)}
            relations[NS.fedora.isMetadataFor].append(described)
            content_model = {'type': u'uri',
                             'value': u'info:fedora/my:vraContentModel'}
            relations[NS.fedora_model.hasModel].append(content_model)
            relations.setContent()

            # Recorded so processMETS() can log it against the main PID.
            self.derivatives.append(vra_pid)

        self.add_vra(vra_pid, pid)
        return vra_pid

    def master_object(self, pid, filename):
        """
            Create the object holding the master (archival) file for pid.

            An XACML policy is always built.  When pid is None no Fedora
            object is created.  In either case self.add_master() is called
            with the new PID (or None), the local filename and the MIME
            type from self.archive_file.

            Returns the PID of the new object, or None.
        """
        action_api = 'urn:fedora:names:fedora:2.1:action:api'
        action_id = 'urn:fedora:names:fedora:2.1:action:id'
        datastream_id = 'urn:fedora:names:fedora:2.1:resource:datastream:id'

        # Build the POLICY datastream content for this object
        rules = xacml.XACML()
        rules.deny('deny-datastreams',
                resources={datastream_id: ['POLICY', 'RELS-EXT', 'OBJ']},
                actions={action_id: [
                    'urn:fedora:names:fedora:2.1:action:id-getDatastreamDissemination']})
        rules.deny('deny-apim',
                actions={action_api: [
                    'urn:fedora:names:fedora:2.1:action:api-m']})
        rules.deny('deny-apia',
                actions={
                    action_api: [
                        'urn:fedora:names:fedora:2.1:action:api-a'],
                    action_id: [
                        'urn:fedora:names:fedora:2.1:action:id-listObjectInResourceIndexResults']},
                groups=['permitted', 'group', 'names'])
        rules.permit('everything-else')

        master_pid = None
        if pid is not None:
            master_pid = self.fedora.getNextPID(unicode(self.namespace))
            master = self.fedora.createObject(
                    master_pid, label=u"Master Object for {0}".format(pid))
            master.ownerId = self.fedoraUser
            master.addDataStream('RELS-EXT',
                    checksumType=u'DISABLED',
                    label=u'RDF Statements about this object')

            relations = master['RELS-EXT']
            parent_ref = {'type': u'uri',
                          'value': u'info:fedora/{0}'.format(pid)}
            relations[NS.fedora.isPartOf].append(parent_ref)
            content_model = {'type': u'uri',
                             'value': u'info:fedora/my:archivalContentModel'}
            relations[NS.fedora_model.hasModel].append(content_model)
            relations.setContent()

            master.addDataStream('POLICY', rules.policy(),
                    label=u'XACML Policy',
                    checksumType=u'DISABLED',
                    logMessage=u'Added POLICY datastream')
            self.derivatives.append(master_pid)

        mime_type = unicode(self.archive_file['mimeType'])
        self.add_master(master_pid, filename, mime_type)

        return master_pid

    def exif_object(self, pid, filename):
        """
            Create the object holding the EXIF metadata extracted from
            filename and relate it to pid.

            When pid is None no Fedora object is created.  In either case
            self.add_exif() is called with the new PID (or None) and the
            filename.

            Returns the PID of the new object, or None.
        """
        def fedora_uri(target):
            # A fresh relationship entry for every append.
            return {'type': u'uri', 'value': u'info:fedora/{0}'.format(target)}

        exif_pid = None
        if pid is not None:
            exif_pid = self.fedora.getNextPID(unicode(self.namespace))
            exif = self.fedora.createObject(
                    exif_pid, label=u"EXIF Metadata for {0}".format(pid))
            exif.ownerId = self.fedoraUser
            exif.addDataStream('RELS-EXT',
                    label=u'RDF Statements about this object',
                    checksumType=u'DISABLED')

            relations = exif['RELS-EXT']
            relations[NS.fedora.isMetadataFor].append(fedora_uri(pid))
            relations[NS.fedora_model.hasModel].append(
                    fedora_uri('my:exifContentModel'))
            relations[NS.fedora.isDerivationOf].append(fedora_uri(pid))
            relations.setContent()

            self.derivatives.append(exif_pid)

        self.add_exif(exif_pid, filename)
        return exif_pid

    def jp2_object(self, pid, original, filename):
        """
            Create the object holding the JPEG 2000 derivative.

            pid      -- PID the new object is recorded as a part of.
            original -- PID the new object is recorded as a derivation of.
            filename -- local image path passed on to self.add_jp2().

            An XACML policy is always built.  When pid is None no Fedora
            object is created, but self.add_jp2() is still called with
            None as the PID.

            Returns the PID of the new object, or None.
        """
        action_api = 'urn:fedora:names:fedora:2.1:action:api'
        action_id = 'urn:fedora:names:fedora:2.1:action:id'
        datastream_id = 'urn:fedora:names:fedora:2.1:resource:datastream:id'

        # Build the POLICY datastream content for this object
        rules = xacml.XACML()
        rules.deny('deny-datastreams',
                resources={datastream_id: ['POLICY', 'RELS-EXT']},
                actions={action_id: [
                    'urn:fedora:names:fedora:2.1:action:id-getDatastreamDissemination']})
        rules.deny('deny-apim',
                actions={action_api: [
                    'urn:fedora:names:fedora:2.1:action:api-m']},
                users=['librarian'])
        rules.deny('deny-apia',
                actions={
                    action_api: [
                        'urn:fedora:names:fedora:2.1:action:api-a'],
                    action_id: [
                        'urn:fedora:names:fedora:2.1:action:id-listObjectInResourceIndexResults']},
                groups=['permitted', 'group', 'names'])
        rules.permit('everything-else')

        jp2_pid = None
        if pid is not None:
            jp2_pid = self.fedora.getNextPID(unicode(self.namespace))
            jp2 = self.fedora.createObject(
                    jp2_pid, label=u"JP2 Object for {0}".format(pid))
            jp2.ownerId = self.fedoraUser
            jp2.addDataStream('RELS-EXT',
                    label=u'RDF Statements about this object',
                    checksumType=u'DISABLED')

            relations = jp2['RELS-EXT']
            parent_ref = {'type': u'uri',
                          'value': u'info:fedora/{0}'.format(pid)}
            relations[NS.fedora.isPartOf].append(parent_ref)
            source_ref = {'type': u'uri',
                          'value': u'info:fedora/{0}'.format(original)}
            relations[NS.fedora.isDerivationOf].append(source_ref)
            content_model = {'type': u'uri',
                             'value': u'info:fedora/my:jp2ContentModel'}
            relations[NS.fedora_model.hasModel].append(content_model)
            relations.setContent()

            jp2.addDataStream('POLICY', rules.policy(),
                    label=u'XACML Policy',
                    checksumType=u'DISABLED',
                    logMessage=u'Added POLICY datastream')
            self.derivatives.append(jp2_pid)

        self.add_jp2(jp2_pid, filename)
        return jp2_pid

    def jpeg_object(self, pid, original, filename):
        """
            Create the object holding the JPEG derivatives.

            pid      -- PID the new object is recorded as a part of; it
                        also receives the SMALL_SIZE, TN and ICON
                        datastreams.
            original -- PID the new object is recorded as a derivation of.
            filename -- local path of the source image.

            When pid is None no Fedora object is created, but the image
            conversions still run with None as the PID.  The intermediate
            medium-size file is removed even when a later step raises.

            Returns the PID of the new object, or None.
        """
        pidJPEG = None

        # Add a POLICY datastream for this object
        policy = xacml.XACML()
        policy.deny('deny-datastreams',
                resources={
                    'urn:fedora:names:fedora:2.1:resource:datastream:id': [
                        'POLICY', 'RELS-EXT']},
                actions={
                    'urn:fedora:names:fedora:2.1:action:id': [
                        'urn:fedora:names:fedora:2.1:action:id-getDatastreamDissemination']})
        policy.deny('deny-apim',
                actions={
                    'urn:fedora:names:fedora:2.1:action:api': [
                        'urn:fedora:names:fedora:2.1:action:api-m']},
                users=['librarian'])
        policy.deny('deny-apia',
                actions={
                    'urn:fedora:names:fedora:2.1:action:api': [
                        'urn:fedora:names:fedora:2.1:action:api-a'],
                    'urn:fedora:names:fedora:2.1:action:id': [
                        'urn:fedora:names:fedora:2.1:action:id-listObjectInResourceIndexResults']},
                groups=['permitted', 'user', 'groups'])
        policy.permit('everything-else')

        if pid is not None:
            pidJPEG = self.fedora.getNextPID(unicode(self.namespace))
            obj = self.fedora.createObject(pidJPEG,
                        label=u'JPEG Object for {0}'.format(pid))
            obj.ownerId = self.fedoraUser
            obj.addDataStream('RELS-EXT',
                    checksumType=u'DISABLED',
                    label=u'RDF Statements about this object')
            ds = obj['RELS-EXT']
            ds[NS.fedora.isPartOf].append({
                    'value': u'info:fedora/{0}'.format(pid),
                    'type': u'uri'
                })
            ds[NS.fedora_model.hasModel].append({
                    'value': u'info:fedora/my:jpegContentModel',
                    'type': u'uri'
                })
            ds[NS.fedora.isDerivationOf].append({
                    'value': u'info:fedora/{0}'.format(original),
                    'type': u'uri'
                })
            ds.setContent()

            obj.addDataStream('POLICY', policy.policy(),
                    checksumType=u'DISABLED',
                    label=u'XACML Policy',
                    logMessage=u'Added POLICY datastream')

            self.derivatives.append(pidJPEG)

        self.add_jpeg(pidJPEG, filename, "FULL_SIZE")

        # Use the medium image for generating the smaller items.
        medium = self.add_scaled_jpeg(pidJPEG, filename, "MEDIUM_SIZE",
                640, 480, delete=False)
        try:
            # Add these to the main object!
            self.add_scaled_jpeg(pid, medium, "SMALL_SIZE", 200, 150)
            self.add_scaled_jpeg(pid, medium, "TN", 88, 66)
            self.add_cropped_jpeg(pid, medium, "ICON", 32, 24)
        finally:
            # The medium file was kept on purpose (delete=False) and is
            # ours to remove, even when a step above raises.
            os.unlink(medium)
        return pidJPEG


    def ingest(self, filename):
        """
            Ingest the METS record stored in filename.

            Parses the record, extracts its metadata, verifies the
            archive file and then creates the main object together with
            its VRA, master, EXIF, JP2 and JPEG objects.

            Returns the PID of the main object.  The result is None in a
            dry run, when metadata extraction or the file check fails, or
            when an error occurs before the main object exists.  Errors
            raised during object creation are logged, not propagated.
        """
        self.reset()
        self.dom = etree.parse(filename)

        if not self.extract_metadata():
            return None
        if not self.check_files():
            return None

        pid = None
        pids = []
        try:
            # Get the archive file
            with self.local_copy() as local_file:

                # Create the main object
                pid = self.digital_object()
                pids.append(pid)

                # Create a VRA Core object which
                # 'isMetadataFor' the main object
                vra_pid = self.vra_object(pid)
                pids.append(vra_pid)

                # Create an object part for the master image
                master_pid = self.master_object(pid, local_file)
                # Previously vra_pid was appended here a second time, so
                # the master object never showed up in the clean-up list.
                pids.append(master_pid)

                # Create an object for the EXIF metadata
                # which 'isMetadataFor' the Master file
                exif_pid = self.exif_object(master_pid, local_file)
                pids.append(exif_pid)

                # Create a JPEG 2000 object
                jp2_pid = self.jp2_object(pid, master_pid, local_file)
                pids.append(jp2_pid)

                # Create a JPEG object
                jpeg_pid = self.jpeg_object(pid, master_pid, local_file)
                pids.append(jpeg_pid)

                # Add the MODS datastream to the main digital object.
                # This is done last so that the VRA object has already
                # been added when fedoragsearch sees the MODS record added.
                self.add_mods(pid)

        except Exception:
            if pid:
                logging.error("Error processing PID: {0}".format(pid))
                # Skip entries that never received a PID so that join()
                # cannot fail inside the error handler.
                created = [p for p in pids if p is not None]
                if created:
                    logging.error("May need to clean up the following pids: {0}".format(", ".join(created)))
            else:
                logging.error("Error processing file")
            traceback.print_exc(6)

        return pid

    @contextmanager
    def local_copy(self):
        """
            Store the master file locally for easy retrieval.

            Downloads self.archive_file['uri'] into a temporary file that
            keeps the URI's extension and yields the temporary file name.
            The file is always removed afterwards.  Errors from the
            download or from the with-block are logged and re-raised so
            the caller can tell that the work failed.
        """
        logging.debug("Saving file locally.")
        uri = self.archive_file['uri']
        # splitext() returns (root, ext); only ext is a valid suffix.
        ext = os.path.splitext(uri)[1]
        CHUNK = 16 * 1024

        fp = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
        filename = fp.name
        try:
            with fp:
                req = urllib2.urlopen(uri)
                try:
                    while True:
                        chunk = req.read(CHUNK)
                        if not chunk:
                            break
                        fp.write(chunk)
                finally:
                    req.close()
            yield filename
        except Exception as e:
            logging.error(e)
            raise
        finally:
            os.unlink(filename)

    def add_mods(self, pid):
        """
            Add the MODS datastream

            Points the record's uri identifier and its 'primary display'
            location url at this object, stores the updated XML back on
            self.mods_xml and attaches it to the object as the MODS
            datastream.  Does nothing without MODS XML or without a pid.
        """
        if not self.mods_xml or pid is None:
            return

        nsmap = {"mods": "http://www.loc.gov/mods/v3"}
        tree = etree.fromstring(self.mods_xml,
                etree.XMLParser(remove_blank_text=True))

        # Identifier: update the existing uri identifier, otherwise insert
        # a new one after the last identifier (or at the end of the record).
        uri_nodes = tree.xpath("/mods:mods/mods:identifier[@type='uri']",
                    namespaces=nsmap)
        if uri_nodes:
            uri_nodes[0].text = u"https://my.repository.url/{0}".format(pid)
        else:
            new_ident = etree.Element(u"{http://www.loc.gov/mods/v3}identifier", type=u"uri")
            new_ident.text = u"https://my.repository.url/{0}".format(pid)
            siblings = tree.xpath("/mods:mods/mods:identifier", namespaces=nsmap)
            if siblings:
                siblings[-1].addnext(new_ident)
            else:
                tree.append(new_ident)

        # Location: make sure one exists before looking for the display url.
        locations = tree.xpath("/mods:mods/mods:location", namespaces=nsmap)
        if not locations:
            locations = [etree.SubElement(tree, u"{http://www.loc.gov/mods/v3}location")]

        display = tree.xpath("/mods:mods/mods:location/mods:url[@usage='primary display']", namespaces=nsmap)
        if display:
            display[0].text = u"http://my.repository.url/{0}".format(pid)
        else:
            new_url = etree.Element(u"{http://www.loc.gov/mods/v3}url", usage="primary display")
            new_url.text = u"http://my.repository.url/{0}".format(pid)
            locations[0].append(new_url)

        self.mods_xml = etree.tostring(tree, pretty_print=True)

        target = self.fedora.getObject(pid)
        target.addDataStream('MODS', self.mods_xml,
                label=u'MODS Metadata', mimeType=u'text/xml',
                controlGroup=u'X',
                checksumType=u'DISABLED',
                logMessage=u'Added MODS datastream')

    def add_vra(self, pid, parentPid):
        """
            Add the VRA Core datastream, if available

            When the record has a vra:image element, its href is set to
            the parent object's URL and self.vra_xml is re-serialised.
            The XML is attached to object pid as the VRA datastream unless
            pid is None.
        """
        if not self.vra_xml:
            return

        # Add the canonical URL for this object
        root = etree.fromstring(self.vra_xml)
        images = root.xpath("//vra:image",
                namespaces={"vra": "http://www.vraweb.org/vracore4.htm"})
        if images:
            images[0].set("href",
                    "https://my.repository.url/{0}".format(parentPid))
            self.vra_xml = etree.tostring(root, pretty_print=True)

        if pid is None:
            return

        vra_obj = self.fedora.getObject(pid)
        vra_obj.addDataStream('VRA', self.vra_xml,
                label=u'VRA Core Metadata',
                mimeType=u'text/xml', controlGroup=u'X',
                checksumType=u'DISABLED',
                logMessage=u'Added VRA Core Metadata')


## ingest.py
import os
import argparse
import logging
from image import ImageIngester

def main():
    """
        Command-line entry point: ingest every file whose name ends in
        'mets.xml' from the --data directory into the given collection.
    """
    # Set up the argument parser.
    parser = argparse.ArgumentParser(
                description='Ingest objects into fedora.')
    parser.add_argument("--namespace", required=True,
                choices=["my", "accepted", "name", "spaces"],
                help="The namespace to be used by these objects.")
    parser.add_argument("--collection", required=True,
                help="The name of the collection")
    parser.add_argument("--data", required=True,
                help="The location of the data files to ingest.")
    parser.add_argument("--parent", default="collection:root",
                help="This will be the parent of this collection")
    parser.add_argument("--config", default="my.cfg",
                help="A configuration file containing " +
                     "the fedora connection information.")
    parser.add_argument("--collectionTN", default="TN.png",
                help="The location of a thumbnail image for the collection")
    parser.add_argument("--dryrun", default=False, action='store_true')

    args = parser.parse_args()

    ingester = ImageIngester(args.config, args.namespace, dry_run=args.dryrun)
    # The ingester exposes add_collection(); there is no collection() method.
    ingester.add_collection(args.collection, u"My Collection Name",
            parent=args.parent, tn=args.collectionTN)

    # List the directory once so the total and the loop always agree;
    # sorting makes the processing order reproducible.
    mets_files = sorted(name for name in os.listdir(args.data)
                        if name.endswith("mets.xml"))
    total = len(mets_files)
    for i, name in enumerate(mets_files, 1):
        path = os.path.join(args.data, name)
        logging.info("{0}/{1} Processing {2}".format(i, total, path))
        ingester.processMETS(path)


if __name__ == '__main__':
    main()
	import subprocess
	import tempfile
	import MySQLdb
	import ConfigParser
	import sys
	import os
	import logging
	from fcrepo.utils import NS
	from fcrepo.connection import Connection, FedoraConnectionException
	from fcrepo.client import FedoraClient
	from lxml.builder import ElementMaker

	from . import xacml

	CONVERT = "/opt/ImageMagick-6.7/bin/convert"

	class BatchIngester(object):

	def __init__(self, config, namespace, dry_run=False):
	    """
	        Create an instance of BatchIngester and establish
	        a connection to the Fedora repository.

	        config is the path of the configuration file; its IngestLog,
	        Logging and Fedora sections are read here.  namespace must be
	        alphanumeric, otherwise an Exception is raised.  dry_run is
	        stored on the instance.
	    """
	    if not namespace.isalnum():
	        raise Exception("Invalid namespace: {0}".format(namespace))
	    self.namespace = namespace
	    self.collections = []
	    self.dry_run = dry_run

	    # Read the configuration file
	    conf = ConfigParser.ConfigParser()
	    conf.read(config)

	    # Connect to the logging database
	    self.db = MySQLdb.connect(host=conf.get("IngestLog", "hostname"),
	                              db=conf.get("IngestLog", "database"),
	                              user=conf.get("IngestLog", "username"),
	                              passwd=conf.get("IngestLog", "password"))
	    self.db.set_character_set("UTF8")

	    # Set up logging.  %(message)s is the fully formatted text;
	    # %(msg)s would print the raw format string without its arguments.
	    levels = {'DEBUG': logging.DEBUG,
	              'INFO': logging.INFO,
	              'WARNING': logging.WARNING,
	              'ERROR': logging.ERROR,
	              'CRITICAL': logging.CRITICAL}
	    logging.basicConfig(
	        format="%(asctime)s (%(levelname)s): %(message)s",
	        datefmt="%b %d %I:%M:%S %p",
	        filename=conf.get('Logging', 'logfile'),
	        level=levels[conf.get('Logging', 'loglevel')])

	    # Connect to Fedora
	    self.fedoraUser = unicode(conf.get('Fedora', 'username'))
	    connection = Connection(conf.get('Fedora', 'url'),
	                            username=self.fedoraUser,
	                            password=conf.get('Fedora', 'password'))
	    self.fedora = FedoraClient(connection)
	    self.config = conf

	def add_collection(self, collection, label, parent="collection:root",
	                   tn="TN.png"):
	    """
	        Add a collection to fedora if it doesn't already exist.
	        This also sets the current "parent" of all ingested items.

	        collection is either a full PID ("namespace:name") or a bare
	        alphanumeric name, which gets self.namespace as prefix; anything
	        else raises an Exception.  label is used for a newly created
	        collection object, parent is the collection it becomes a member
	        of and tn is the path of an image used for its TN datastream.
	    """
	    pid = None
	    parts = collection.split(':')
	    if len(parts) == 2:
	        pid = collection
	    elif len(parts) == 1 and collection.isalnum():
	        pid = self.namespace + ':' + collection
	    if not pid:
	        raise Exception("Invalid collection name: '{0}'".format(collection))

	    if self.dry_run:
	        pid = None

	    logging.debug(
	        "Checking existence of parent: '{0}'".format(parent))

	    self.fedora.getObject(parent)

	    logging.debug(
	        "Checking existence of collection: '{0}'".format(pid))

	    try:
	        self.fedora.getObject(pid)
	    except FedoraConnectionException as ex:
	        if ex.httpcode in [404]:
	            # Collection does not exist, so create it now.
	            logging.info("Creating object: '{0}'".format(pid))

	            access = xacml.XACML()
	            access.deny('deny-datastreams',
	                resources={'urn:fedora:names:fedora:2.1:resource:datastream:id': [
	                    'POLICY', 'RELS-EXT']},
	                actions={'urn:fedora:names:fedora:2.1:action:id': [
	                    'urn:fedora:names:fedora:2.1:action:id-getDatastreamDissemination']})
	            access.deny('deny-apim',
	                actions={"urn:fedora:names:fedora:2.1:action:api": [
	                    "urn:fedora:names:fedora:2.1:action:api-m"]},
	                groups=["librarian"])
	            access.deny('deny-apia',
	                actions={
	                    'urn:fedora:names:fedora:2.1:action:api': [
	                        'urn:fedora:names:fedora:2.1:action:api-a'],
	                    'urn:fedora:names:fedora:2.1:action:id': [
	                        'urn:fedora:names:fedora:2.1:action:id-listObjectInResourceIndexResults']},
	                groups=['authenticated user', 'librarian', 'administrator'])
	            access.permit('permit-everything-else')

	            try:
	                if pid is not None:
	                    obj = self.fedora.createObject(pid, label=label)
	                    obj.ownerId = self.fedoraUser
	                    obj.addDataStream('POLICY', access.policy(),
	                        checksumType=u'DISABLED',
	                        label=u'XACML Policy',
	                        logMessage=u'Added POLICY datastream')

	                    obj.addDataStream('RELS-EXT',
	                        label=u'Fedora object-to-object relationships',
	                        checksumType=u'DISABLED')
	                    ds = obj['RELS-EXT']
	                    ds[NS.fedora.isMemberOfCollection].append({
	                        'value': u'info:fedora/{0}'.format(parent),
	                        'type': u'uri'
	                    })
	                    ds[NS.fedora_model.hasModel].append({
	                        'value': u'info:fedora/my:collectionCModel',
	                        'type': u'uri'
	                    })
	                    ds.setContent()
	            except Exception:
	                # Creation stays best-effort, but the failure is recorded
	                # instead of being silently discarded.
	                logging.exception(
	                    "Could not create collection object: {0}".format(pid))

	            if os.path.exists(tn):
	                self.add_png(pid, tn, "TN")
	            else:
	                logging.warning(
	                    "Missing TN datastream for collection object")

	        else:
	            logging.error(
	                "Could not access collection object: {0}".format(pid))

	    # Set the current collection
	    self.collections.append(pid)

	def processMETS(self, filename):
	    """
	        Process a METS file if it hasn't already been ingested.
	        Update the database after processing.

	        Returns 0 when the file is already present in the log table and
	        None otherwise.  After a successful ingest the main PID and all
	        derivative PIDs are recorded and the transaction is committed.
	    """
	    cursor = self.db.cursor()
	    try:
	        # Query parameters must be a sequence, hence the 1-tuple.
	        cursor.execute("""
	            SELECT pid, indexed
	            FROM log
	            WHERE filename=%s""", (filename,))
	        if cursor.fetchone():
	            logging.info("{0} has already been processed".format(filename))
	            return 0

	        self.derivatives = []
	        pid = self.ingest(filename)
	        if pid is not None:
	            cursor.execute("""
	                INSERT INTO log (pid, filename, indexed)
	                VALUES (%s, %s, NOW())""", (pid, filename))
	            for d in self.derivatives:
	                cursor.execute("""
	                    INSERT INTO deriv (parent, pid)
	                    VALUES (%s, %s)""", (pid, d))
	            # Without a commit the rows are lost on transactional tables.
	            self.db.commit()
	    finally:
	        cursor.close()

	def add_rels_ext(self, pid, models=()):
	    """
	        Add the RELS_EXT datastream.

	        The object becomes a member of every collection currently held
	        in self.collections and gets one hasModel relationship for each
	        entry of models.  Nothing is written when pid is None.
	    """
	    logging.debug("Adding RELS_EXT datastream")
	    if pid is None:
	        return

	    obj = self.fedora.getObject(pid)
	    obj.addDataStream('RELS-EXT',
	        checksumType=u'DISABLED',
	        label=u"Fedora object-to-object relationships")
	    ds = obj['RELS-EXT']
	    for c in self.collections:
	        ds[NS.fedora.isMemberOfCollection].append({
	            'value': u'info:fedora/{0}'.format(c),
	            'type': u'uri'
	        })
	    for m in models:
	        ds[NS.fedora_model.hasModel].append({
	            'value': u'info:fedora/{0}'.format(m),
	            'type': u'uri'
	        })
	    ds.setContent()

	def add_master(self, pid, filename, mimeType):
	    """
	        Add the Master file as a datastream.

	        Creates the managed OBJ datastream on object pid with the given
	        mimeType and fills it with the content of filename.  Nothing is
	        uploaded when pid is None.
	    """
	    logging.debug("Adding Master datastream")
	    if pid is None:
	        return

	    obj = self.fedora.getObject(pid)
	    # The datastream is created with placeholder content first and
	    # then replaced by the real file content.
	    obj.addDataStream('OBJ', 'tempData', label=u'Archival Image',
	        checksumType=u'MD5',
	        mimeType=mimeType, controlGroup=u'M',
	        logMessage=u'Added Master image')
	    with open(filename, 'rb') as fp:
	        obj['OBJ'].setContent(fp)

	def add_exif(self, pid, filename):
	    """
	        Extract the EXIF metadata and add it to Fedora.

	        Runs "exiftool -X" on filename and stores its standard output
	        as the EXIF datastream of object pid.  Nothing is uploaded when
	        pid is None.
	    """
	    logging.debug("Extracting EXIF metadata")
	    p = subprocess.Popen(["exiftool", "-X", filename],
	        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
	    out, err = p.communicate()
	    if err:
	        logging.debug(err)
	    if p.returncode != 0:
	        # Make a failed extraction visible; the upload stays best-effort.
	        logging.warning(
	            "exiftool exited with status {0}".format(p.returncode))

	    if pid is not None:
	        obj = self.fedora.getObject(pid)

	        obj.addDataStream('EXIF', out, label=u'EXIF Metadata',
	            checksumType=u'DISABLED',
	            mimeType=u'text/xml', controlGroup=u'X',
	            logMessage=u'Added EXIF Metadata')

	def add_jp2(self, pid, source):
	    """
	        Convert the file to JPEG 2000 format and add it
	        to fedora as a datastream.

	        source is converted with CONVERT into a temporary .jp2 file,
	        which is uploaded as the JP2 datastream of object pid (unless
	        pid is None) and removed afterwards, also when a step fails.
	    """
	    with tempfile.NamedTemporaryFile(delete=False, suffix='.jp2') as fp:
	        filename = fp.name
	    try:
	        logging.debug("Converting to jp2; adding to fedora")
	        p = subprocess.Popen([CONVERT, source, "-define", "jp2:tilewidth=256",
	            "-define", "jp2:tileheight=256", filename],
	            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
	        out, err = p.communicate()
	        if err:
	            logging.debug(err)

	        if pid is not None:
	            obj = self.fedora.getObject(pid)
	            obj.addDataStream('JP2', 'tempData', label=u'Pyramid JPEG2000 Image',
	                checksumType=u'MD5',
	                mimeType=u'image/jp2', controlGroup=u'M',
	                logMessage=u'Added JPEG 2000 Image')
	            with open(filename, 'rb') as fp:
	                obj['JP2'].setContent(fp)
	    finally:
	        os.unlink(filename)

	def add_png(self, pid, source, dsid, delete=True):
	    """
	        Create an unscaled png image and add it to fedora as a datastream.

	        source is converted with CONVERT into a temporary .png file that
	        is uploaded as datastream dsid of object pid (unless pid is
	        None).  With delete=True the temporary file is removed and None
	        is returned; otherwise its name is returned and the caller owns
	        the file.  The file is also removed when a step fails.
	    """
	    logging.debug("Creating {0} png".format(dsid))
	    with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as fp:
	        filename = fp.name
	    try:
	        p = subprocess.Popen([CONVERT, source, filename],
	            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
	        out, err = p.communicate()
	        if err:
	            logging.debug(err)

	        if pid is not None:
	            obj = self.fedora.getObject(pid)
	            obj.addDataStream(dsid, 'tempData',
	                checksumType=u'MD5',
	                label=u"{0} png image".format(dsid),
	                mimeType=u'image/png', controlGroup=u'M')
	            with open(filename, 'rb') as fp:
	                obj[dsid].setContent(fp)
	    except Exception:
	        # Nobody receives the file name on failure, so clean up here.
	        os.unlink(filename)
	        raise

	    if delete:
	        os.unlink(filename)
	    else:
	        return filename

	def add_jpeg(self, pid, source, dsid, delete=True):
	    """
	        Create an unscaled jpeg image and add it to fedora as a datastream.

	        source is converted with CONVERT into a temporary .jpg file that
	        is uploaded as datastream dsid of object pid (unless pid is
	        None).  With delete=True the temporary file is removed and None
	        is returned; otherwise its name is returned and the caller owns
	        the file.  The file is also removed when a step fails.
	    """
	    logging.debug("Creating {0} jpeg".format(dsid))
	    with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as fp:
	        filename = fp.name
	    try:
	        p = subprocess.Popen([CONVERT, source, filename],
	            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
	        out, err = p.communicate()
	        if err:
	            logging.debug(err)

	        if pid is not None:
	            obj = self.fedora.getObject(pid)
	            obj.addDataStream(dsid, 'tempData',
	                checksumType=u'MD5',
	                label=u"{0} jpeg image".format(dsid),
	                mimeType=u'image/jpeg', controlGroup=u'M')
	            with open(filename, 'rb') as fp:
	                obj[dsid].setContent(fp)
	    except Exception:
	        # Nobody receives the file name on failure, so clean up here.
	        os.unlink(filename)
	        raise

	    if delete:
	        os.unlink(filename)
	    else:
	        return filename

	def add_scaled_jpeg(self, pid, source, dsid, width, height, delete=True):
	    """
	        Create a scaled jpeg image and add it to fedora as a datastream.

	        source is resized with CONVERT using the geometry
	        ">{width}x{height}" into a temporary .jpg file that is uploaded
	        as datastream dsid of object pid (unless pid is None).  With
	        delete=True the temporary file is removed and None is returned;
	        otherwise its name is returned and the caller owns the file.
	        The file is also removed when a step fails.
	    """
	    logging.debug("Creating {0} jpeg".format(dsid))
	    with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as fp:
	        filename = fp.name
	    try:
	        p = subprocess.Popen([CONVERT, source, "-resize",
	            ">{0}x{1}".format(width, height), filename],
	            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
	        out, err = p.communicate()
	        if err:
	            logging.debug(err)

	        if pid is not None:
	            obj = self.fedora.getObject(pid)
	            obj.addDataStream(dsid, 'tempData',
	                checksumType=u'MD5',
	                label=u"{0} jpeg image".format(dsid),
	                mimeType=u'image/jpeg', controlGroup=u'M')
	            with open(filename, 'rb') as fp:
	                obj[dsid].setContent(fp)
	    except Exception:
	        # Nobody receives the file name on failure, so clean up here.
	        os.unlink(filename)
	        raise

	    if delete:
	        os.unlink(filename)
	    else:
	        return filename

	def add_cropped_jpeg(self, pid, source, dsid, width, height):
	    """
	        Create a scaled and cropped jpeg image and add it to fedora.

	        source is resized and then cut to exactly width x height
	        (gravity North) with CONVERT.  The temporary .jpg result is
	        uploaded as datastream dsid of object pid (unless pid is None)
	        and always removed afterwards.
	    """
	    logging.debug("Creating {0} jpeg".format(dsid))
	    with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as fp:
	        filename = fp.name
	    try:
	        p = subprocess.Popen([CONVERT, source, "-resize",
	            ">{0}x{1}^".format(width, height), "-gravity", "North",
	            "-extent", "{0}x{1}".format(width, height), filename],
	            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
	        out, err = p.communicate()
	        if err:
	            logging.debug(err)

	        if pid is not None:
	            obj = self.fedora.getObject(pid)
	            obj.addDataStream(dsid, 'tempData',
	                checksumType=u'MD5',
	                label=u'{0} jpeg image'.format(dsid),
	                mimeType=u'image/jpeg', controlGroup=u'M',
	                logMessage=u'Added {0} image'.format(dsid))
	            with open(filename, 'rb') as fp:
	                obj[dsid].setContent(fp)
	    finally:
	        os.unlink(filename)

	def ingest(self, filename):
	    """
	        Ingest a single METS file.

	        The base class does nothing and returns None; processMETS()
	        records the file only when a PID is returned here.
	    """
	    pass
	import traceback
	import time
	import sys
	import os
	import urllib2
	import re
	import tempfile
	import logging
	from contextlib import contextmanager

	from fcrepo.utils import NS
	from lxml import etree

	from . import batchingester
	from . import xacml


	class ImageIngester(batchingester.BatchIngester):

	vra2mods = None
	mods2dc = None

	def __init__(self, config, namespace, dry_run=False):
	    """
	        Set up the base ingester and load the optional stylesheets.

	        The "XSLT" section of the configuration names the mods2dc and
	        vra2mods stylesheets.  Each one is compiled only when its file
	        exists; otherwise the class-level default (None) stays in place.
	    """
	    super(ImageIngester, self).__init__(config, namespace, dry_run)
	    self.reset()

	    mods_to_dc = self.config.get("XSLT", "mods2dc")
	    if os.path.exists(mods_to_dc):
	        self.mods2dc = etree.XSLT(etree.parse(mods_to_dc))

	    vra_to_mods = self.config.get("XSLT", "vra2mods")
	    if os.path.exists(vra_to_mods):
	        self.vra2mods = etree.XSLT(etree.parse(vra_to_mods))

	def extract_metadata(self):
	    """
	        Extract metadata from the METS record.

	        Fills self.mods_xml, self.vra_xml, self.title, self.description,
	        self.dc_xml and self.archive_file from self.dom.  Returns 1 on
	        success and 0 when the record does not reference exactly one
	        file or the file element has no child carrying its location.
	    """
	    mods_ns = {'mods': 'http://www.loc.gov/mods/v3'}
	    vra_ns = {'vra': 'http://www.vraweb.org/vracore4.htm'}

	    # Extract the MODS datastream
	    mods = self.dom.xpath("//mods:mods", namespaces=mods_ns)
	    self.mods_xml = etree.tostring(mods[0],
	        pretty_print=True) if mods else None

	    # Extract the VRA Core datastream if it exists.
	    vra = self.dom.xpath("//vra:vra", namespaces=vra_ns)
	    self.vra_xml = etree.tostring(vra[0],
	        pretty_print=True) if vra else None

	    # Convert VRA to MODS if it doesn't already exist.  Without a VRA
	    # record there is nothing to convert.
	    if self.mods_xml is None and self.vra2mods and vra:
	        self.mods_xml = etree.tostring(self.vra2mods(vra[0]),
	            pretty_print=True)

	    # Extract title and description from MODS
	    if self.mods_xml:
	        mods = etree.fromstring(self.mods_xml)
	        node = mods.xpath("//mods:title/text()", namespaces=mods_ns)
	        self.title = re.sub(r"\s+", " ", node[0].strip()) \
	            if node else u"Untitled"

	        # Shorten overlong titles step by step: cut at the first colon,
	        # then at the first semicolon, then at a word boundary.
	        if len(self.title) > 256:
	            self.title = re.sub(r":.*", "", self.title)
	        if len(self.title) > 256:
	            self.title = re.sub(r";.*", "", self.title)
	        if len(self.title) > 256:
	            self.title = re.sub(r"\s[^\s]*$", "", self.title[:255])
	        if len(self.title) > 256:
	            self.title = self.title[:250]

	        node = mods.xpath("//mods:abstract/text()", namespaces=mods_ns)
	        self.description = re.sub(r"\s+", " ", node[0].strip()) \
	            if node else ""

	        if self.mods2dc:
	            self.dc_xml = etree.tostring(self.mods2dc(mods),
	                pretty_print=True)

	    else:
	        logging.warning("No MODS metadata available")

	    # Extract file reference
	    files = self.dom.xpath("//mets:fileSec//mets:file",
	        namespaces={'mets': 'http://www.loc.gov/METS/'})
	    if len(files) != 1:
	        logging.error("Can only process METS record with a single file!")
	        return 0

	    file_node = files[0]
	    if not len(file_node):
	        # The location is read from the first child element.
	        logging.error("METS file element has no location child")
	        return 0

	    location = file_node[0]
	    self.archive_file = {
	        'mimeType': file_node.get("MIMETYPE"),
	        'locType': location.get("LOCTYPE"),
	        'uri': location.get(
	            "{http://www.w3.org/1999/xlink}href")
	    }
	    return 1

	def check_files(self):
	    """
	        Verify that the archive file is accessible.

	        Returns 1 when the archive URI can be opened.  When it cannot,
	        a warning is logged and the error is re-raised, except during a
	        dry run, where 0 is returned.  Returns None when no archive
	        file has been recorded.
	    """
	    if not self.archive_file:
	        return None

	    try:
	        # Only reachability matters, so close the response right away.
	        urllib2.urlopen(self.archive_file["uri"]).close()
	        return 1
	    except Exception:
	        logging.warning("Could not access uri: {0}".format(
	            self.archive_file["uri"]))
	        if self.dry_run:
	            return 0
	        else:
	            raise

	def reset(self):
	    """
	        Clear all item-based state data.

	        Sets the title, description, archive file, parsed METS document
	        and the MODS, VRA and DC XML back to None.
	    """
	    self.title = None
	    self.description = None
	    self.archive_file = None
	    self.dom = None
	    self.mods_xml = None
	    self.vra_xml = None
	    self.dc_xml = None

	def digital_object(self):
	    """
	        Create the main fedora object for the current item.

	        During a dry run nothing is created and None is returned.
	        Otherwise a new PID is requested in self.namespace, the object
	        is created with the item title as label, and its RELS-EXT and
	        POLICY datastreams are added.  Returns the new PID.
	    """
	    # Create the fedora object
	    pid = None

	    api = 'urn:fedora:names:fedora:2.1:action:api'
	    action_id = 'urn:fedora:names:fedora:2.1:action:id'

	    # Add a POLICY datastream for this object
	    policy = xacml.XACML()
	    policy.deny('deny-datastreams',
	        resources={
	            'urn:fedora:names:fedora:2.1:resource:datastream:id': [
	                'POLICY', 'RELS-EXT']},
	        actions={
	            action_id: [
	                'urn:fedora:names:fedora:2.1:action:id-getDatastreamDissemination']})
	    policy.deny('deny-apim',
	        actions={
	            api: ['urn:fedora:names:fedora:2.1:action:api-m']},
	        groups=['permitted', 'group', 'names'])
	    policy.deny('deny-apia',
	        actions={
	            api: ['urn:fedora:names:fedora:2.1:action:api-a'],
	            action_id: [
	                'urn:fedora:names:fedora:2.1:action:id-listObjectInResourceIndexResults']},
	        groups=['permitted', 'group', 'names'])
	    policy.permit('everything-else')

	    if self.dry_run:
	        logging.info("Processing new object")
	    else:
	        pid = self.fedora.getNextPID(unicode(self.namespace))
	        logging.info("Creating object with PID: {0}".format(pid))
	        # The title stays None when the record has no MODS metadata;
	        # fall back to the same default extract_metadata() uses.
	        title = self.title if self.title is not None else u"Untitled"
	        obj = self.fedora.createObject(pid,
	            label=unicode(title.encode("ascii",
	                errors="xmlcharrefreplace")))
	        obj.ownerId = self.fedoraUser

	        self.add_rels_ext(pid, ['my:contentModel1',
	                                'my:contentModel2'])

	        obj.addDataStream('POLICY', policy.policy(),
	            label=u'XACML Policy',
	            checksumType=u'DISABLED',
	            logMessage=u'Added POLICY datastream')
	    return pid

	def vra_object(self, pid):
	    """
	        Create the object holding the VRA Core metadata for pid.

	        When pid is not None a new object is created that
	        'isMetadataFor' pid and its PID is added to self.derivatives.
	        add_vra() is called in either case.  Returns the new PID, or
	        None when pid is None.
	    """
	    pidVRA = None
	    if pid is not None:
	        pidVRA = self.fedora.getNextPID(unicode(self.namespace))
	        obj = self.fedora.createObject(pidVRA,
	            label=u"VRA Metadata for {0}".format(pid))
	        obj.ownerId = self.fedoraUser
	        obj.addDataStream('RELS-EXT',
	            checksumType=u'DISABLED',
	            label=u"RDF Statements about this object")
	        ds = obj['RELS-EXT']
	        ds[NS.fedora.isMetadataFor].append({
	            'value': u'info:fedora/{0}'.format(pid),
	            'type': u'uri'
	        })
	        ds[NS.fedora_model.hasModel].append({
	            'value': u'info:fedora/my:vraContentModel',
	            'type': u'uri'
	        })
	        ds.setContent()
	        self.derivatives.append(pidVRA)

	    self.add_vra(pidVRA, pid)
	    return pidVRA

	def master_object(self, pid, filename):
	    """
	        Create the object holding the master (archival) file.

	        When pid is not None a new object is created that 'isPartOf'
	        pid, it receives a POLICY datastream, and its PID is added to
	        self.derivatives.  add_master() is called in either case with
	        the archive file's MIME type.  Returns the new PID, or None
	        when pid is None.
	    """
	    pidObj = None

	    api = 'urn:fedora:names:fedora:2.1:action:api'
	    action_id = 'urn:fedora:names:fedora:2.1:action:id'

	    # Add a POLICY datastream for this object
	    policy = xacml.XACML()
	    policy.deny('deny-datastreams',
	        resources={
	            'urn:fedora:names:fedora:2.1:resource:datastream:id': [
	                'POLICY', 'RELS-EXT', 'OBJ']},
	        actions={
	            action_id: [
	                'urn:fedora:names:fedora:2.1:action:id-getDatastreamDissemination']})
	    policy.deny('deny-apim',
	        actions={
	            api: ['urn:fedora:names:fedora:2.1:action:api-m']})
	    policy.deny('deny-apia',
	        actions={
	            api: ['urn:fedora:names:fedora:2.1:action:api-a'],
	            action_id: [
	                'urn:fedora:names:fedora:2.1:action:id-listObjectInResourceIndexResults']},
	        groups=['permitted', 'group', 'names'])
	    policy.permit('everything-else')

	    if pid is not None:
	        pidObj = self.fedora.getNextPID(unicode(self.namespace))
	        obj = self.fedora.createObject(pidObj,
	            label=u"Master Object for {0}".format(pid))
	        obj.ownerId = self.fedoraUser
	        obj.addDataStream('RELS-EXT',
	            label=u'RDF Statements about this object',
	            checksumType=u'DISABLED')
	        ds = obj['RELS-EXT']
	        ds[NS.fedora.isPartOf].append({
	            'value': u'info:fedora/{0}'.format(pid),
	            'type': u'uri'
	        })
	        ds[NS.fedora_model.hasModel].append({
	            'value': u'info:fedora/my:archivalContentModel',
	            'type': u'uri'
	        })
	        ds.setContent()

	        obj.addDataStream('POLICY', policy.policy(),
	            checksumType=u'DISABLED',
	            label=u'XACML Policy',
	            logMessage=u'Added POLICY datastream')
	        self.derivatives.append(pidObj)

	    self.add_master(pidObj, filename, unicode(self.archive_file['mimeType']))

	    return pidObj

	def exif_object(self, pid, filename):
	    """
	        Create the object holding the EXIF metadata of filename.

	        When pid is not None a new object is created that
	        'isMetadataFor' and 'isDerivationOf' pid, and its PID is added
	        to self.derivatives.  add_exif() is called in either case.
	        Returns the new PID, or None when pid is None.
	    """
	    pidEXIF = None
	    if pid is not None:
	        pidEXIF = self.fedora.getNextPID(unicode(self.namespace))
	        obj = self.fedora.createObject(pidEXIF,
	            label=u"EXIF Metadata for {0}".format(pid))
	        obj.ownerId = self.fedoraUser
	        obj.addDataStream('RELS-EXT',
	            checksumType=u'DISABLED',
	            label=u'RDF Statements about this object')
	        ds = obj['RELS-EXT']
	        ds[NS.fedora.isMetadataFor].append({
	            'value': u'info:fedora/{0}'.format(pid),
	            'type': u'uri'})
	        ds[NS.fedora_model.hasModel].append({
	            'value': u'info:fedora/my:exifContentModel',
	            'type': u'uri'})
	        ds[NS.fedora.isDerivationOf].append({
	            'value': u'info:fedora/{0}'.format(pid),
	            'type': u'uri'})
	        ds.setContent()

	        self.derivatives.append(pidEXIF)

	    self.add_exif(pidEXIF, filename)
	    return pidEXIF

	def jp2_object(self, pid, original, filename):
	    """
	        Create the object holding the JPEG 2000 derivative.

	        When pid is not None a new object is created that 'isPartOf'
	        pid and 'isDerivationOf' original, it receives a POLICY
	        datastream, and its PID is added to self.derivatives.
	        add_jp2() is called in either case.  Returns the new PID, or
	        None when pid is None.
	    """
	    pidJP2 = None

	    api = 'urn:fedora:names:fedora:2.1:action:api'
	    action_id = 'urn:fedora:names:fedora:2.1:action:id'

	    # Add a POLICY datastream for this object
	    policy = xacml.XACML()
	    policy.deny('deny-datastreams',
	        resources={
	            'urn:fedora:names:fedora:2.1:resource:datastream:id': [
	                'POLICY', 'RELS-EXT']},
	        actions={
	            action_id: [
	                'urn:fedora:names:fedora:2.1:action:id-getDatastreamDissemination']})
	    policy.deny('deny-apim',
	        actions={
	            api: ['urn:fedora:names:fedora:2.1:action:api-m']},
	        users=['librarian'])
	    policy.deny('deny-apia',
	        actions={
	            api: ['urn:fedora:names:fedora:2.1:action:api-a'],
	            action_id: [
	                'urn:fedora:names:fedora:2.1:action:id-listObjectInResourceIndexResults']},
	        groups=['permitted', 'group', 'names'])
	    policy.permit('everything-else')

	    if pid is not None:
	        pidJP2 = self.fedora.getNextPID(unicode(self.namespace))
	        obj = self.fedora.createObject(pidJP2,
	            label=u"JP2 Object for {0}".format(pid))
	        obj.ownerId = self.fedoraUser
	        obj.addDataStream('RELS-EXT',
	            checksumType=u'DISABLED',
	            label=u'RDF Statements about this object')
	        ds = obj['RELS-EXT']
	        ds[NS.fedora.isPartOf].append({
	            'value': u'info:fedora/{0}'.format(pid),
	            'type': u'uri'
	        })
	        ds[NS.fedora.isDerivationOf].append({
	            'value': u'info:fedora/{0}'.format(original),
	            'type': u'uri'
	        })
	        ds[NS.fedora_model.hasModel].append({
	            'value': u'info:fedora/my:jp2ContentModel',
	            'type': u'uri'
	        })
	        ds.setContent()
	        obj.addDataStream('POLICY', policy.policy(),
	            checksumType=u'DISABLED',
	            label=u'XACML Policy',
	            logMessage=u'Added POLICY datastream')
	        self.derivatives.append(pidJP2)

	    self.add_jp2(pidJP2, filename)
	    return pidJP2

	def jpeg_object(self, pid, original, filename):
	    """
	        Create the object holding the JPEG derivatives.

	        When pid is not None a new object is created that 'isPartOf'
	        pid and 'isDerivationOf' original, it receives a POLICY
	        datastream, and its PID is added to self.derivatives.  The
	        FULL_SIZE and MEDIUM_SIZE images go on the new object; the
	        SMALL_SIZE, TN and ICON images are generated from the medium
	        image and go on the main object pid.  Returns the new PID, or
	        None when pid is None.
	    """
	    pidJPEG = None

	    api = 'urn:fedora:names:fedora:2.1:action:api'
	    action_id = 'urn:fedora:names:fedora:2.1:action:id'

	    # Add a POLICY datastream for this object
	    policy = xacml.XACML()
	    policy.deny('deny-datastreams',
	        resources={
	            'urn:fedora:names:fedora:2.1:resource:datastream:id': [
	                'POLICY', 'RELS-EXT']},
	        actions={
	            action_id: [
	                'urn:fedora:names:fedora:2.1:action:id-getDatastreamDissemination']})
	    policy.deny('deny-apim',
	        actions={
	            api: ['urn:fedora:names:fedora:2.1:action:api-m']},
	        users=['librarian'])
	    policy.deny('deny-apia',
	        actions={
	            api: ['urn:fedora:names:fedora:2.1:action:api-a'],
	            action_id: [
	                'urn:fedora:names:fedora:2.1:action:id-listObjectInResourceIndexResults']},
	        groups=['permitted', 'user', 'groups'])
	    policy.permit('everything-else')

	    if pid is not None:
	        pidJPEG = self.fedora.getNextPID(unicode(self.namespace))
	        obj = self.fedora.createObject(pidJPEG,
	            label=u'JPEG Object for {0}'.format(pid))
	        obj.ownerId = self.fedoraUser
	        obj.addDataStream('RELS-EXT',
	            checksumType=u'DISABLED',
	            label=u'RDF Statements about this object')
	        ds = obj['RELS-EXT']
	        ds[NS.fedora.isPartOf].append({
	            'value': u'info:fedora/{0}'.format(pid),
	            'type': u'uri'
	        })
	        ds[NS.fedora_model.hasModel].append({
	            'value': u'info:fedora/my:jpegContentModel',
	            'type': u'uri'
	        })
	        ds[NS.fedora.isDerivationOf].append({
	            'value': u'info:fedora/{0}'.format(original),
	            'type': u'uri'
	        })
	        ds.setContent()

	        obj.addDataStream('POLICY', policy.policy(),
	            checksumType=u'DISABLED',
	            label=u'XACML Policy',
	            logMessage=u'Added POLICY datastream')

	        self.derivatives.append(pidJPEG)

	    self.add_jpeg(pidJPEG, filename, "FULL_SIZE")

	    # Use the medium image for generating the smaller items.
	    medium = self.add_scaled_jpeg(pidJPEG, filename, "MEDIUM_SIZE",
	        640, 480, delete=False)
	    try:
	        # Add these to the main object!
	        self.add_scaled_jpeg(pid, medium, "SMALL_SIZE", 200, 150)
	        self.add_scaled_jpeg(pid, medium, "TN", 88, 66)
	        self.add_cropped_jpeg(pid, medium, "ICON", 32, 24)
	    finally:
	        # The medium file was kept on purpose; remove it even when one
	        # of the smaller derivatives fails.
	        os.unlink(medium)
	    return pidJPEG


	def ingest(self, filename):
	    """
	        Ingest the item described by a single METS file.

	        Resets the per-item state, parses the METS record and, when the
	        metadata can be extracted and the archive file is reachable,
	        creates the main object, its VRA, master, EXIF, JP2 and JPEG
	        objects and finally adds the MODS datastream.

	        Returns the PID of the main object, or None when the metadata
	        or file checks fail.  Errors raised while the objects are being
	        created are logged rather than re-raised, and whatever PID was
	        created so far is still returned.
	    """
	    self.reset()
	    self.dom = etree.parse(filename)

	    if not self.extract_metadata() or not self.check_files():
	        return None

	    pid = None
	    pids = []
	    try:
	        # Work from a temporary local copy of the archive file.
	        with self.local_copy() as local_file:

	            # Create the main object
	            pid = self.digital_object()
	            pids.append(pid)

	            # Create a VRA Core object which
	            # 'isMetadataFor' the main object
	            vra_pid = self.vra_object(pid)
	            pids.append(vra_pid)

	            # Create an object part for the master image
	            master_pid = self.master_object(pid, local_file)
	            pids.append(master_pid)

	            # Create an object for the EXIF metadata
	            # which 'isMetadataFor' the Master file
	            exif_pid = self.exif_object(master_pid, local_file)
	            pids.append(exif_pid)

	            # Create a JPEG 2000 object
	            jp2_pid = self.jp2_object(pid, master_pid, local_file)
	            pids.append(jp2_pid)

	            # Create a JPEG object
	            jpeg_pid = self.jpeg_object(pid, master_pid, local_file)
	            pids.append(jpeg_pid)

	            # Add the MODS datastream to the main digital object.
	            # This is done last so that the VRA object has already
	            # been added when fedoragsearch sees the MODS record added.
	            self.add_mods(pid)

	    except Exception:
	        if pid:
	            logging.error("Error processing PID: {0}".format(pid))
	            # Skip placeholders so join() only ever sees real PIDs.
	            created = [p for p in pids if p]
	            if created:
	                logging.error("May need to clean up the following pids: {0}".format(", ".join(created)))
	        else:
	            logging.error("Error processing file")
	        traceback.print_exc(6)

	    return pid

	@contextmanager
	def local_copy(self):
	    """
	        Store the master file locally for easy retrieval.

	        Downloads self.archive_file['uri'] into a temporary file that
	        keeps the URI's extension and yields the temporary file name.
	        The file is always removed afterwards.  Errors from the
	        download or from the with-block are logged and re-raised so
	        the caller can tell that the work failed.
	    """
	    logging.debug("Saving file locally.")
	    uri = self.archive_file['uri']
	    # splitext() returns (root, ext); only ext is a valid suffix.
	    ext = os.path.splitext(uri)[1]
	    CHUNK = 16 * 1024

	    fp = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
	    filename = fp.name
	    try:
	        with fp:
	            req = urllib2.urlopen(uri)
	            try:
	                while True:
	                    chunk = req.read(CHUNK)
	                    if not chunk:
	                        break
	                    fp.write(chunk)
	            finally:
	                req.close()
	        yield filename
	    except Exception as e:
	        logging.error(e)
	        raise
	    finally:
	        os.unlink(filename)

	def add_mods(self, pid):
	    """
	        Add the MODS datastream

	        Points the record's uri identifier and its 'primary display'
	        location url at this object, stores the updated XML back on
	        self.mods_xml and attaches it to the object as the MODS
	        datastream.  Does nothing without MODS XML or without a pid.
	    """
	    if not self.mods_xml or pid is None:
	        return

	    nsmap = {"mods": "http://www.loc.gov/mods/v3"}
	    parser = etree.XMLParser(remove_blank_text=True)
	    doc = etree.fromstring(self.mods_xml, parser)

	    # Identifier: update the existing uri identifier, otherwise insert
	    # a new one after the last identifier (or at the end of the record).
	    ident = doc.xpath("/mods:mods/mods:identifier[@type='uri']",
	        namespaces=nsmap)
	    if len(ident):
	        ident[0].text = u"https://my.repository.url/{0}".format(pid)
	    else:
	        identifier = etree.Element(u"{http://www.loc.gov/mods/v3}identifier", type=u"uri")
	        identifier.text = u"https://my.repository.url/{0}".format(pid)
	        ident = doc.xpath("/mods:mods/mods:identifier", namespaces=nsmap)
	        if len(ident):
	            ident[-1].addnext(identifier)
	        else:
	            doc.append(identifier)

	    # Location: make sure one exists before looking for the display url.
	    loc = doc.xpath("/mods:mods/mods:location", namespaces=nsmap)
	    if not len(loc):
	        loc = [etree.SubElement(doc, u"{http://www.loc.gov/mods/v3}location")]

	    url = doc.xpath("/mods:mods/mods:location/mods:url[@usage='primary display']", namespaces=nsmap)
	    if len(url):
	        url[0].text = u"http://my.repository.url/{0}".format(pid)
	    else:
	        url = etree.Element(u"{http://www.loc.gov/mods/v3}url", usage="primary display")
	        url.text = u"http://my.repository.url/{0}".format(pid)
	        loc[0].append(url)

	    self.mods_xml = etree.tostring(doc, pretty_print=True)

	    obj = self.fedora.getObject(pid)
	    obj.addDataStream('MODS', self.mods_xml,
	        label=u'MODS Metadata', mimeType=u'text/xml',
	        controlGroup=u'X',
	        checksumType=u'DISABLED',
	        logMessage=u'Added MODS datastream')

	def add_vra(self, pid, parentPid):
	    """
	        Add the VRA Core datastream, if available

	        When the record has a vra:image element, its href is set to
	        the parent object's URL and self.vra_xml is re-serialised.
	        The XML is attached to object pid as the VRA datastream unless
	        pid is None.
	    """
	    if not self.vra_xml:
	        return

	    # Add the canonical URL for this object
	    dom = etree.fromstring(self.vra_xml)
	    nodes = dom.xpath("//vra:image",
	        namespaces={"vra": "http://www.vraweb.org/vracore4.htm"})
	    if len(nodes):
	        nodes[0].set("href",
	            "https://my.repository.url/{0}".format(parentPid))
	        self.vra_xml = etree.tostring(dom, pretty_print=True)

	    if pid is not None:
	        obj = self.fedora.getObject(pid)
	        obj.addDataStream('VRA', self.vra_xml,
	            label=u'VRA Core Metadata',
	            mimeType=u'text/xml', controlGroup=u'X',
	            checksumType=u'DISABLED',
	            logMessage=u'Added VRA Core Metadata')
	import os
	import argparse
	import logging
	from image import ImageIngester

	def main():
	    """
	        Command-line entry point: ingest every file whose name ends in
	        'mets.xml' from the --data directory into the given collection.
	    """
	    # Set up the argument parser.
	    parser = argparse.ArgumentParser(
	        description='Ingest objects into fedora.')
	    parser.add_argument("--namespace", required=True,
	        choices=["my", "accepted", "name", "spaces"],
	        help="The namespace to be used by these objects.")
	    parser.add_argument("--collection", required=True,
	        help="The name of the collection")
	    parser.add_argument("--data", required=True,
	        help="The location of the data files to ingest.")
	    parser.add_argument("--parent", default="collection:root",
	        help="This will be the parent of this collection")
	    parser.add_argument("--config", default="my.cfg",
	        help="A configuration file containing " +
	             "the fedora connection information.")
	    parser.add_argument("--collectionTN", default="TN.png",
	        help="The location of a thumbnail image for the collection")
	    parser.add_argument("--dryrun", default=False, action='store_true')

	    args = parser.parse_args()

	    ingester = ImageIngester(args.config, args.namespace, dry_run=args.dryrun)
	    # The ingester exposes add_collection(); there is no collection() method.
	    ingester.add_collection(args.collection, u"My Collection Name",
	        parent=args.parent, tn=args.collectionTN)

	    # List the directory once so the total and the loop always agree;
	    # sorting makes the processing order reproducible.
	    mets_files = sorted(name for name in os.listdir(args.data)
	                        if name.endswith("mets.xml"))
	    total = len(mets_files)
	    for i, name in enumerate(mets_files, 1):
	        path = os.path.join(args.data, name)
	        logging.info("{0}/{1} Processing {2}".format(i, total, path))
	        ingester.processMETS(path)


	if __name__ == '__main__':
	    main()