Created
August 19, 2014 19:04
-
-
Save sprklinginfo/e5a9be59ad1ecc5b5c46 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import subprocess | |
import tempfile | |
import MySQLdb | |
import ConfigParser | |
import sys | |
import os | |
import logging | |
from fcrepo.utils import NS | |
from fcrepo.connection import Connection, FedoraConnectionException | |
from fcrepo.client import FedoraClient | |
from lxml.builder import ElementMaker | |
from . import xacml | |
# Absolute path of the ImageMagick ``convert`` executable that the
# BatchIngester derivative methods (add_jp2, add_png, add_jpeg, ...) invoke.
CONVERT = "/opt/ImageMagick-6.7/bin/convert"
class BatchIngester(object):
    """
    Base class for ingesting batches of objects into a Fedora repository.

    An instance holds a Fedora client (``self.fedora``), a MySQL connection
    used as an ingest log (``self.db``), the list of collection PIDs that
    newly ingested items are attached to (``self.collections``) and the PIDs
    of derivative objects created for the item currently being processed
    (``self.derivatives``).

    Throughout the class a ``pid`` of ``None`` means "do not write to
    Fedora": conversions still run, but no datastream is created.
    Subclasses implement :meth:`ingest`.
    """

    def __init__(self, config, namespace, dry_run=False):
        """
        Create an instance of BatchIngester and establish
        a connection to the Fedora repository.

        :param config: path of an INI file with ``IngestLog``, ``Logging``
            and ``Fedora`` sections.
        :param namespace: alphanumeric PID namespace for new objects.
        :param dry_run: when true, add_collection() does not create objects.
        :raises Exception: if *namespace* is not alphanumeric.
        """
        if not namespace.isalnum():
            raise Exception("Invalid namespace: {0}".format(namespace))
        self.namespace = namespace
        self.collections = []
        self.dry_run = dry_run
        # processMETS() resets this for every file; initialising it here
        # keeps subclasses usable when ingest() is called directly.
        self.derivatives = []

        # Read the configuration file
        conf = ConfigParser.ConfigParser()
        conf.read(config)

        # Connect to the logging database
        self.db = MySQLdb.connect(host=conf.get("IngestLog", "hostname"),
                                  db=conf.get("IngestLog", "database"),
                                  user=conf.get("IngestLog", "username"),
                                  passwd=conf.get("IngestLog", "password"))
        self.db.set_character_set("UTF8")

        # Set up logging.  %(message)s is the documented, fully formatted
        # record text; the raw %(msg)s attribute ignores logging arguments.
        levels = {'DEBUG': logging.DEBUG,
                  'INFO': logging.INFO,
                  'WARNING': logging.WARNING,
                  'ERROR': logging.ERROR,
                  'CRITICAL': logging.CRITICAL}
        logging.basicConfig(
            format="%(asctime)s (%(levelname)s): %(message)s",
            datefmt="%b %d %I:%M:%S %p",
            filename=conf.get('Logging', 'logfile'),
            level=levels[conf.get('Logging', 'loglevel')])

        # Connect to Fedora
        self.fedoraUser = unicode(conf.get('Fedora', 'username'))
        connection = Connection(conf.get('Fedora', 'url'),
                                username=self.fedoraUser,
                                password=conf.get('Fedora', 'password'))
        self.fedora = FedoraClient(connection)
        self.config = conf

    def _collection_policy(self):
        """Build the XACML policy attached to newly created collections."""
        access = xacml.XACML()
        access.deny('deny-datastreams',
                    resources={
                        'urn:fedora:names:fedora:2.1:resource:datastream:id': [
                            'POLICY', 'RELS-EXT']},
                    actions={
                        'urn:fedora:names:fedora:2.1:action:id': [
                            'urn:fedora:names:fedora:2.1:action:id-getDatastreamDissemination']})
        access.deny('deny-apim',
                    actions={
                        "urn:fedora:names:fedora:2.1:action:api": [
                            "urn:fedora:names:fedora:2.1:action:api-m"]},
                    groups=["librarian"])
        access.deny('deny-apia',
                    actions={
                        'urn:fedora:names:fedora:2.1:action:api': [
                            'urn:fedora:names:fedora:2.1:action:api-a'],
                        'urn:fedora:names:fedora:2.1:action:id': [
                            'urn:fedora:names:fedora:2.1:action:id-listObjectInResourceIndexResults']},
                    groups=['authenticated user', 'librarian', 'administrator'])
        access.permit('permit-everything-else')
        return access

    def add_collection(self, collection, label, parent="collection:root",
                       tn="TN.png"):
        """
        Add a collection to fedora if it doesn't already exist.
        This also sets the current "parent" of all ingested items.

        :param collection: either a full PID (``ns:name``) or a bare
            alphanumeric name, which is prefixed with ``self.namespace``.
        :param label: label for the collection object if it is created.
        :param parent: PID of the parent collection; it is looked up first,
            so an error from that lookup propagates to the caller.
        :param tn: path of a thumbnail image for the collection.
        :raises Exception: if *collection* is not a valid name or PID.
        """
        parts = collection.split(':')
        pid = None
        if len(parts) == 2:
            pid = collection
        elif len(parts) == 1 and collection.isalnum():
            pid = self.namespace + ':' + collection
        if not pid:
            raise Exception("Invalid collection name: '{0}'".format(collection))
        if self.dry_run:
            # A pid of None makes every add_* helper skip its Fedora write.
            pid = None

        logging.debug(
            "Checking existence of parent: '{0}'".format(parent))
        self.fedora.getObject(parent)

        logging.debug(
            "Checking existence of collection: '{0}'".format(pid))
        try:
            self.fedora.getObject(pid)
        except FedoraConnectionException as ex:
            if ex.httpcode == 404:
                # Collection does not exist, so create it now.
                logging.info("Creating object: '{0}'".format(pid))
                access = self._collection_policy()
                try:
                    if pid is not None:
                        obj = self.fedora.createObject(pid, label=label)
                        obj.ownerId = self.fedoraUser
                        obj.addDataStream('POLICY', access.policy(),
                                          checksumType=u'DISABLED',
                                          label=u'XACML Policy',
                                          logMessage=u'Added POLICY datastream')
                        obj.addDataStream('RELS-EXT',
                                          label=u'Fedora object-to-object relationships',
                                          checksumType=u'DISABLED')
                        ds = obj['RELS-EXT']
                        ds[NS.fedora.isMemberOfCollection].append({
                            'value': u'info:fedora/{0}'.format(parent),
                            'type': u'uri'
                        })
                        ds[NS.fedora_model.hasModel].append({
                            'value': u'info:fedora/my:collectionCModel',
                            'type': u'uri'
                        })
                        ds.setContent()
                except Exception:
                    # Creation stays best-effort as before, but the failure
                    # is now recorded instead of being silently discarded.
                    logging.exception(
                        "Could not create collection object: {0}".format(pid))
                if os.path.exists(tn):
                    self.add_png(pid, tn, "TN")
                else:
                    logging.warning(
                        "Missing TN datastream for collection object")
            else:
                logging.error(
                    "Could not access collection object: {0}".format(pid))

        # Set the current collection
        self.collections.append(pid)

    def processMETS(self, filename):
        """
        Process a METS file if it hasn't already been ingested.
        Update the database after processing.

        :param filename: path of the METS file; it is also the key used in
            the ``log`` table.
        :returns: 0 if the file was already logged, otherwise None.
        """
        cursor = self.db.cursor()
        try:
            # The parameters must be a sequence: "(filename)" is just a
            # string, "(filename,)" is the intended one-element tuple.
            cursor.execute("""
                SELECT pid, indexed
                FROM log
                WHERE filename=%s""", (filename,))
            if cursor.fetchone():
                logging.info("{0} has already been processed".format(filename))
                return 0

            self.derivatives = []
            pid = self.ingest(filename)
            if pid is not None:
                cursor.execute("""
                    INSERT INTO log (pid, filename, indexed)
                    VALUES (%s, %s, NOW())""", (pid, filename))
                for derivative in self.derivatives:
                    cursor.execute("""
                        INSERT INTO deriv (parent, pid)
                        VALUES (%s, %s)""", (pid, derivative))
                # DB-API connections do not autocommit by default; without
                # this the log rows can be lost when the connection closes.
                self.db.commit()
        finally:
            cursor.close()

    def add_rels_ext(self, pid, models=()):
        """
        Add the RELS_EXT datastream.

        The object is made a member of every collection in
        ``self.collections`` and is given each content model in *models*.

        :param pid: PID of the object; nothing is written when it is None.
        :param models: iterable of content model PIDs.
        """
        logging.debug("Adding RELS_EXT datastream")
        if pid is None:
            return
        obj = self.fedora.getObject(pid)
        obj.addDataStream('RELS-EXT',
                          checksumType=u'DISABLED',
                          label=u"Fedora object-to-object relationships")
        ds = obj['RELS-EXT']
        for collection in self.collections:
            ds[NS.fedora.isMemberOfCollection].append({
                'value': u'info:fedora/{0}'.format(collection),
                'type': u'uri'
            })
        for model in models:
            ds[NS.fedora_model.hasModel].append({
                'value': u'info:fedora/{0}'.format(model),
                'type': u'uri'
            })
        ds.setContent()

    def _add_managed_datastream(self, pid, dsid, filename, **properties):
        """Upload *filename* as managed datastream *dsid* of *pid*; no-op when pid is None."""
        if pid is None:
            return
        obj = self.fedora.getObject(pid)
        # The datastream is created with placeholder content and then
        # replaced by the real file, as the original code did.
        obj.addDataStream(dsid, 'tempData',
                          checksumType=u'MD5', controlGroup=u'M',
                          **properties)
        with open(filename, 'rb') as fp:
            obj[dsid].setContent(fp)

    def _convert_to_temp(self, arguments, suffix):
        """Run ImageMagick ``convert`` with *arguments* into a new temp file and return its path."""
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as fp:
            target = fp.name
        try:
            p = subprocess.Popen([CONVERT] + list(arguments) + [target],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            out, err = p.communicate()
        except Exception:
            # Do not leave the empty temp file behind if convert cannot run.
            os.unlink(target)
            raise
        if err:
            logging.debug(err)
        return target

    def _add_derivative(self, pid, dsid, arguments, suffix, keep=False,
                        **properties):
        """Convert, upload and clean up one derivative; return its path only when *keep* is true."""
        derived = self._convert_to_temp(arguments, suffix)
        try:
            self._add_managed_datastream(pid, dsid, derived, **properties)
        except Exception:
            # The caller never learns the path on failure, so remove it.
            os.unlink(derived)
            raise
        if keep:
            return derived
        os.unlink(derived)
        return None

    def add_master(self, pid, filename, mimeType):
        """
        Add the Master file as a datastream.

        :param pid: PID of the object; nothing is written when it is None.
        :param filename: path of the master file to upload as ``OBJ``.
        :param mimeType: MIME type recorded for the datastream.
        """
        logging.debug("Adding Master datastream")
        self._add_managed_datastream(pid, 'OBJ', filename,
                                     label=u'Archival Image',
                                     mimeType=mimeType,
                                     logMessage=u'Added Master image')

    def add_exif(self, pid, filename):
        """
        Extract the EXIF metadata and add it to Fedora.

        ``exiftool -X`` is run on *filename* even when *pid* is None; its
        standard output becomes the inline ``EXIF`` datastream.
        """
        logging.debug("Extracting EXIF metadata")
        p = subprocess.Popen(["exiftool", "-X", filename],
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = p.communicate()
        if err:
            logging.debug(err)
        if pid is not None:
            obj = self.fedora.getObject(pid)
            obj.addDataStream('EXIF', out, label=u'EXIF Metadata',
                              checksumType=u'DISABLED',
                              mimeType=u'text/xml', controlGroup=u'X',
                              logMessage=u'Added EXIF Metadata')

    def add_jp2(self, pid, source):
        """
        Convert the file to JPEG 2000 format and add it
        to fedora as a datastream.

        The temporary JP2 file is always removed afterwards.
        """
        logging.debug("Converting to jp2; adding to fedora")
        self._add_derivative(pid, 'JP2',
                             [source,
                              "-define", "jp2:tilewidth=256",
                              "-define", "jp2:tileheight=256"],
                             '.jp2',
                             label=u'Pyramid JPEG2000 Image',
                             mimeType=u'image/jp2',
                             logMessage=u'Added JPEG 2000 Image')

    def add_png(self, pid, source, dsid, delete=True):
        """
        Create an unscaled png image and add it to fedora as a datastream.

        :returns: the path of the temporary png when *delete* is false,
            otherwise None (the file is removed).
        """
        logging.debug("Creating {0} png".format(dsid))
        return self._add_derivative(pid, dsid, [source], '.png',
                                    keep=not delete,
                                    label=u"{0} png image".format(dsid),
                                    mimeType=u'image/png')

    def add_jpeg(self, pid, source, dsid, delete=True):
        """
        Create an unscaled jpeg image and add it to fedora as a datastream.

        :returns: the path of the temporary jpeg when *delete* is false,
            otherwise None (the file is removed).
        """
        logging.debug("Creating {0} jpeg".format(dsid))
        return self._add_derivative(pid, dsid, [source], '.jpg',
                                    keep=not delete,
                                    label=u"{0} jpeg image".format(dsid),
                                    mimeType=u'image/jpeg')

    def add_scaled_jpeg(self, pid, source, dsid, width, height, delete=True):
        """
        Create a scaled jpeg image and add it to fedora as a datastream.

        :returns: the path of the temporary jpeg when *delete* is false,
            otherwise None (the file is removed).
        """
        logging.debug("Creating {0} jpeg".format(dsid))
        return self._add_derivative(pid, dsid,
                                    [source, "-resize",
                                     ">{0}x{1}".format(width, height)],
                                    '.jpg',
                                    keep=not delete,
                                    label=u"{0} jpeg image".format(dsid),
                                    mimeType=u'image/jpeg')

    def add_cropped_jpeg(self, pid, source, dsid, width, height):
        """
        Create a scaled and cropped jpeg image and add it to fedora.

        The temporary jpeg is always removed afterwards.
        """
        logging.debug("Creating {0} jpeg".format(dsid))
        self._add_derivative(pid, dsid,
                             [source, "-resize",
                              ">{0}x{1}^".format(width, height),
                              "-gravity", "North",
                              "-extent", "{0}x{1}".format(width, height)],
                             '.jpg',
                             label=u'{0} jpeg image'.format(dsid),
                             mimeType=u'image/jpeg',
                             logMessage=u'Added {0} image'.format(dsid))

    def ingest(self, filename):
        """
        Ingest one METS file; meant to be overridden by subclasses.

        processMETS() logs the file only when the returned PID is not None,
        so this default implementation (which returns None) records nothing.
        """
        pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import traceback | |
import time | |
import sys | |
import os | |
import urllib2 | |
import re | |
import tempfile | |
import logging | |
from contextlib import contextmanager | |
from fcrepo.utils import NS | |
from lxml import etree | |
from . import batchingester | |
from . import xacml | |
class ImageIngester(batchingester.BatchIngester):
    """
    Ingest single-image METS records into Fedora.

    For each METS file a main digital object is created, plus separate
    objects for the VRA Core metadata, the master image, its EXIF metadata,
    a JPEG 2000 derivative and a JPEG derivative.  The per-item state
    (title, description, metadata XML, archive file reference) lives on the
    instance and is cleared by :meth:`reset`.
    """

    # XSLT transforms; they stay None when the configured stylesheet
    # does not exist on disk.
    vra2mods = None
    mods2dc = None

    _MODS_NS = {'mods': 'http://www.loc.gov/mods/v3'}
    _VRA_NS = {'vra': 'http://www.vraweb.org/vracore4.htm'}
    _METS_NS = {'mets': 'http://www.loc.gov/METS/'}

    def __init__(self, config, namespace, dry_run=False):
        """
        Set up the base ingester and load the XSLT stylesheets named by the
        ``mods2dc`` and ``vra2mods`` options of the ``XSLT`` config section.
        """
        super(ImageIngester, self).__init__(config, namespace, dry_run)
        mods_to_dc = self.config.get("XSLT", "mods2dc")
        self.reset()
        if os.path.exists(mods_to_dc):
            self.mods2dc = etree.XSLT(etree.parse(mods_to_dc))
        vra_to_mods = self.config.get("XSLT", "vra2mods")
        if os.path.exists(vra_to_mods):
            self.vra2mods = etree.XSLT(etree.parse(vra_to_mods))

    def extract_metadata(self):
        """
        Extract metadata from the METS record.

        Reads ``self.dom`` and fills in ``mods_xml``, ``vra_xml``, ``title``,
        ``description``, ``dc_xml`` and ``archive_file``.

        :returns: 1 on success, 0 when the record does not reference
            exactly one usable file.
        """
        # Extract the MODS datastream
        mods = self.dom.xpath("//mods:mods", namespaces=self._MODS_NS)
        self.mods_xml = etree.tostring(mods[0],
                                       pretty_print=True) if mods else None

        # Extract the VRA Core datastream if it exists.
        vra = self.dom.xpath("//vra:vra", namespaces=self._VRA_NS)
        self.vra_xml = etree.tostring(vra[0],
                                      pretty_print=True) if vra else None

        # Convert VRA to MODS if it doesn't already exist.  The "vra" guard
        # avoids indexing an empty result when the record has neither.
        if self.mods_xml is None and self.vra2mods and vra:
            self.mods_xml = etree.tostring(self.vra2mods(vra[0]),
                                           pretty_print=True)

        # Extract title and description from MODS
        if self.mods_xml:
            mods = etree.fromstring(self.mods_xml)
            node = mods.xpath("//mods:title/text()",
                              namespaces=self._MODS_NS)
            self.title = re.sub(r"\s+", " ", node[0].strip()) \
                if node else u"Untitled"
            # Shorten over-long titles step by step: drop everything after
            # a colon, then after a semicolon, then cut at a word boundary.
            if len(self.title) > 256:
                self.title = re.sub(r":.*", "", self.title)
            if len(self.title) > 256:
                self.title = re.sub(r";.*", "", self.title)
            if len(self.title) > 256:
                self.title = re.sub(r"\s[^\s]*$", "", self.title[:255])
            if len(self.title) > 256:
                self.title = self.title[:250]
            node = mods.xpath("//mods:abstract/text()",
                              namespaces=self._MODS_NS)
            self.description = re.sub(r"\s+", " ", node[0].strip()) \
                if node else ""
            if self.mods2dc:
                self.dc_xml = etree.tostring(self.mods2dc(mods),
                                             pretty_print=True)
        else:
            logging.warning("No MODS metadata available")

        # Extract file reference
        files = self.dom.xpath("//mets:fileSec//mets:file",
                               namespaces=self._METS_NS)
        if len(files) != 1:
            logging.error("Can only process METS record with a single file!")
            return 0
        file_node = files[0]
        if not len(file_node):
            # The location attributes are read from the first child element.
            logging.error("METS file element has no location child!")
            return 0
        location = file_node[0]
        self.archive_file = {
            'mimeType': file_node.get("MIMETYPE"),
            'locType': location.get("LOCTYPE"),
            'uri': location.get(
                "{http://www.w3.org/1999/xlink}href")
        }
        return 1

    def check_files(self):
        """
        Verify that the archive file is accessible.

        :returns: 1 if the URI could be opened, 0 if there is no archive
            file or (in a dry run) the URI could not be opened.
        :raises: the original error when the URI cannot be opened outside
            a dry run.
        """
        if not self.archive_file:
            return 0
        try:
            response = urllib2.urlopen(self.archive_file["uri"])
        except Exception:
            logging.warning("Could not access uri: {0}".format(
                self.archive_file["uri"]))
            if self.dry_run:
                return 0
            raise
        # Only reachability matters here; release the connection.
        response.close()
        return 1

    def reset(self):
        """
        Clear all item-based state data.
        """
        self.title = None
        self.description = None
        self.archive_file = None
        self.dom = None
        self.mods_xml = None
        self.vra_xml = None
        self.dc_xml = None

    def _build_policy(self, protected, apia_groups, **apim_subjects):
        """Build the XACML policy shared by the objects created here."""
        policy = xacml.XACML()
        policy.deny('deny-datastreams',
                    resources={
                        'urn:fedora:names:fedora:2.1:resource:datastream:id':
                            list(protected)},
                    actions={
                        'urn:fedora:names:fedora:2.1:action:id': [
                            'urn:fedora:names:fedora:2.1:action:id-getDatastreamDissemination']})
        # apim_subjects carries the optional groups=/users= restriction.
        policy.deny('deny-apim',
                    actions={
                        'urn:fedora:names:fedora:2.1:action:api': [
                            'urn:fedora:names:fedora:2.1:action:api-m']},
                    **apim_subjects)
        policy.deny('deny-apia',
                    actions={
                        'urn:fedora:names:fedora:2.1:action:api': [
                            'urn:fedora:names:fedora:2.1:action:api-a'],
                        'urn:fedora:names:fedora:2.1:action:id': [
                            'urn:fedora:names:fedora:2.1:action:id-listObjectInResourceIndexResults']},
                    groups=list(apia_groups))
        policy.permit('everything-else')
        return policy

    def _create_related_object(self, label, relations, policy=None):
        """Create an object with RELS-EXT (and optional POLICY), record it as a derivative, return its PID."""
        new_pid = self.fedora.getNextPID(unicode(self.namespace))
        obj = self.fedora.createObject(new_pid, label=label)
        obj.ownerId = self.fedoraUser
        obj.addDataStream('RELS-EXT',
                          checksumType=u'DISABLED',
                          label=u'RDF Statements about this object')
        ds = obj['RELS-EXT']
        # relations is an ordered list of (predicate, target) pairs.
        for predicate, target in relations:
            ds[predicate].append({
                'value': u'info:fedora/{0}'.format(target),
                'type': u'uri'
            })
        ds.setContent()
        if policy is not None:
            obj.addDataStream('POLICY', policy.policy(),
                              checksumType=u'DISABLED',
                              label=u'XACML Policy',
                              logMessage=u'Added POLICY datastream')
        self.derivatives.append(new_pid)
        return new_pid

    def digital_object(self):
        """
        Create the main Fedora object for the current item.

        :returns: the new PID, or None in a dry run (nothing is created).
        """
        pid = None
        # Add a POLICY datastream for this object
        policy = self._build_policy(['POLICY', 'RELS-EXT'],
                                    ['permitted', 'group', 'names'],
                                    groups=['permitted', 'group', 'names'])
        if self.dry_run:
            logging.info("Processing new object")
        else:
            pid = self.fedora.getNextPID(unicode(self.namespace))
            logging.info("Creating object with PID: {0}".format(pid))
            # self.title is still None when the record had no MODS.
            title = self.title if self.title is not None else u"Untitled"
            obj = self.fedora.createObject(
                pid,
                label=unicode(title.encode("ascii",
                                           errors="xmlcharrefreplace")))
            obj.ownerId = self.fedoraUser
            self.add_rels_ext(pid, ['my:contentModel1',
                                    'my:contentModel2'])
            obj.addDataStream('POLICY', policy.policy(),
                              label=u'XACML Policy',
                              checksumType=u'DISABLED',
                              logMessage=u'Added POLICY datastream')
        return pid

    def vra_object(self, pid):
        """
        Create the VRA Core metadata object that ``isMetadataFor`` *pid*.

        :returns: the new PID, or None when *pid* is None.
        """
        pidVRA = None
        if pid is not None:
            pidVRA = self._create_related_object(
                u"VRA Metadata for {0}".format(pid),
                [(NS.fedora.isMetadataFor, pid),
                 (NS.fedora_model.hasModel, u'my:vraContentModel')])
            self.add_vra(pidVRA, pid)
        return pidVRA

    def master_object(self, pid, filename):
        """
        Create the master-image object that ``isPartOf`` *pid* and upload
        *filename* as its ``OBJ`` datastream.

        :returns: the new PID, or None when *pid* is None.
        """
        pidObj = None
        # Add a POLICY datastream for this object
        policy = self._build_policy(['POLICY', 'RELS-EXT', 'OBJ'],
                                    ['permitted', 'group', 'names'])
        if pid is not None:
            pidObj = self._create_related_object(
                u"Master Object for {0}".format(pid),
                [(NS.fedora.isPartOf, pid),
                 (NS.fedora_model.hasModel, u'my:archivalContentModel')],
                policy=policy)
            self.add_master(pidObj, filename,
                            unicode(self.archive_file['mimeType']))
        return pidObj

    def exif_object(self, pid, filename):
        """
        Create the EXIF metadata object for *pid* from *filename*.

        :returns: the new PID, or None when *pid* is None.
        """
        pidEXIF = None
        if pid is not None:
            pidEXIF = self._create_related_object(
                u"EXIF Metadata for {0}".format(pid),
                [(NS.fedora.isMetadataFor, pid),
                 (NS.fedora_model.hasModel, u'my:exifContentModel'),
                 (NS.fedora.isDerivationOf, pid)])
            self.add_exif(pidEXIF, filename)
        return pidEXIF

    def jp2_object(self, pid, original, filename):
        """
        Create the JPEG 2000 object that ``isPartOf`` *pid* and
        ``isDerivationOf`` *original*, converting *filename*.

        :returns: the new PID, or None when *pid* is None.
        """
        pidJP2 = None
        # Add a POLICY datastream for this object
        policy = self._build_policy(['POLICY', 'RELS-EXT'],
                                    ['permitted', 'group', 'names'],
                                    users=['librarian'])
        if pid is not None:
            pidJP2 = self._create_related_object(
                u"JP2 Object for {0}".format(pid),
                [(NS.fedora.isPartOf, pid),
                 (NS.fedora.isDerivationOf, original),
                 (NS.fedora_model.hasModel, u'my:jp2ContentModel')],
                policy=policy)
            self.add_jp2(pidJP2, filename)
        return pidJP2

    def jpeg_object(self, pid, original, filename):
        """
        Create the JPEG object that ``isPartOf`` *pid* and
        ``isDerivationOf`` *original*.

        The full-size and medium images go on the new object; the small,
        thumbnail and icon images are added to the main object *pid*.

        :returns: the new PID, or None when *pid* is None.
        """
        pidJPEG = None
        # Add a POLICY datastream for this object
        policy = self._build_policy(['POLICY', 'RELS-EXT'],
                                    ['permitted', 'user', 'groups'],
                                    users=['librarian'])
        if pid is not None:
            pidJPEG = self._create_related_object(
                u'JPEG Object for {0}'.format(pid),
                [(NS.fedora.isPartOf, pid),
                 (NS.fedora_model.hasModel, u'my:jpegContentModel'),
                 (NS.fedora.isDerivationOf, original)],
                policy=policy)
            self.add_jpeg(pidJPEG, filename, "FULL_SIZE")
            # Use the medium image for generating the smaller items.
            medium = self.add_scaled_jpeg(pidJPEG, filename, "MEDIUM_SIZE",
                                          640, 480, delete=False)
            try:
                # Add these to the main object!
                self.add_scaled_jpeg(pid, medium, "SMALL_SIZE", 200, 150)
                self.add_scaled_jpeg(pid, medium, "TN", 88, 66)
                self.add_cropped_jpeg(pid, medium, "ICON", 32, 24)
            finally:
                # Remove the kept medium image even if a derivative failed.
                os.unlink(medium)
        return pidJPEG

    def ingest(self, filename):
        """
        Ingest the METS record in *filename*.

        :returns: the PID of the main object (None in a dry run, or when
            the metadata or archive file check fails).  If a later step
            fails, the error is logged and the PID of the main object that
            was already created is still returned.
        """
        self.reset()
        self.dom = etree.parse(filename)
        pid = None
        if not self.extract_metadata():
            return pid
        if not self.check_files():
            return pid

        pids = []
        try:
            # Get the archive file
            with self.local_copy() as master_file:
                # Create the main object
                pid = self.digital_object()
                pids.append(pid)
                # Create a VRA Core object which
                # 'isMetadataFor' the main object
                vra_pid = self.vra_object(pid)
                pids.append(vra_pid)
                # Create an object part for the master image
                master_pid = self.master_object(pid, master_file)
                pids.append(master_pid)
                # Create an object for the EXIF metadata
                # which 'isMetadataFor' the Master file
                exif_pid = self.exif_object(master_pid, master_file)
                pids.append(exif_pid)
                # Create a JPEG 2000 object
                jp2_pid = self.jp2_object(pid, master_pid, master_file)
                pids.append(jp2_pid)
                # Create a JPEG object
                jpeg_pid = self.jpeg_object(pid, master_pid, master_file)
                pids.append(jpeg_pid)
                # Add the MODS datastream to the main digital object.
                # This is done last so that the VRA object has already
                # been added when fedoragsearch sees the MODS record added.
                self.add_mods(pid)
        except Exception:
            if pid:
                logging.error("Error processing PID: {0}".format(pid))
                # Skip entries that were never created (None).
                created = [p for p in pids if p]
                if created:
                    logging.error("May need to clean up the following pids: {0}".format(", ".join(created)))
            else:
                logging.error("Error processing file")
            logging.error(traceback.format_exc(6))
            traceback.print_exc(6)
        return pid

    @contextmanager
    def local_copy(self):
        """
        Store the master file locally for easy retrieval.

        Downloads ``self.archive_file['uri']`` to a temporary file, yields
        its path, and removes the file afterwards.  Errors raised inside
        the ``with`` body propagate to the caller after the cleanup.
        """
        logging.debug("Saving file locally.")
        uri = self.archive_file['uri']
        # splitext() returns (root, ext); only the extension is a suffix.
        suffix = os.path.splitext(uri)[1]
        chunk_size = 16 * 1024
        fp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
        filename = fp.name
        try:
            try:
                req = urllib2.urlopen(uri)
                try:
                    for chunk in iter(lambda: req.read(chunk_size), b""):
                        fp.write(chunk)
                finally:
                    req.close()
            finally:
                fp.close()
            yield filename
        finally:
            os.unlink(filename)

    def add_mods(self, pid):
        """
        Add the MODS datastream

        Before storing, the record's ``identifier[@type='uri']`` and its
        primary-display ``location/url`` are set to this object's
        repository URLs (created if missing).  Nothing happens when there
        is no MODS record or *pid* is None.
        """
        if not (self.mods_xml and pid is not None):
            return
        ns = self._MODS_NS
        parser = etree.XMLParser(remove_blank_text=True)
        doc = etree.fromstring(self.mods_xml, parser)

        ident = doc.xpath("/mods:mods/mods:identifier[@type='uri']",
                          namespaces=ns)
        if len(ident):
            ident[0].text = u"https://my.repository.url/{0}".format(pid)
        else:
            identifier = etree.Element(u"{http://www.loc.gov/mods/v3}identifier", type=u"uri")
            identifier.text = u"https://my.repository.url/{0}".format(pid)
            # Keep identifiers together: insert after the last existing one.
            ident = doc.xpath("/mods:mods/mods:identifier", namespaces=ns)
            if len(ident):
                ident[-1].addnext(identifier)
            else:
                doc.append(identifier)

        loc = doc.xpath("/mods:mods/mods:location", namespaces=ns)
        if not len(loc):
            loc = [etree.SubElement(doc, u"{http://www.loc.gov/mods/v3}location")]
        url = doc.xpath("/mods:mods/mods:location/mods:url[@usage='primary display']", namespaces=ns)
        if len(url):
            url[0].text = u"http://my.repository.url/{0}".format(pid)
        else:
            url = etree.Element(u"{http://www.loc.gov/mods/v3}url", usage="primary display")
            url.text = u"http://my.repository.url/{0}".format(pid)
            loc[0].append(url)

        self.mods_xml = etree.tostring(doc, pretty_print=True)
        obj = self.fedora.getObject(pid)
        obj.addDataStream('MODS', self.mods_xml,
                          label=u'MODS Metadata', mimeType=u'text/xml',
                          controlGroup=u'X',
                          checksumType=u'DISABLED',
                          logMessage=u'Added MODS datastream')

    def add_vra(self, pid, parentPid):
        """
        Add the VRA Core datastream, if available

        :param pid: PID of the VRA object that receives the datastream.
        :param parentPid: PID of the main object; its repository URL is
            written to the ``href`` of the first ``vra:image`` element.
        """
        if not self.vra_xml:
            return
        # Add the canonical URL for this object
        dom = etree.fromstring(self.vra_xml)
        nodes = dom.xpath("//vra:image", namespaces=self._VRA_NS)
        if len(nodes):
            nodes[0].set("href",
                         "https://my.repository.url/{0}".format(parentPid))
        self.vra_xml = etree.tostring(dom, pretty_print=True)
        if pid is not None:
            obj = self.fedora.getObject(pid)
            obj.addDataStream('VRA', self.vra_xml,
                              label=u'VRA Core Metadata',
                              mimeType=u'text/xml', controlGroup=u'X',
                              checksumType=u'DISABLED',
                              logMessage=u'Added VRA Core Metadata')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import argparse | |
import logging | |
from image import ImageIngester | |
def find_mets_files(directory):
    """Return the sorted paths of the files in *directory* whose name ends with ``mets.xml``."""
    return [os.path.join(directory, name)
            for name in sorted(os.listdir(directory))
            if name.endswith("mets.xml")]


def main(argv=None):
    """
    Ingest every METS file of a data directory into one collection.

    :param argv: argument list to parse; defaults to ``sys.argv[1:]``.
    """
    # Set up the argument parser.
    parser = argparse.ArgumentParser(
        description='Ingest objects into fedora.')
    parser.add_argument("--namespace", required=True,
                        choices=["my", "accepted", "name", "spaces"],
                        help="The namespace to be used by these objects.")
    parser.add_argument("--collection", required=True,
                        help="The name of the collection")
    parser.add_argument("--data", required=True,
                        help="The location of the data files to ingest.")
    parser.add_argument("--parent", default="collection:root",
                        help="This will be the parent of this collection")
    parser.add_argument("--config", default="my.cfg",
                        help="A configuration file containing " +
                             "the fedora connection information.")
    parser.add_argument("--collectionTN", default="TN.png",
                        help="The location of a thumbnail image for the collection")
    parser.add_argument("--label", default=u"My Collection Name",
                        help="The label given to the collection if it is created")
    parser.add_argument("--dryrun", default=False, action='store_true')
    args = parser.parse_args(argv)

    ingester = ImageIngester(args.config, args.namespace, dry_run=args.dryrun)
    # The ingester method is add_collection(); there is no collection().
    ingester.add_collection(args.collection, args.label,
                            parent=args.parent, tn=args.collectionTN)

    mets_files = find_mets_files(args.data)
    total = len(mets_files)
    for i, path in enumerate(mets_files, 1):
        logging.info("{0}/{1} Processing {2}".format(i, total, path))
        ingester.processMETS(path)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment