Skip to content

Instantly share code, notes, and snippets.

@sprklinginfo
Created August 19, 2014 19:04
Show Gist options
  • Save sprklinginfo/e5a9be59ad1ecc5b5c46 to your computer and use it in GitHub Desktop.
Save sprklinginfo/e5a9be59ad1ecc5b5c46 to your computer and use it in GitHub Desktop.
import subprocess
import tempfile
import MySQLdb
import ConfigParser
import sys
import os
import logging
from fcrepo.utils import NS
from fcrepo.connection import Connection, FedoraConnectionException
from fcrepo.client import FedoraClient
from lxml.builder import ElementMaker
from . import xacml
# Path of the ImageMagick ``convert`` executable that is run (via subprocess)
# to produce the JP2, PNG and JPEG derivatives.
CONVERT = "/opt/ImageMagick-6.7/bin/convert"
class BatchIngester(object):
    """
    Base class for ingesting a batch of METS records into Fedora.

    The class owns the Fedora client, the MySQL "ingest log" connection
    and a set of helpers that create datastreams (RELS-EXT, master file,
    EXIF, JP2, PNG and JPEG derivatives).  Subclasses implement
    ``ingest`` to describe how a single METS file becomes Fedora objects.

    When ``dry_run`` is true the object creation code paths receive a
    ``None`` pid; every ``add_*`` helper then performs its conversion
    work but skips the upload to Fedora.
    """

    # Names accepted for the ``loglevel`` option of the [Logging] section.
    LOG_LEVELS = {
        'DEBUG': logging.DEBUG,
        'INFO': logging.INFO,
        'WARNING': logging.WARNING,
        'ERROR': logging.ERROR,
        'CRITICAL': logging.CRITICAL,
    }

    def __init__(self, config, namespace, dry_run=False):
        """
        Create an instance of BatchIngester and establish
        a connection to the Fedora repository.

        config    -- path of an INI file with the sections IngestLog,
                     Logging and Fedora.
        namespace -- alphanumeric PID namespace for new objects.
        dry_run   -- when true, no objects are written to Fedora.

        Raises ValueError (an Exception subclass) for a namespace that
        is not purely alphanumeric.
        """
        if not namespace.isalnum():
            raise ValueError("Invalid namespace: {0}".format(namespace))
        self.namespace = namespace
        self.collections = []
        # Initialised here as well as in processMETS so that calling
        # ingest() directly does not fail on a missing attribute.
        self.derivatives = []
        self.dry_run = dry_run

        # Read the configuration file
        conf = ConfigParser.ConfigParser()
        conf.read(config)

        # Connect to the logging database
        self.db = MySQLdb.connect(host=conf.get("IngestLog", "hostname"),
                                  db=conf.get("IngestLog", "database"),
                                  user=conf.get("IngestLog", "username"),
                                  passwd=conf.get("IngestLog", "password"))
        self.db.set_character_set("UTF8")

        # Set up logging.  %(message)s is the documented LogRecord
        # attribute for the formatted text; the level name is matched
        # case-insensitively and an unknown name still raises KeyError.
        level_name = conf.get('Logging', 'loglevel').strip().upper()
        logging.basicConfig(
            format="%(asctime)s (%(levelname)s): %(message)s",
            datefmt="%b %d %I:%M:%S %p",
            filename=conf.get('Logging', 'logfile'),
            level=self.LOG_LEVELS[level_name])

        # Connect to Fedora
        self.fedoraUser = unicode(conf.get('Fedora', 'username'))
        connection = Connection(conf.get('Fedora', 'url'),
                                username=self.fedoraUser,
                                password=conf.get('Fedora', 'password'))
        self.fedora = FedoraClient(connection)
        self.config = conf

    @staticmethod
    def _collection_policy():
        """
        Build the XACML policy attached to newly created collections.
        """
        access = xacml.XACML()
        access.deny('deny-datastreams',
                    resources={
                        'urn:fedora:names:fedora:2.1:resource:datastream:id': [
                            'POLICY', 'RELS-EXT']},
                    actions={
                        'urn:fedora:names:fedora:2.1:action:id': [
                            'urn:fedora:names:fedora:2.1:action:id-getDatastreamDissemination']})
        access.deny('deny-apim',
                    actions={
                        "urn:fedora:names:fedora:2.1:action:api": [
                            "urn:fedora:names:fedora:2.1:action:api-m"]},
                    groups=["librarian"])
        access.deny('deny-apia',
                    actions={
                        'urn:fedora:names:fedora:2.1:action:api': [
                            'urn:fedora:names:fedora:2.1:action:api-a'],
                        'urn:fedora:names:fedora:2.1:action:id': [
                            'urn:fedora:names:fedora:2.1:action:id-listObjectInResourceIndexResults']},
                    groups=['authenticated user', 'librarian', 'administrator'])
        access.permit('permit-everything-else')
        return access

    def _create_collection(self, pid, label, parent):
        """
        Create the collection object with its POLICY and RELS-EXT.
        """
        access = self._collection_policy()
        obj = self.fedora.createObject(pid, label=label)
        obj.ownerId = self.fedoraUser
        obj.addDataStream('POLICY', access.policy(),
                          checksumType=u'DISABLED',
                          label=u'XACML Policy',
                          logMessage=u'Added POLICY datastream')
        obj.addDataStream('RELS-EXT',
                          label=u'Fedora object-to-object relationships',
                          checksumType=u'DISABLED')
        ds = obj['RELS-EXT']
        ds[NS.fedora.isMemberOfCollection].append({
            'value': u'info:fedora/{0}'.format(parent),
            'type': u'uri'
        })
        ds[NS.fedora_model.hasModel].append({
            'value': u'info:fedora/my:collectionCModel',
            'type': u'uri'
        })
        ds.setContent()

    def add_collection(self, collection, label, parent="collection:root",
                       tn="TN.png"):
        """
        Add a collection to fedora if it doesn't already exist.
        This also sets the current "parent" of all ingested items.

        collection -- either a full PID ("ns:name") or a bare
                      alphanumeric name, which is prefixed with the
                      ingester's namespace.
        label      -- label of the collection object if it is created.
        parent     -- PID of the parent collection; it must exist.
        tn         -- path of a thumbnail image for the collection.

        Raises ValueError (an Exception subclass) when ``collection`` is
        neither of the two accepted forms.
        """
        pid = None
        parts = collection.split(':')
        if len(parts) == 2:
            pid = collection
        elif len(parts) == 1 and collection.isalnum():
            pid = self.namespace + ':' + collection
        if not pid:
            raise ValueError(
                "Invalid collection name: '{0}'".format(collection))

        # A dry run validates the name but never writes to Fedora.
        if self.dry_run:
            pid = None

        logging.debug(
            "Checking existence of parent: '{0}'".format(parent))
        self.fedora.getObject(parent)

        logging.debug(
            "Checking existence of collection: '{0}'".format(pid))
        try:
            self.fedora.getObject(pid)
        except FedoraConnectionException as ex:
            if ex.httpcode == 404:
                # Collection does not exist, so create it now.
                logging.info("Creating object: '{0}'".format(pid))
                if pid is not None:
                    try:
                        self._create_collection(pid, label, parent)
                    except Exception:
                        # Creation stays best-effort, but the failure is
                        # now recorded instead of silently discarded.
                        logging.exception(
                            "Could not create collection object: "
                            "{0}".format(pid))
                if os.path.exists(tn):
                    self.add_png(pid, tn, "TN")
                else:
                    logging.warning(
                        "Missing TN datastream for collection object")
            else:
                logging.error(
                    "Could not access collection object: {0}".format(pid))

        # Set the current collection
        self.collections.append(pid)

    def processMETS(self, filename):
        """
        Process a METS file if it hasn't already been ingested.
        Update the database after processing.

        Returns 0 when the file is already recorded in the ``log``
        table and None otherwise.
        """
        cursor = self.db.cursor()
        try:
            # The parameters must be a sequence, hence the 1-tuple.
            cursor.execute("""
                SELECT pid, indexed
                FROM log
                WHERE filename=%s""", (filename,))
            if cursor.fetchone():
                logging.info(
                    "{0} has already been processed".format(filename))
                return 0

            self.derivatives = []
            pid = self.ingest(filename)
            if pid is not None:
                cursor.execute("""
                    INSERT INTO log (pid, filename, indexed)
                    VALUES (%s, %s, NOW())""", (pid, filename))
                for derivative in self.derivatives:
                    cursor.execute("""
                        INSERT INTO deriv (parent, pid)
                        VALUES (%s, %s)""", (pid, derivative))
                # MySQLdb connections start with autocommit disabled, so
                # the rows are lost on transactional tables unless the
                # transaction is committed explicitly.
                self.db.commit()
        finally:
            cursor.close()

    def add_rels_ext(self, pid, models=()):
        """
        Add the RELS_EXT datastream.

        The object is made a member of every collection registered with
        add_collection and receives one hasModel statement per entry of
        ``models``.  Nothing happens when ``pid`` is None.
        """
        logging.debug("Adding RELS_EXT datastream")
        if pid is None:
            return
        obj = self.fedora.getObject(pid)
        obj.addDataStream('RELS-EXT',
                          checksumType=u'DISABLED',
                          label=u"Fedora object-to-object relationships")
        ds = obj['RELS-EXT']
        for collection in self.collections:
            ds[NS.fedora.isMemberOfCollection].append({
                'value': u'info:fedora/{0}'.format(collection),
                'type': u'uri'
            })
        for model in models:
            ds[NS.fedora_model.hasModel].append({
                'value': u'info:fedora/{0}'.format(model),
                'type': u'uri'
            })
        ds.setContent()

    # ------------------------------------------------------------------
    # Private helpers shared by the add_* datastream methods
    # ------------------------------------------------------------------

    @staticmethod
    def _remove_quietly(filename):
        """
        Delete a temporary file, ignoring a file that is already gone.
        """
        try:
            os.unlink(filename)
        except OSError:
            pass

    def _convert_to_temp(self, source, suffix, options=()):
        """
        Run ``convert source <options> <temp file>`` and return the
        name of the temporary output file.  The caller owns that file.
        """
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as fp:
            filename = fp.name
        try:
            p = subprocess.Popen([CONVERT, source] + list(options) + [filename],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            out, err = p.communicate()
        except Exception:
            # Do not leave the empty temp file behind if convert
            # could not be started.
            self._remove_quietly(filename)
            raise
        if err:
            logging.debug(err)
        return filename

    def _store_managed_file(self, pid, dsid, filename, **ds_args):
        """
        Upload ``filename`` as managed (control group M) datastream
        ``dsid`` with an MD5 checksum.  Does nothing if ``pid`` is None.
        """
        if pid is None:
            return
        obj = self.fedora.getObject(pid)
        # The datastream is created with placeholder content and then
        # replaced by the real file content.
        obj.addDataStream(dsid, 'tempData',
                          checksumType=u'MD5',
                          controlGroup=u'M',
                          **ds_args)
        with open(filename, 'rb') as fp:
            obj[dsid].setContent(fp)

    def _publish_derivative(self, pid, dsid, filename, delete, **ds_args):
        """
        Upload a converted temp file and dispose of it.

        Returns the file name when ``delete`` is false (the caller then
        owns the file) and None otherwise.  The file is always removed
        when the upload fails.
        """
        keep = False
        try:
            self._store_managed_file(pid, dsid, filename, **ds_args)
            keep = not delete
        finally:
            if not keep:
                self._remove_quietly(filename)
        return filename if keep else None

    # ------------------------------------------------------------------
    # Datastream helpers
    # ------------------------------------------------------------------

    def add_master(self, pid, filename, mimeType):
        """
        Add the Master file as a datastream.
        """
        logging.debug("Adding Master datastream")
        self._store_managed_file(pid, 'OBJ', filename,
                                 label=u'Archival Image',
                                 mimeType=mimeType,
                                 logMessage=u'Added Master image')

    def add_exif(self, pid, filename):
        """
        Extract the EXIF metadata and add it to Fedora.

        ``exiftool -X`` is run even when ``pid`` is None; only the
        upload is skipped in that case.
        """
        logging.debug("Extracting EXIF metadata")
        p = subprocess.Popen(["exiftool", "-X", filename],
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = p.communicate()
        if err:
            logging.debug(err)
        if pid is not None:
            obj = self.fedora.getObject(pid)
            obj.addDataStream('EXIF', out, label=u'EXIF Metadata',
                              checksumType=u'DISABLED',
                              mimeType=u'text/xml', controlGroup=u'X',
                              logMessage=u'Added EXIF Metadata')

    def add_jp2(self, pid, source):
        """
        Convert the file to JPEG 2000 format and add it
        to fedora as a datastream.
        """
        logging.debug("Converting to jp2; adding to fedora")
        filename = self._convert_to_temp(
            source, '.jp2',
            ["-define", "jp2:tilewidth=256",
             "-define", "jp2:tileheight=256"])
        self._publish_derivative(pid, 'JP2', filename, True,
                                 label=u'Pyramid JPEG2000 Image',
                                 mimeType=u'image/jp2',
                                 logMessage=u'Added JPEG 2000 Image')

    def add_png(self, pid, source, dsid, delete=True):
        """
        Create an unscaled png image and add it to fedora as a datastream.

        Returns the temp file name when ``delete`` is false, else None.
        """
        logging.debug("Creating {0} png".format(dsid))
        filename = self._convert_to_temp(source, '.png')
        return self._publish_derivative(
            pid, dsid, filename, delete,
            label=u"{0} png image".format(dsid),
            mimeType=u'image/png')

    def add_jpeg(self, pid, source, dsid, delete=True):
        """
        Create an unscaled jpeg image and add it to fedora as a datastream.

        Returns the temp file name when ``delete`` is false, else None.
        """
        logging.debug("Creating {0} jpeg".format(dsid))
        filename = self._convert_to_temp(source, '.jpg')
        return self._publish_derivative(
            pid, dsid, filename, delete,
            label=u"{0} jpeg image".format(dsid),
            mimeType=u'image/jpeg')

    def add_scaled_jpeg(self, pid, source, dsid, width, height, delete=True):
        """
        Create a scaled jpeg image and add it to fedora as a datastream.

        Returns the temp file name when ``delete`` is false, else None.
        """
        logging.debug("Creating {0} jpeg".format(dsid))
        filename = self._convert_to_temp(
            source, '.jpg',
            ["-resize", ">{0}x{1}".format(width, height)])
        return self._publish_derivative(
            pid, dsid, filename, delete,
            label=u"{0} jpeg image".format(dsid),
            mimeType=u'image/jpeg')

    def add_cropped_jpeg(self, pid, source, dsid, width, height):
        """
        Create a scaled and cropped jpeg image and add it to fedora.
        """
        logging.debug("Creating {0} jpeg".format(dsid))
        filename = self._convert_to_temp(
            source, '.jpg',
            ["-resize", ">{0}x{1}^".format(width, height),
             "-gravity", "North",
             "-extent", "{0}x{1}".format(width, height)])
        self._publish_derivative(pid, dsid, filename, True,
                                 label=u'{0} jpeg image'.format(dsid),
                                 mimeType=u'image/jpeg',
                                 logMessage=u'Added {0} image'.format(dsid))

    def ingest(self, filename):
        """
        Ingest one METS file and return the PID of the main object.

        The base implementation does nothing and returns None, which
        processMETS treats as "nothing to record".  Subclasses override
        this method.
        """
        return None
import traceback
import time
import sys
import os
import urllib2
import re
import tempfile
import logging
from contextlib import contextmanager
from fcrepo.utils import NS
from lxml import etree
from . import batchingester
from . import xacml
class ImageIngester(batchingester.BatchIngester):
    """
    Ingest image objects described by METS records.

    For every METS file one main digital object is created together
    with separate objects for the VRA Core metadata, the master image,
    its EXIF metadata, a JPEG 2000 derivative and a JPEG derivative.
    """

    # Compiled XSLT transforms; they stay None when the stylesheet
    # named in the [XSLT] configuration section does not exist.
    vra2mods = None
    mods2dc = None

    # XML namespace maps used in XPath queries.
    _MODS_NS = {'mods': 'http://www.loc.gov/mods/v3'}
    _VRA_NS = {'vra': 'http://www.vraweb.org/vracore4.htm'}
    _METS_NS = {'mets': 'http://www.loc.gov/METS/'}

    def __init__(self, config, namespace, dry_run=False):
        """
        Connect to Fedora (see BatchIngester) and load the XSLT
        stylesheets configured as ``mods2dc`` and ``vra2mods``.
        """
        super(ImageIngester, self).__init__(config, namespace, dry_run)
        mods_to_dc = self.config.get("XSLT", "mods2dc")
        self.reset()
        if os.path.exists(mods_to_dc):
            self.mods2dc = etree.XSLT(etree.parse(mods_to_dc))
        vra_to_mods = self.config.get("XSLT", "vra2mods")
        if os.path.exists(vra_to_mods):
            self.vra2mods = etree.XSLT(etree.parse(vra_to_mods))

    @staticmethod
    def _shorten_title(title):
        """
        Reduce a title to at most 256 characters: first drop everything
        after a colon, then after a semicolon, and finally cut at a word
        boundary.
        """
        if len(title) > 256:
            title = re.sub(r":.*", "", title)
        if len(title) > 256:
            title = re.sub(r";.*", "", title)
        if len(title) > 256:
            title = re.sub(r"\s[^\s]*$", "", title[:255])
        if len(title) > 256:
            title = title[:250]
        return title

    def extract_metadata(self):
        """
        Extract metadata from the METS record.

        Fills in mods_xml, vra_xml, dc_xml, title, description and
        archive_file from ``self.dom``.  Returns 1 on success and 0 when
        the record does not reference exactly one usable file.
        """
        # Extract the MODS datastream
        mods = self.dom.xpath("//mods:mods", namespaces=self._MODS_NS)
        self.mods_xml = etree.tostring(mods[0],
                                       pretty_print=True) if mods else None

        # Extract the VRA Core datastream if it exists.
        vra = self.dom.xpath("//vra:vra", namespaces=self._VRA_NS)
        self.vra_xml = etree.tostring(vra[0],
                                      pretty_print=True) if vra else None

        # Convert VRA to MODS if it doesn't already exist.  The ``vra``
        # check avoids an IndexError for records with neither format.
        if self.mods_xml is None and self.vra2mods and vra:
            self.mods_xml = etree.tostring(self.vra2mods(vra[0]),
                                           pretty_print=True)

        # Extract title and description from MODS
        if self.mods_xml:
            mods = etree.fromstring(self.mods_xml)
            node = mods.xpath("//mods:title/text()",
                              namespaces=self._MODS_NS)
            title = re.sub(r"\s+", " ", node[0].strip()) \
                if node else u"Untitled"
            self.title = self._shorten_title(title)
            node = mods.xpath("//mods:abstract/text()",
                              namespaces=self._MODS_NS)
            self.description = re.sub(r"\s+", " ", node[0].strip()) \
                if node else ""
            if self.mods2dc:
                self.dc_xml = etree.tostring(self.mods2dc(mods),
                                             pretty_print=True)
        else:
            logging.warning("No MODS metadata available")

        # Extract file reference
        files = self.dom.xpath("//mets:fileSec//mets:file",
                               namespaces=self._METS_NS)
        if len(files) != 1:
            logging.error("Can only process METS record with a single file!")
            return 0
        file_node = files[0]
        if len(file_node) == 0:
            # The location is read from the first child element of the
            # <file> element; without one there is nothing to fetch.
            logging.error("METS file element has no location child")
            return 0
        locator = file_node[0]
        self.archive_file = {
            'mimeType': file_node.get("MIMETYPE"),
            'locType': locator.get("LOCTYPE"),
            'uri': locator.get("{http://www.w3.org/1999/xlink}href")
        }
        return 1

    def check_files(self):
        """
        Verify that the archive file is accessible.

        Returns 1 when the URI can be opened and 0 when there is no
        archive file or, during a dry run, when it cannot be opened.
        Outside a dry run the error from urlopen is re-raised.
        """
        if not self.archive_file:
            return 0
        try:
            response = urllib2.urlopen(self.archive_file["uri"])
        except Exception:
            logging.warning("Could not access uri: {0}".format(
                self.archive_file["uri"]))
            if self.dry_run:
                return 0
            raise
        # Only reachability matters here; release the connection.
        response.close()
        return 1

    def reset(self):
        """
        Clear all item-based state data.
        """
        self.title = None
        self.description = None
        self.archive_file = None
        self.dom = None
        self.mods_xml = None
        self.vra_xml = None
        self.dc_xml = None

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _build_policy(datastreams, apim_subjects, apia_groups):
        """
        Build an XACML policy.

        datastreams   -- datastream ids whose dissemination is denied.
        apim_subjects -- keyword arguments (``groups=`` / ``users=``)
                         passed to the API-M deny rule; may be empty.
        apia_groups   -- groups passed to the API-A deny rule.
        """
        policy = xacml.XACML()
        policy.deny('deny-datastreams',
                    resources={
                        'urn:fedora:names:fedora:2.1:resource:datastream:id':
                            list(datastreams)},
                    actions={
                        'urn:fedora:names:fedora:2.1:action:id': [
                            'urn:fedora:names:fedora:2.1:action:id-getDatastreamDissemination']})
        policy.deny('deny-apim',
                    actions={
                        'urn:fedora:names:fedora:2.1:action:api': [
                            'urn:fedora:names:fedora:2.1:action:api-m']},
                    **apim_subjects)
        policy.deny('deny-apia',
                    actions={
                        'urn:fedora:names:fedora:2.1:action:api': [
                            'urn:fedora:names:fedora:2.1:action:api-a'],
                        'urn:fedora:names:fedora:2.1:action:id': [
                            'urn:fedora:names:fedora:2.1:action:id-listObjectInResourceIndexResults']},
                    groups=list(apia_groups))
        policy.permit('everything-else')
        return policy

    def _create_related_object(self, label, relations, policy=None):
        """
        Create a new object with a RELS-EXT datastream (and optionally
        a POLICY datastream), register it as a derivative and return
        its PID.

        relations -- ordered (predicate, target pid) pairs; each becomes
                     an ``info:fedora/<target>`` URI statement.
        """
        new_pid = self.fedora.getNextPID(unicode(self.namespace))
        obj = self.fedora.createObject(new_pid, label=label)
        obj.ownerId = self.fedoraUser
        obj.addDataStream('RELS-EXT',
                          checksumType=u'DISABLED',
                          label=u'RDF Statements about this object')
        ds = obj['RELS-EXT']
        for predicate, target in relations:
            ds[predicate].append({
                'value': u'info:fedora/{0}'.format(target),
                'type': u'uri'
            })
        ds.setContent()
        if policy is not None:
            obj.addDataStream('POLICY', policy.policy(),
                              checksumType=u'DISABLED',
                              label=u'XACML Policy',
                              logMessage=u'Added POLICY datastream')
        self.derivatives.append(new_pid)
        return new_pid

    # ------------------------------------------------------------------
    # Object creation
    # ------------------------------------------------------------------

    def digital_object(self):
        """
        Create the main digital object and return its PID.

        During a dry run nothing is created and None is returned.
        """
        pid = None
        # Add a POLICY datastream for this object
        policy = self._build_policy(
            ['POLICY', 'RELS-EXT'],
            {'groups': ['permitted', 'group', 'names']},
            ['permitted', 'group', 'names'])
        if self.dry_run:
            logging.info("Processing new object")
        else:
            pid = self.fedora.getNextPID(unicode(self.namespace))
            logging.info("Creating object with PID: {0}".format(pid))
            # The title is unset when the record carried no MODS/VRA
            # metadata; fall back to the same default used elsewhere.
            title = self.title if self.title else u"Untitled"
            obj = self.fedora.createObject(
                pid,
                label=unicode(title.encode("ascii",
                                           errors="xmlcharrefreplace")))
            obj.ownerId = self.fedoraUser
            self.add_rels_ext(pid, ['my:contentModel1',
                                    'my:contentModel2'])
            obj.addDataStream('POLICY', policy.policy(),
                              label=u'XACML Policy',
                              checksumType=u'DISABLED',
                              logMessage=u'Added POLICY datastream')
        return pid

    def vra_object(self, pid):
        """
        Create the object holding the VRA Core metadata, which
        'isMetadataFor' the main object ``pid``.  Returns its PID, or
        None when ``pid`` is None.
        """
        pidVRA = None
        if pid is not None:
            pidVRA = self._create_related_object(
                u"VRA Metadata for {0}".format(pid),
                [(NS.fedora.isMetadataFor, pid),
                 (NS.fedora_model.hasModel, 'my:vraContentModel')])
        self.add_vra(pidVRA, pid)
        return pidVRA

    def master_object(self, pid, filename):
        """
        Create the object holding the master image, which 'isPartOf'
        the main object ``pid``.  Returns its PID, or None when ``pid``
        is None.
        """
        pidObj = None
        # Add a POLICY datastream for this object
        policy = self._build_policy(
            ['POLICY', 'RELS-EXT', 'OBJ'],
            {},
            ['permitted', 'group', 'names'])
        if pid is not None:
            pidObj = self._create_related_object(
                u"Master Object for {0}".format(pid),
                [(NS.fedora.isPartOf, pid),
                 (NS.fedora_model.hasModel, 'my:archivalContentModel')],
                policy)
        self.add_master(pidObj, filename,
                        unicode(self.archive_file['mimeType']))
        return pidObj

    def exif_object(self, pid, filename):
        """
        Create the object holding the EXIF metadata extracted from
        ``filename``; it 'isMetadataFor' and 'isDerivationOf' ``pid``.
        Returns its PID, or None when ``pid`` is None.
        """
        pidEXIF = None
        if pid is not None:
            pidEXIF = self._create_related_object(
                u"EXIF Metadata for {0}".format(pid),
                [(NS.fedora.isMetadataFor, pid),
                 (NS.fedora_model.hasModel, 'my:exifContentModel'),
                 (NS.fedora.isDerivationOf, pid)])
        self.add_exif(pidEXIF, filename)
        return pidEXIF

    def jp2_object(self, pid, original, filename):
        """
        Create the JPEG 2000 object: part of ``pid`` and a derivation
        of ``original``.  Returns its PID, or None when ``pid`` is None.
        """
        pidJP2 = None
        # Add a POLICY datastream for this object
        policy = self._build_policy(
            ['POLICY', 'RELS-EXT'],
            {'users': ['librarian']},
            ['permitted', 'group', 'names'])
        if pid is not None:
            pidJP2 = self._create_related_object(
                u"JP2 Object for {0}".format(pid),
                [(NS.fedora.isPartOf, pid),
                 (NS.fedora.isDerivationOf, original),
                 (NS.fedora_model.hasModel, 'my:jp2ContentModel')],
                policy)
        self.add_jp2(pidJP2, filename)
        return pidJP2

    def jpeg_object(self, pid, original, filename):
        """
        Create the JPEG object (full and medium size) and add the small,
        thumbnail and icon renditions to the main object ``pid``.
        Returns the JPEG object's PID, or None when ``pid`` is None.
        """
        pidJPEG = None
        # Add a POLICY datastream for this object
        policy = self._build_policy(
            ['POLICY', 'RELS-EXT'],
            {'users': ['librarian']},
            ['permitted', 'user', 'groups'])
        if pid is not None:
            pidJPEG = self._create_related_object(
                u'JPEG Object for {0}'.format(pid),
                [(NS.fedora.isPartOf, pid),
                 (NS.fedora_model.hasModel, 'my:jpegContentModel'),
                 (NS.fedora.isDerivationOf, original)],
                policy)
        self.add_jpeg(pidJPEG, filename, "FULL_SIZE")
        # Use the medium image for generating the smaller items.
        medium = self.add_scaled_jpeg(pidJPEG, filename, "MEDIUM_SIZE",
                                      640, 480, delete=False)
        try:
            # Add these to the main object!
            self.add_scaled_jpeg(pid, medium, "SMALL_SIZE", 200, 150)
            self.add_scaled_jpeg(pid, medium, "TN", 88, 66)
            self.add_cropped_jpeg(pid, medium, "ICON", 32, 24)
        finally:
            # The medium rendition is a temp file owned by this method.
            os.unlink(medium)
        return pidJPEG

    def ingest(self, filename):
        """
        Ingest one METS file.

        Returns the PID of the main object, or None when the record is
        unusable, the archive file is unreachable, or this is a dry
        run.  Errors raised while the objects are being created are
        logged, not propagated; the PID of the (possibly incomplete)
        main object is still returned in that case.
        """
        self.reset()
        self.dom = etree.parse(filename)
        if not self.extract_metadata():
            return None
        if not self.check_files():
            return None

        pid = None
        pids = []
        try:
            # Get the archive file
            with self.local_copy() as local_file:
                # Create the main object
                pid = self.digital_object()
                pids.append(pid)
                # Create a VRA Core object which
                # 'isMetadataFor' the main object
                vra_pid = self.vra_object(pid)
                pids.append(vra_pid)
                # Create an object part for the master image
                master_pid = self.master_object(pid, local_file)
                pids.append(master_pid)
                # Create an object for the EXIF metadata
                # which 'isMetadataFor' the Master file
                exif_pid = self.exif_object(master_pid, local_file)
                pids.append(exif_pid)
                # Create a JPEG 2000 object
                jp2_pid = self.jp2_object(pid, master_pid, local_file)
                pids.append(jp2_pid)
                # Create a JPEG object
                jpeg_pid = self.jpeg_object(pid, master_pid, local_file)
                pids.append(jpeg_pid)
                # Add the MODS datastream to the main digital object.
                # This is done last so that the VRA object has already
                # been added when fedoragsearch sees the MODS record added.
                self.add_mods(pid)
        except Exception:
            if pid:
                logging.error("Error processing PID: {0}".format(pid))
                # Entries may be None, so format them defensively.
                created = ["{0}".format(p) for p in pids if p is not None]
                if created:
                    logging.error(
                        "May need to clean up the following pids: "
                        "{0}".format(", ".join(created)))
            else:
                logging.error("Error processing file")
            traceback.print_exc(6)
        return pid

    @contextmanager
    def local_copy(self):
        """
        Store the master file locally for easy retrieval.

        Yields the name of a temporary file containing the download.
        The file is removed when the ``with`` block exits, and any
        exception raised inside the block propagates to the caller.
        """
        logging.debug("Saving file locally.")
        uri = self.archive_file['uri']
        # splitext returns (root, ext); only the extension is a suffix.
        ext = os.path.splitext(uri)[1]
        chunk_size = 16 * 1024
        filename = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as fp:
                filename = fp.name
                req = urllib2.urlopen(uri)
                try:
                    while True:
                        chunk = req.read(chunk_size)
                        if not chunk:
                            break
                        fp.write(chunk)
                finally:
                    req.close()
            yield filename
        finally:
            if filename is not None and os.path.exists(filename):
                os.unlink(filename)

    def add_mods(self, pid):
        """
        Add the MODS datastream

        Before upload the record receives a ``uri`` identifier and a
        "primary display" location URL that point at the new object.
        Nothing happens without MODS metadata or when ``pid`` is None.
        """
        if not self.mods_xml or pid is None:
            return
        ns = self._MODS_NS
        parser = etree.XMLParser(remove_blank_text=True)
        doc = etree.fromstring(self.mods_xml, parser)

        # Set or create the uri identifier.
        ident = doc.xpath("/mods:mods/mods:identifier[@type='uri']",
                          namespaces=ns)
        if len(ident):
            ident[0].text = u"https://my.repository.url/{0}".format(pid)
        else:
            identifier = etree.Element(
                u"{http://www.loc.gov/mods/v3}identifier", type=u"uri")
            identifier.text = u"https://my.repository.url/{0}".format(pid)
            # Keep identifiers together when others already exist.
            ident = doc.xpath("/mods:mods/mods:identifier", namespaces=ns)
            if len(ident):
                ident[-1].addnext(identifier)
            else:
                doc.append(identifier)

        # Set or create the primary display URL.
        loc = doc.xpath("/mods:mods/mods:location", namespaces=ns)
        if not len(loc):
            loc = [etree.SubElement(
                doc, u"{http://www.loc.gov/mods/v3}location")]
        url = doc.xpath(
            "/mods:mods/mods:location/mods:url[@usage='primary display']",
            namespaces=ns)
        if len(url):
            url[0].text = u"http://my.repository.url/{0}".format(pid)
        else:
            url = etree.Element(u"{http://www.loc.gov/mods/v3}url",
                                usage="primary display")
            url.text = u"http://my.repository.url/{0}".format(pid)
            loc[0].append(url)

        self.mods_xml = etree.tostring(doc, pretty_print=True)
        obj = self.fedora.getObject(pid)
        obj.addDataStream('MODS', self.mods_xml,
                          label=u'MODS Metadata', mimeType=u'text/xml',
                          controlGroup=u'X',
                          checksumType=u'DISABLED',
                          logMessage=u'Added MODS datastream')

    def add_vra(self, pid, parentPid):
        """
        Add the VRA Core datastream, if available

        The first vra:image element gets an ``href`` pointing at the
        main object ``parentPid``.  The upload is skipped when ``pid``
        is None.
        """
        if not self.vra_xml:
            return
        # Add the canonical URL for this object
        dom = etree.fromstring(self.vra_xml)
        nodes = dom.xpath("//vra:image", namespaces=self._VRA_NS)
        if len(nodes):
            nodes[0].set("href",
                         "https://my.repository.url/{0}".format(parentPid))
        self.vra_xml = etree.tostring(dom, pretty_print=True)
        if pid is not None:
            obj = self.fedora.getObject(pid)
            obj.addDataStream('VRA', self.vra_xml,
                              label=u'VRA Core Metadata',
                              mimeType=u'text/xml', controlGroup=u'X',
                              checksumType=u'DISABLED',
                              logMessage=u'Added VRA Core Metadata')
import os
import argparse
import logging
from image import ImageIngester
def main():
    """
    Command line entry point: ingest every ``*mets.xml`` file found in
    the ``--data`` directory into the given collection.
    """
    # Set up the argument parser.
    parser = argparse.ArgumentParser(
        description='Ingest objects into fedora.')
    parser.add_argument("--namespace", required=True,
                        choices=["my", "accepted", "name", "spaces"],
                        help="The namespace to be used by these objects.")
    parser.add_argument("--collection", required=True,
                        help="The name of the collection")
    parser.add_argument("--data", required=True,
                        help="The location of the data files to ingest.")
    parser.add_argument("--parent", default="collection:root",
                        help="This will be the parent of this collection")
    parser.add_argument("--config", default="my.cfg",
                        help="A configuration file containing " +
                        "the fedora connection information.")
    parser.add_argument("--collectionTN", default="TN.png",
                        help="The location of a thumbnail image for the collection")
    parser.add_argument("--dryrun", default=False, action='store_true')
    args = parser.parse_args()

    ingester = ImageIngester(args.config, args.namespace, dry_run=args.dryrun)
    # The ingester exposes add_collection(); there is no collection().
    ingester.add_collection(args.collection, u"My Collection Name",
                            parent=args.parent, tn=args.collectionTN)

    # List the directory once; sorting makes the processing order
    # (and therefore the progress log) reproducible.
    mets_files = sorted(name for name in os.listdir(args.data)
                        if name.endswith("mets.xml"))
    total = len(mets_files)
    for i, name in enumerate(mets_files, 1):
        path = os.path.join(args.data, name)
        logging.info("{0}/{1} Processing {2}".format(i, total, path))
        ingester.processMETS(path)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment