Skip to content

Instantly share code, notes, and snippets.

@helrond
Last active November 30, 2018 17:57
Show Gist options
  • Save helrond/6df127b859d415776d1c63700f2e6638 to your computer and use it in GitHub Desktop.
Save helrond/6df127b859d415776d1c63700f2e6638 to your computer and use it in GitHub Desktop.
Downloads, and extracts an Archivematica DIP. Creates derivatives for objects and moves them to flat directory structure.
["2e41e5c2-c160-49ba-998e-cf0b830252be", "33b83ab9-52da-45bf-844e-e33406ca53e9", "4f8109b7-089b-459c-9db4-9b12ecc2ce3c", "5eb7e69a-6e2c-4bae-9465-2e75a445990b", "8a70f78b-6583-452f-b6b2-e3f87efef52b", "c9eb0981-3732-4282-8971-3ee4d4c5f6d0", "7a4fa374-5be2-4e23-b2e9-6d7dccb8ccbb", "b8fe2597-d35c-4cf3-ba49-2ecd5451532f", "d45215ee-f707-4a7b-a32d-af53b417ee80", "de5970b9-303e-4c37-b3a5-b10d9757dce7", "233592ac-3199-4aa5-a445-cc9e8819c61f", "6cad12f9-0948-4d67-868c-e41ab985d6bb", "1ccd7d15-067e-491f-ae52-746e398a5f1f", "23c6b2f6-373d-4084-96eb-ec5bfd4ef13e", "2d527e1a-bee0-4c19-bef1-587b20ccda53", "386a87de-9169-4b98-8e5c-c0e1ec87ec64", "490c9bac-fa41-4c10-bc28-3fea96fad8a5", "4b6078a5-fd70-4e9c-b7aa-6cc413e79ab2", "53d6a0f4-bc4d-40d9-8ca6-f19490c3e275", "5cf8a029-7481-4f30-bdfa-6fff5018c129", "6a37dea6-ae82-416c-8d3d-24c29a995f16", "8b15dd79-de2a-4ad3-9fb0-9aeaa8f1dbb5", "96000cf5-b832-4cc7-a87c-e6a8eb49e472", "a28a342e-4d98-42d4-9fa1-4c1daee84735", "a5d2438a-a962-40c5-8f9b-a67712554560", "a95064c4-f506-49c0-afb1-0edb484828c1", "c7f86a36-a249-468c-bda7-1b18e67f8b50", "ca7bafc1-8295-4de6-a3f0-4c839840f2f2", "4fa99e4d-b6de-40e1-a3af-15ee9d075f93", "9640670e-3360-47fc-82bd-a9e198dbd1b2", "9c509e13-f8e7-4cbd-9a95-6c1d8541df6a", "c2583bb9-84aa-4eb3-a800-849416ca9254", "e00e3342-63e4-4c78-bf0d-a8b22230535f", "d911877e-d1ab-446b-b3b2-c0ed208574c5", "0a5b4a06-afa5-48ff-b4eb-d5adbc7b2835", "49a6648d-8cd1-4848-8eb6-c0ddba24075f", "8d1e956e-31c5-4a3f-ba27-8bd925f9e46c", "aec06da8-d779-4c66-bf01-0a436e64a1e0", "aef1bdcc-8d77-48f9-9bb2-302f2d8071ce", "cbe2e617-30a6-48aa-a112-2eae64eadebf", "d75d05e7-5298-476c-a1a4-e15ea0fa90a7", "12d5f39e-f5f3-4bdd-bd3d-f983462108f5", "b167fa00-8e0d-4510-927d-8810038b4eff", "5958f725-06ea-4cea-8a0d-c0a9d207889b", "8d5508cd-837d-4abd-8fef-92d89e8b633c", "e953565e-fc11-4504-ad9d-817a358a18b4", "bb591517-0fec-4b1a-a4ba-fd91a7d250f9", "a298fe1a-d0cd-418a-9b14-e66d67bc4551", "5015ea01-e922-4bbd-b7e9-6858306cb643", 
"c4f92531-2c96-494a-b0cf-4dd4873e2b24", "ac14587b-04da-4453-86bc-0c73249f019f", "014f1ac4-9262-4c86-8167-953abb345570", "3911d8e4-4d6c-4bfb-a219-35b3fed111f1", "8d334cca-ddf4-49bd-ba6a-a2720784fe27", "cd40fcae-31ff-4576-8792-96fbb3fe0dd9", "edefc6af-abd2-424d-81e6-77d6fc97b9b9", "79793848-0f43-4149-9ce0-728dd888d3c8", "5e46e33b-8fa2-4865-9f95-7c6c06c752d0", "00ae2eb8-fba3-4855-b948-eefc29c1f4bf", "7093da53-8cac-4d64-84c8-2cc7ed0866b3", "b53268d4-f1e3-4c12-9747-37aa1ad0fc6f", "daa6a68c-40fe-4e09-96b2-2afe2f54deab", "6a9e089d-087f-4cee-bb31-dbbb56dc0728", "16d47d53-3714-4d00-809a-7f33ccbac483", "7deffba6-0b8f-42c8-b51d-ef867cdcbf4b", "2d122b95-7627-491e-902b-c08cd5338915"]
#!/usr/bin/env python
# fetch_dip.py
# This script is designed to be run at regular intervals, for example from a crontab.
#
# Downloads a DIP from Archivematica to the TMP_DIR and extracts the tarball.
# Derivatives are created for each file in its objects directory, and they are moved,
# along with the original file, to the DESTINATION_DIR.
#
# Tested on Python 3.7.0. Requires Python requests library (http://docs.python-requests.org/en/master/)
# and ImageMagick (https://imagemagick.org) with Ghostscript installed for PDF support.
import glob
import json
import logging
import os
import requests
import shutil
import subprocess
import tarfile
# Logging: output file and threshold; LOG_LEVEL must name a stdlib logging level
# (DEBUG/INFO/WARNING/ERROR/CRITICAL) — it is resolved via getattr below.
LOG_FILE = 'fetch-dip-log.txt'
LOG_LEVEL = 'INFO'
# System locations (must exist before the script runs; DIPFetcher checks them).
# DESTINATION_DIR: flat directory that receives originals plus derivatives.
# TMP_DIR: scratch space for downloads/extraction; emptied by DIPFetcher.cleanup().
DESTINATION_DIR = '/am/dest/'
TMP_DIR = '/am/tmp/'
# File to store UUIDs of already-downloaded DIPs (a JSON list; must exist).
DOWNLOADED_DIPS_FILE = '/am/downloads.json'
# Archivematica configs — placeholder values; fill in real credentials,
# storage-service URL and pipeline UUID before deploying.
ARCHIVEMATICA_USERNAME = 'user'
ARCHIVEMATICA_API_KEY = 'apikey'
ARCHIVEMATICA_HEADERS = {"Authorization": "ApiKey {}:{}".format(ARCHIVEMATICA_USERNAME, ARCHIVEMATICA_API_KEY)}
ARCHIVEMATICA_BASEURL = 'http://archivematica-storage-service-url:port/api/v2/'
ARCHIVEMATICA_PIPELINE_UUID = 'pipeline-uuid'
logging.basicConfig(filename=LOG_FILE, format='%(asctime)s %(message)s', level=getattr(logging, LOG_LEVEL))
class ArchivematicaClientError(Exception):
    """Raised when the Archivematica Storage Service returns an invalid response."""
class DIPFetcherError(Exception):
    """Raised when the local environment is misconfigured or a DIP cannot be processed."""
class DIPFetcher():
    """Downloads DIPs from Archivematica, extracts their objects, creates
    image derivatives and moves everything into a flat destination directory.

    Configuration comes from the module-level constants TMP_DIR,
    DESTINATION_DIR and DOWNLOADED_DIPS_FILE.
    """

    def __init__(self):
        """Validate the environment; raise DIPFetcherError if it is not ready."""
        self.tmp = TMP_DIR
        self.dest = DESTINATION_DIR
        self.client = ArchivematicaClient()
        self.downloads = DOWNLOADED_DIPS_FILE
        # `directory` rather than `dir` to avoid shadowing the builtin.
        for directory in [self.tmp, self.dest]:
            if not os.path.isdir(directory):
                raise DIPFetcherError("{} must be created".format(directory))
        if not os.path.isfile(self.downloads):
            raise DIPFetcherError("{} must be created".format(self.downloads))
        try:
            # Actually parse the file: the original merely opened it (leaking
            # the handle), so invalid JSON was never detected here.
            with open(self.downloads, 'r') as f:
                json.load(f)
        except json.decoder.JSONDecodeError:
            raise DIPFetcherError("{} is not valid JSON".format(self.downloads))

    def run(self):
        """Fetch and process every not-yet-downloaded DIP from this pipeline.

        Errors on an individual package are logged and skipped so one bad DIP
        does not abort the whole routine.
        """
        logging.info('*** Starting routine ***')
        package_count = 0
        # Load list of previously downloaded DIPs from external file
        with open(self.downloads, 'r') as f:
            downloaded_list = json.load(f)
        for package in self.client.retrieve_paged('file/', params={'package_type': 'DIP'}):
            if (package['origin_pipeline'].split('/')[-2] == ARCHIVEMATICA_PIPELINE_UUID) and (package['uuid'] not in downloaded_list):
                self.uuid = package['uuid']
                try:
                    self.download_package(package)
                    self.extract_objects(os.path.join(self.tmp, "{}.tar".format(self.uuid)), self.tmp)
                    self.make_derivatives()
                    self.move_files()
                    self.cleanup()
                    # Mark as downloaded only after full success, so a DIP
                    # that fails mid-processing is retried on the next run
                    # (the original appended before processing).
                    downloaded_list.append(self.uuid)
                    package_count += 1
                except Exception as e:
                    logging.error(e)
                    continue
        # Dump updated list of downloaded packages to external file
        with open(self.downloads, 'w') as f:
            json.dump(downloaded_list, f)
        logging.info('*** Routine complete. {} DIPs downloaded and processed ***'.format(package_count))

    def make_derivatives(self):
        """Create JPEG derivatives for every extracted object.

        Derivatives are written beside the source file, named
        <basename>_thumb.jpg, _thumb75.jpg, _thumb300.jpg and _thumbfb.jpg.
        Requires ImageMagick's `convert` (with Ghostscript for PDFs).
        """
        logging.debug("Creating derivatives for {}".format(self.uuid))
        # `obj` rather than `object` to avoid shadowing the builtin.
        for obj in self.objects:
            base = os.path.splitext(obj)[0]
            # Argument lists with shell=False instead of the original shell
            # strings: filenames extracted from an archive are untrusted and
            # must not be interpolated into a shell command line. The "[0]"
            # suffix selects the first page/frame of multi-page inputs.
            commands = (
                ('Thumbnail with a height of 100px',
                 ['convert', '{}[0]'.format(obj), '-thumbnail', 'x100', '{}_thumb.jpg'.format(base)]),
                ('Square thumbnail 75x75 px',
                 ['convert', '{}[0]'.format(obj), '-thumbnail', '75x75^', '-gravity', 'Center', '-crop', '75x75+0+0', '{}_thumb75.jpg'.format(base)]),
                ('Square thumbnail 300x300 px',
                 ['convert', '{}[0]'.format(obj), '-thumbnail', '300x300^', '-gravity', 'Center', '-crop', '300x300+0+0', '{}_thumb300.jpg'.format(base)]),
                ('File with proportions of 1.9w to 1h',
                 ['convert', '{}[0]'.format(obj), '-gravity', 'North', '-crop', '100%x53%+0+0', '{}_thumbfb.jpg'.format(base)]),
            )
            for description, cmd in commands:
                logging.debug(description)
                proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
                for line in proc.stdout:
                    logging.debug(line.decode("utf-8"))
                if proc.wait() != 0:
                    # The original's `if ecode != 0: continue` was a no-op
                    # that silently swallowed failures; at least log them.
                    logging.error("convert exited with a non-zero status for {}".format(obj))

    def move_files(self):
        """Move each object and all of its derivatives into the destination dir."""
        for obj in self.objects:
            # The glob picks up the original plus every <basename>_*.jpg derivative.
            for f in glob.glob("{}*".format(os.path.splitext(obj)[0])):
                logging.debug("Moving {} to {}".format(f, self.dest))
                os.rename(f, os.path.join(self.dest, os.path.basename(f)))

    def download_package(self, package_json):
        """Stream the package for self.uuid from the storage service into TMP_DIR.

        :param package_json: package description dict; its `current_path` is
            used to determine the file extension (defaulting to .tar).
        :returns: the local path of the downloaded file. (The original
            returned the already-closed file handle, which was unusable.)
        """
        logging.debug("Downloading {}".format(self.uuid))
        response = self.client.retrieve('/file/{}/download/'.format(self.uuid), stream=True)
        extension = os.path.splitext(package_json['current_path'])[1]
        if not extension:
            extension = '.tar'
        target = os.path.join(self.tmp, '{}{}'.format(self.uuid, extension))
        with open(target, "wb") as package:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:  # skip keep-alive chunks
                    package.write(chunk)
        return target

    def extract_objects(self, archive, dest):
        """Extract `archive` and flatten its objects/ members into `dest`.

        Populates self.objects with the flattened file paths.
        :raises DIPFetcherError: if the archive extension is not recognized.
        :returns: dest
        """
        logging.debug("Extracting {}".format(self.uuid))
        self.objects = []
        ext = os.path.splitext(archive)[1]
        if ext != '.tar':
            raise DIPFetcherError("Unrecognized archive extension", ext)
        # Context manager guarantees the tarfile is closed even on error
        # (the original leaked it if extraction raised).
        with tarfile.open(archive, 'r') as tf:
            # NOTE(review): extractall() trusts member paths; a malicious
            # archive could write outside `dest` (CVE-2007-4559) — consider
            # validating member names or using the `filter` argument (3.12+).
            tf.extractall(dest)
            for member in tf.members:
                if 'objects/' in member.name:
                    flattened = os.path.join(dest, os.path.basename(member.name))
                    os.rename(os.path.join(dest, member.name), flattened)
                    self.objects.append(flattened)
        return dest

    def cleanup(self):
        """Remove every file and directory from the tmp directory."""
        logging.debug("Cleaning up {}".format(self.tmp))
        for entry in os.listdir(self.tmp):
            path = os.path.join(self.tmp, entry)
            if os.path.isfile(path):
                os.remove(path)
            elif os.path.isdir(path):
                shutil.rmtree(path)
class ArchivematicaClient(object):
    """Minimal read-only client for the Archivematica Storage Service API."""

    def __init__(self):
        # All connection details come from module-level configuration.
        self.username = ARCHIVEMATICA_USERNAME
        self.api_key = ARCHIVEMATICA_API_KEY
        self.headers = ARCHIVEMATICA_HEADERS
        self.baseurl = ARCHIVEMATICA_BASEURL

    def retrieve(self, uri, *args, **kwargs):
        """GET a single resource under the base URL.

        Extra args/kwargs are passed through to requests.get.
        :raises ArchivematicaClientError: on a non-success HTTP status.
        """
        full_url = "/".join([self.baseurl.rstrip("/"), uri.lstrip("/")])
        response = requests.get(full_url, headers=self.headers, *args, **kwargs)
        # A requests.Response is falsy for 4xx/5xx statuses.
        if not response:
            raise ArchivematicaClientError("Could not return a valid response for {}".format(full_url))
        return response

    def retrieve_paged(self, uri, *args, limit=10, **kwargs):
        """Yield every object from a paged endpoint, fetching `limit` at a time.

        :raises ArchivematicaClientError: on auth failure or an unpaged response.
        """
        full_url = "/".join([self.baseurl.rstrip("/"), uri.lstrip("/")])
        params = {"limit": limit, "offset": 0}
        # Merge caller-supplied query params into the pagination params.
        extra_params = kwargs.pop('params', None)
        if extra_params:
            params.update(**extra_params)
        current_page = requests.get(full_url, params=params, headers=self.headers, **kwargs)
        if not current_page:
            raise ArchivematicaClientError("Authentication error while retrieving {}".format(full_url))
        current_json = current_page.json()
        # Guard clause: only responses with a tastypie-style 'meta' block are paged.
        if not current_json.get('meta'):
            raise ArchivematicaClientError("retrieve_paged doesn't know how to handle {}".format(full_url))
        while current_json['meta']['offset'] <= current_json['meta']['total_count']:
            for obj in current_json['objects']:
                yield obj
            if not current_json['meta']['next']:
                break
            params['offset'] += limit
            current_page = requests.get(full_url, params=params, headers=self.headers, **kwargs)
            current_json = current_page.json()
if __name__ == '__main__':
    # Only run the routine when executed as a script, not on import.
    DIPFetcher().run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment