Last active
November 30, 2018 17:57
-
-
Save helrond/6df127b859d415776d1c63700f2e6638 to your computer and use it in GitHub Desktop.
Downloads, and extracts an Archivematica DIP. Creates derivatives for objects and moves them to flat directory structure.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
["2e41e5c2-c160-49ba-998e-cf0b830252be", "33b83ab9-52da-45bf-844e-e33406ca53e9", "4f8109b7-089b-459c-9db4-9b12ecc2ce3c", "5eb7e69a-6e2c-4bae-9465-2e75a445990b", "8a70f78b-6583-452f-b6b2-e3f87efef52b", "c9eb0981-3732-4282-8971-3ee4d4c5f6d0", "7a4fa374-5be2-4e23-b2e9-6d7dccb8ccbb", "b8fe2597-d35c-4cf3-ba49-2ecd5451532f", "d45215ee-f707-4a7b-a32d-af53b417ee80", "de5970b9-303e-4c37-b3a5-b10d9757dce7", "233592ac-3199-4aa5-a445-cc9e8819c61f", "6cad12f9-0948-4d67-868c-e41ab985d6bb", "1ccd7d15-067e-491f-ae52-746e398a5f1f", "23c6b2f6-373d-4084-96eb-ec5bfd4ef13e", "2d527e1a-bee0-4c19-bef1-587b20ccda53", "386a87de-9169-4b98-8e5c-c0e1ec87ec64", "490c9bac-fa41-4c10-bc28-3fea96fad8a5", "4b6078a5-fd70-4e9c-b7aa-6cc413e79ab2", "53d6a0f4-bc4d-40d9-8ca6-f19490c3e275", "5cf8a029-7481-4f30-bdfa-6fff5018c129", "6a37dea6-ae82-416c-8d3d-24c29a995f16", "8b15dd79-de2a-4ad3-9fb0-9aeaa8f1dbb5", "96000cf5-b832-4cc7-a87c-e6a8eb49e472", "a28a342e-4d98-42d4-9fa1-4c1daee84735", "a5d2438a-a962-40c5-8f9b-a67712554560", "a95064c4-f506-49c0-afb1-0edb484828c1", "c7f86a36-a249-468c-bda7-1b18e67f8b50", "ca7bafc1-8295-4de6-a3f0-4c839840f2f2", "4fa99e4d-b6de-40e1-a3af-15ee9d075f93", "9640670e-3360-47fc-82bd-a9e198dbd1b2", "9c509e13-f8e7-4cbd-9a95-6c1d8541df6a", "c2583bb9-84aa-4eb3-a800-849416ca9254", "e00e3342-63e4-4c78-bf0d-a8b22230535f", "d911877e-d1ab-446b-b3b2-c0ed208574c5", "0a5b4a06-afa5-48ff-b4eb-d5adbc7b2835", "49a6648d-8cd1-4848-8eb6-c0ddba24075f", "8d1e956e-31c5-4a3f-ba27-8bd925f9e46c", "aec06da8-d779-4c66-bf01-0a436e64a1e0", "aef1bdcc-8d77-48f9-9bb2-302f2d8071ce", "cbe2e617-30a6-48aa-a112-2eae64eadebf", "d75d05e7-5298-476c-a1a4-e15ea0fa90a7", "12d5f39e-f5f3-4bdd-bd3d-f983462108f5", "b167fa00-8e0d-4510-927d-8810038b4eff", "5958f725-06ea-4cea-8a0d-c0a9d207889b", "8d5508cd-837d-4abd-8fef-92d89e8b633c", "e953565e-fc11-4504-ad9d-817a358a18b4", "bb591517-0fec-4b1a-a4ba-fd91a7d250f9", "a298fe1a-d0cd-418a-9b14-e66d67bc4551", "5015ea01-e922-4bbd-b7e9-6858306cb643", 
"c4f92531-2c96-494a-b0cf-4dd4873e2b24", "ac14587b-04da-4453-86bc-0c73249f019f", "014f1ac4-9262-4c86-8167-953abb345570", "3911d8e4-4d6c-4bfb-a219-35b3fed111f1", "8d334cca-ddf4-49bd-ba6a-a2720784fe27", "cd40fcae-31ff-4576-8792-96fbb3fe0dd9", "edefc6af-abd2-424d-81e6-77d6fc97b9b9", "79793848-0f43-4149-9ce0-728dd888d3c8", "5e46e33b-8fa2-4865-9f95-7c6c06c752d0", "00ae2eb8-fba3-4855-b948-eefc29c1f4bf", "7093da53-8cac-4d64-84c8-2cc7ed0866b3", "b53268d4-f1e3-4c12-9747-37aa1ad0fc6f", "daa6a68c-40fe-4e09-96b2-2afe2f54deab", "6a9e089d-087f-4cee-bb31-dbbb56dc0728", "16d47d53-3714-4d00-809a-7f33ccbac483", "7deffba6-0b8f-42c8-b51d-ef867cdcbf4b", "2d122b95-7627-491e-902b-c08cd5338915"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# fetch_dip.py | |
# This script is designed to be run at regular intervals, for example from a crontab. | |
# | |
# Downloads a DIP from Archivematica to the TMP_DIR and extracts the tarball. | |
# Derivatives are created for each file in its objects directory, and they are moved, | |
# along with the original file, to the DESTINATION_DIR. | |
# | |
# Tested on Python 3.7.0. Requires Python requests library (http://docs.python-requests.org/en/master/) | |
# and Imagemagick with Ghostscript () | |
import glob | |
import json | |
import logging | |
import os | |
import requests | |
import shutil | |
import subprocess | |
import tarfile | |
# Logging
# Log file is created in the working directory; LOG_LEVEL must be the name
# of a level attribute on the logging module (DEBUG/INFO/WARNING/ERROR).
LOG_FILE = 'fetch-dip-log.txt'
LOG_LEVEL = 'INFO'
# System locations
# DESTINATION_DIR receives the flattened originals and derivatives;
# TMP_DIR holds the downloaded tarball and extracted files between runs.
# Both must already exist (checked in DIPFetcher.__init__).
DESTINATION_DIR = '/am/dest/'
TMP_DIR = '/am/tmp/'
# File to store UUIDs of already-downloaded DIPs
# Must already exist and contain a JSON list (may be empty: []).
DOWNLOADED_DIPS_FILE = '/am/downloads.json'
# Archivematica configs
# NOTE(review): credentials are hard-coded; consider environment variables.
ARCHIVEMATICA_USERNAME = 'user'
ARCHIVEMATICA_API_KEY = 'apikey'
# Storage Service API-key auth header, shared by all requests.
ARCHIVEMATICA_HEADERS = {"Authorization": "ApiKey {}:{}".format(ARCHIVEMATICA_USERNAME, ARCHIVEMATICA_API_KEY)}
ARCHIVEMATICA_BASEURL = 'http://archivematica-storage-service-url:port/api/v2/'
# Only DIPs originating from this pipeline are processed (see DIPFetcher.run).
ARCHIVEMATICA_PIPELINE_UUID = 'pipeline-uuid'
logging.basicConfig(filename=LOG_FILE, format='%(asctime)s %(message)s', level=getattr(logging, LOG_LEVEL))
class ArchivematicaClientError(Exception):
    """Raised when the Archivematica Storage Service gives an unusable response."""


class DIPFetcherError(Exception):
    """Raised when DIP fetching or local processing cannot proceed."""
class DIPFetcher():
    """Downloads, extracts and processes DIPs from Archivematica.

    Workflow (see run()): download each new DIP tarball to TMP_DIR, extract
    its objects/ files into a flat layout, create JPEG derivatives with
    Imagemagick, move originals plus derivatives to DESTINATION_DIR, then
    clean up TMP_DIR. Already-processed DIP UUIDs are persisted in
    DOWNLOADED_DIPS_FILE (a JSON list).
    """

    def __init__(self):
        self.tmp = TMP_DIR
        self.dest = DESTINATION_DIR
        self.client = ArchivematicaClient()
        self.downloads = DOWNLOADED_DIPS_FILE
        # `directory` instead of `dir`, which shadows the builtin.
        for directory in [self.tmp, self.dest]:
            if not os.path.isdir(directory):
                raise DIPFetcherError("{} must be created".format(directory))
        if not os.path.isfile(self.downloads):
            raise DIPFetcherError("{} must be created".format(self.downloads))
        try:
            # The original only open()ed the file, so invalid JSON was never
            # detected here and the handle leaked; actually parse it.
            with open(self.downloads, 'r') as f:
                json.load(f)
        except json.decoder.JSONDecodeError:
            raise DIPFetcherError("{} is not valid JSON".format(self.downloads))

    def run(self):
        """Fetch and process every new DIP on the configured pipeline.

        Failures on one package are logged and skipped; the downloaded-UUIDs
        file is rewritten at the end with all successfully extracted DIPs.
        """
        logging.info('*** Starting routine ***')
        package_count = 0
        # Load list of previously downloaded DIPs from external file
        with open(self.downloads, 'r') as f:
            downloaded_list = json.load(f)
        for package in self.client.retrieve_paged('file/', params={'package_type': 'DIP'}):
            # origin_pipeline is a URI like /api/v2/pipeline/<uuid>/, so the
            # second-to-last path segment is the pipeline UUID.
            if (package['origin_pipeline'].split('/')[-2] == ARCHIVEMATICA_PIPELINE_UUID) and (package['uuid'] not in downloaded_list):
                self.uuid = package['uuid']
                try:
                    self.download_package(package)
                    self.extract_objects(os.path.join(self.tmp, "{}.tar".format(self.uuid)), self.tmp)
                    downloaded_list.append(self.uuid)
                    self.make_derivatives()
                    self.move_files()
                    self.cleanup()
                    package_count += 1
                except Exception as e:
                    # Best effort: log the failure and move on to the next DIP.
                    logging.error(e)
                    continue
        # Dump updated list of downloaded packages to external file
        with open(self.downloads, 'w') as f:
            json.dump(downloaded_list, f)
        logging.info('*** Routine complete. {} DIPs downloaded and processed ***'.format(package_count))

    def make_derivatives(self):
        """Create Imagemagick thumbnail/crop derivatives for each object.

        NOTE(review): commands are built by string interpolation and run with
        shell=True, so filenames containing spaces or shell metacharacters
        are unsafe — acceptable only for trusted Archivematica output.
        """
        logging.debug("Creating derivatives for {}".format(self.uuid))
        # `obj` instead of `object`, which shadows the builtin.
        for obj in self.objects:
            base = os.path.splitext(obj)[0]
            commands = (
                ('Thumbnail with a height of 100px', "convert {}[0] -thumbnail 'x100' `echo {}`".format(obj, "{}_thumb.jpg".format(base))),
                ('Square thumbnail 75x75 px', "convert {}[0] -thumbnail '75x75^' -gravity 'Center' -crop '75x75+0+0' `echo {}`".format(obj, "{}_thumb75.jpg".format(base))),
                ('Square thumbnail 300x300 px', "convert {}[0] -thumbnail '300x300^' -gravity 'Center' -crop '300x300+0+0' `echo {}`".format(obj, "{}_thumb300.jpg".format(base))),
                ('File with proportions of 1.9w to 1h', "convert {}[0] -gravity 'North' -crop '100%x53%+0+0' `echo {}`".format(obj, "{}_thumbfb.jpg".format(base))),
            )
            for description, command in commands:
                logging.debug(description)
                proc = subprocess.Popen(command, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
                # Stream convert's stdout into the debug log until EOF.
                while True:
                    next_line = proc.stdout.readline().decode("utf-8")
                    if not next_line:
                        break
                    logging.debug(next_line)
                ecode = proc.wait()
                if ecode != 0:
                    # The original `continue` here was a no-op (last statement
                    # of the loop); log the failure so it is actually visible.
                    logging.error("Derivative command exited with {}: {}".format(ecode, command))

    def move_files(self):
        """Move each object and its derivatives (same basename prefix) to dest."""
        for obj in self.objects:
            for f in glob.glob("{}*".format(os.path.splitext(obj)[0])):
                logging.debug("Moving {} to {}".format(f, self.dest))
                os.rename(f, os.path.join(self.dest, os.path.basename(f)))

    def download_package(self, package_json):
        """Stream the DIP identified by self.uuid into TMP_DIR.

        The extension is taken from the package's current_path, defaulting
        to .tar (extract_objects() only handles .tar in any case).
        """
        logging.debug("Downloading {}".format(self.uuid))
        response = self.client.retrieve('/file/{}/download/'.format(self.uuid), stream=True)
        extension = os.path.splitext(package_json['current_path'])[1]
        if not extension:
            extension = '.tar'
        with open(os.path.join(self.tmp, '{}{}'.format(self.uuid, extension)), "wb") as package:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:  # skip keep-alive chunks
                    package.write(chunk)
        return package

    def extract_objects(self, archive, dest):
        """Extract `archive` into `dest`, flattening files under objects/.

        Records the flattened paths on self.objects and returns dest.
        Raises DIPFetcherError for anything that is not a .tar archive.
        """
        logging.debug("Extracting {}".format(self.uuid))
        self.objects = []
        ext = os.path.splitext(archive)[1]
        if ext != '.tar':
            raise DIPFetcherError("Unrecognized archive extension", ext)
        # `with` guarantees the tarfile is closed even if a rename fails
        # (the original leaked the handle on any exception).
        with tarfile.open(archive, 'r') as tf:
            # NOTE(review): extractall() does no path sanitisation, so a
            # malicious tarball could write outside `dest`; acceptable only
            # because the archive comes from a trusted Archivematica instance.
            tf.extractall(dest)
            for member in tf.members:
                if 'objects/' in member.name:
                    flattened = os.path.join(dest, os.path.basename(member.name))
                    os.rename(os.path.join(dest, member.name), flattened)
                    self.objects.append(flattened)
        return dest

    def cleanup(self):
        """Remove every file and directory inside TMP_DIR."""
        logging.debug("Cleaning up {}".format(self.tmp))
        for d in os.listdir(self.tmp):
            file_path = os.path.join(self.tmp, d)
            if os.path.isfile(file_path):
                os.remove(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
class ArchivematicaClient(object):
    """Thin wrapper around the Archivematica Storage Service v2 REST API."""

    def __init__(self):
        self.username = ARCHIVEMATICA_USERNAME
        self.api_key = ARCHIVEMATICA_API_KEY
        self.headers = ARCHIVEMATICA_HEADERS
        self.baseurl = ARCHIVEMATICA_BASEURL

    def retrieve(self, uri, *args, **kwargs):
        """GET a single resource.

        Returns the requests.Response, or raises ArchivematicaClientError
        on an error status (requests.Response is falsy for 4xx/5xx).
        """
        full_url = "/".join([self.baseurl.rstrip("/"), uri.lstrip("/")])
        response = requests.get(full_url, headers=self.headers, *args, **kwargs)
        if response:
            return response
        else:
            raise ArchivematicaClientError("Could not return a valid response for {}".format(full_url))

    def retrieve_paged(self, uri, *args, limit=10, **kwargs):
        """Yield every object from a paged endpoint, `limit` items per request.

        Extra query parameters may be passed via kwargs['params'].
        Raises ArchivematicaClientError on an error status for the first page
        or when the response has no 'meta' pagination envelope.
        """
        full_url = "/".join([self.baseurl.rstrip("/"), uri.lstrip("/")])
        params = {"limit": limit, "offset": 0}
        # pop() replaces the original update(**...)/del pair: update(**d)
        # breaks on non-string keys, plain update(d) does not.
        params.update(kwargs.pop('params', {}))
        current_page = requests.get(full_url, params=params, headers=self.headers, **kwargs)
        if not current_page:
            raise ArchivematicaClientError("Authentication error while retrieving {}".format(full_url))
        current_json = current_page.json()
        if not current_json.get('meta'):
            raise ArchivematicaClientError("retrieve_paged doesn't know how to handle {}".format(full_url))
        while True:
            for obj in current_json['objects']:
                yield obj
            # meta['next'] is None on the final page. Trusting it (rather
            # than comparing offset to total_count, as the original did)
            # avoids fetching a page and then dropping it unread.
            if not current_json['meta']['next']:
                break
            params['offset'] += limit
            current_page = requests.get(full_url, params=params, headers=self.headers, **kwargs)
            current_json = current_page.json()
if __name__ == '__main__':
    # Guard so importing this module doesn't immediately start a download run.
    DIPFetcher().run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment