Skip to content

Instantly share code, notes, and snippets.

@helrond
Last active November 30, 2018 17:57
Show Gist options
  • Save helrond/6df127b859d415776d1c63700f2e6638 to your computer and use it in GitHub Desktop.
Save helrond/6df127b859d415776d1c63700f2e6638 to your computer and use it in GitHub Desktop.
Downloads, and extracts an Archivematica DIP. Creates derivatives for objects and moves them to flat directory structure.
["2e41e5c2-c160-49ba-998e-cf0b830252be", "33b83ab9-52da-45bf-844e-e33406ca53e9", "4f8109b7-089b-459c-9db4-9b12ecc2ce3c", "5eb7e69a-6e2c-4bae-9465-2e75a445990b", "8a70f78b-6583-452f-b6b2-e3f87efef52b", "c9eb0981-3732-4282-8971-3ee4d4c5f6d0", "7a4fa374-5be2-4e23-b2e9-6d7dccb8ccbb", "b8fe2597-d35c-4cf3-ba49-2ecd5451532f", "d45215ee-f707-4a7b-a32d-af53b417ee80", "de5970b9-303e-4c37-b3a5-b10d9757dce7", "233592ac-3199-4aa5-a445-cc9e8819c61f", "6cad12f9-0948-4d67-868c-e41ab985d6bb", "1ccd7d15-067e-491f-ae52-746e398a5f1f", "23c6b2f6-373d-4084-96eb-ec5bfd4ef13e", "2d527e1a-bee0-4c19-bef1-587b20ccda53", "386a87de-9169-4b98-8e5c-c0e1ec87ec64", "490c9bac-fa41-4c10-bc28-3fea96fad8a5", "4b6078a5-fd70-4e9c-b7aa-6cc413e79ab2", "53d6a0f4-bc4d-40d9-8ca6-f19490c3e275", "5cf8a029-7481-4f30-bdfa-6fff5018c129", "6a37dea6-ae82-416c-8d3d-24c29a995f16", "8b15dd79-de2a-4ad3-9fb0-9aeaa8f1dbb5", "96000cf5-b832-4cc7-a87c-e6a8eb49e472", "a28a342e-4d98-42d4-9fa1-4c1daee84735", "a5d2438a-a962-40c5-8f9b-a67712554560", "a95064c4-f506-49c0-afb1-0edb484828c1", "c7f86a36-a249-468c-bda7-1b18e67f8b50", "ca7bafc1-8295-4de6-a3f0-4c839840f2f2", "4fa99e4d-b6de-40e1-a3af-15ee9d075f93", "9640670e-3360-47fc-82bd-a9e198dbd1b2", "9c509e13-f8e7-4cbd-9a95-6c1d8541df6a", "c2583bb9-84aa-4eb3-a800-849416ca9254", "e00e3342-63e4-4c78-bf0d-a8b22230535f", "d911877e-d1ab-446b-b3b2-c0ed208574c5", "0a5b4a06-afa5-48ff-b4eb-d5adbc7b2835", "49a6648d-8cd1-4848-8eb6-c0ddba24075f", "8d1e956e-31c5-4a3f-ba27-8bd925f9e46c", "aec06da8-d779-4c66-bf01-0a436e64a1e0", "aef1bdcc-8d77-48f9-9bb2-302f2d8071ce", "cbe2e617-30a6-48aa-a112-2eae64eadebf", "d75d05e7-5298-476c-a1a4-e15ea0fa90a7", "12d5f39e-f5f3-4bdd-bd3d-f983462108f5", "b167fa00-8e0d-4510-927d-8810038b4eff", "5958f725-06ea-4cea-8a0d-c0a9d207889b", "8d5508cd-837d-4abd-8fef-92d89e8b633c", "e953565e-fc11-4504-ad9d-817a358a18b4", "bb591517-0fec-4b1a-a4ba-fd91a7d250f9", "a298fe1a-d0cd-418a-9b14-e66d67bc4551", "5015ea01-e922-4bbd-b7e9-6858306cb643", 
"c4f92531-2c96-494a-b0cf-4dd4873e2b24", "ac14587b-04da-4453-86bc-0c73249f019f", "014f1ac4-9262-4c86-8167-953abb345570", "3911d8e4-4d6c-4bfb-a219-35b3fed111f1", "8d334cca-ddf4-49bd-ba6a-a2720784fe27", "cd40fcae-31ff-4576-8792-96fbb3fe0dd9", "edefc6af-abd2-424d-81e6-77d6fc97b9b9", "79793848-0f43-4149-9ce0-728dd888d3c8", "5e46e33b-8fa2-4865-9f95-7c6c06c752d0", "00ae2eb8-fba3-4855-b948-eefc29c1f4bf", "7093da53-8cac-4d64-84c8-2cc7ed0866b3", "b53268d4-f1e3-4c12-9747-37aa1ad0fc6f", "daa6a68c-40fe-4e09-96b2-2afe2f54deab", "6a9e089d-087f-4cee-bb31-dbbb56dc0728", "16d47d53-3714-4d00-809a-7f33ccbac483", "7deffba6-0b8f-42c8-b51d-ef867cdcbf4b", "2d122b95-7627-491e-902b-c08cd5338915"]
#!/usr/bin/env python
# fetch_dip.py
# This script is designed to be run at regular intervals, for example from a crontab.
#
# Downloads a DIP from Archivematica to the TMP_DIR and extracts the tarball.
# Derivatives are created for each file in its objects directory, and they are moved,
# along with the original file, to the DESTINATION_DIR.
#
# Tested on Python 3.7.0. Requires Python requests library (http://docs.python-requests.org/en/master/)
# and ImageMagick (https://imagemagick.org) with Ghostscript installed for PDF support.
import glob
import json
import logging
import os
import requests
import shutil
import subprocess
import tarfile
# Logging: output file and threshold; LOG_LEVEL must name a stdlib logging level
# (DEBUG/INFO/WARNING/ERROR/CRITICAL) — it is resolved via getattr below.
LOG_FILE = 'fetch-dip-log.txt'
LOG_LEVEL = 'INFO'
# System locations (must exist before the script runs; DIPFetcher checks them).
# DESTINATION_DIR: flat directory that receives originals plus derivatives.
# TMP_DIR: scratch space for downloads/extraction; emptied by DIPFetcher.cleanup().
DESTINATION_DIR = '/am/dest/'
TMP_DIR = '/am/tmp/'
# File to store UUIDs of already-downloaded DIPs (a JSON list; must exist).
DOWNLOADED_DIPS_FILE = '/am/downloads.json'
# Archivematica configs — placeholder values; fill in real credentials,
# storage-service URL and pipeline UUID before deploying.
ARCHIVEMATICA_USERNAME = 'user'
ARCHIVEMATICA_API_KEY = 'apikey'
ARCHIVEMATICA_HEADERS = {"Authorization": "ApiKey {}:{}".format(ARCHIVEMATICA_USERNAME, ARCHIVEMATICA_API_KEY)}
ARCHIVEMATICA_BASEURL = 'http://archivematica-storage-service-url:port/api/v2/'
ARCHIVEMATICA_PIPELINE_UUID = 'pipeline-uuid'
logging.basicConfig(filename=LOG_FILE, format='%(asctime)s %(message)s', level=getattr(logging, LOG_LEVEL))
class ArchivematicaClientError(Exception):
    """Raised when the Archivematica Storage Service returns an invalid response."""
class DIPFetcherError(Exception):
    """Raised when the local environment is misconfigured or a DIP cannot be processed."""
class DIPFetcher():
    """Downloads DIPs from Archivematica, extracts their objects, creates
    image derivatives and moves everything into a flat destination directory.

    Configuration comes from the module-level constants TMP_DIR,
    DESTINATION_DIR and DOWNLOADED_DIPS_FILE.
    """

    def __init__(self):
        """Validate the environment; raise DIPFetcherError if it is not ready."""
        self.tmp = TMP_DIR
        self.dest = DESTINATION_DIR
        self.client = ArchivematicaClient()
        self.downloads = DOWNLOADED_DIPS_FILE
        # `directory` rather than `dir` to avoid shadowing the builtin.
        for directory in [self.tmp, self.dest]:
            if not os.path.isdir(directory):
                raise DIPFetcherError("{} must be created".format(directory))
        if not os.path.isfile(self.downloads):
            raise DIPFetcherError("{} must be created".format(self.downloads))
        try:
            # Actually parse the file: the original merely opened it (leaking
            # the handle), so invalid JSON was never detected here.
            with open(self.downloads, 'r') as f:
                json.load(f)
        except json.decoder.JSONDecodeError:
            raise DIPFetcherError("{} is not valid JSON".format(self.downloads))

    def run(self):
        """Fetch and process every not-yet-downloaded DIP from this pipeline.

        Errors on an individual package are logged and skipped so one bad DIP
        does not abort the whole routine.
        """
        logging.info('*** Starting routine ***')
        package_count = 0
        # Load list of previously downloaded DIPs from external file
        with open(self.downloads, 'r') as f:
            downloaded_list = json.load(f)
        for package in self.client.retrieve_paged('file/', params={'package_type': 'DIP'}):
            if (package['origin_pipeline'].split('/')[-2] == ARCHIVEMATICA_PIPELINE_UUID) and (package['uuid'] not in downloaded_list):
                self.uuid = package['uuid']
                try:
                    self.download_package(package)
                    self.extract_objects(os.path.join(self.tmp, "{}.tar".format(self.uuid)), self.tmp)
                    self.make_derivatives()
                    self.move_files()
                    self.cleanup()
                    # Mark as downloaded only after full success, so a DIP
                    # that fails mid-processing is retried on the next run
                    # (the original appended before processing).
                    downloaded_list.append(self.uuid)
                    package_count += 1
                except Exception as e:
                    logging.error(e)
                    continue
        # Dump updated list of downloaded packages to external file
        with open(self.downloads, 'w') as f:
            json.dump(downloaded_list, f)
        logging.info('*** Routine complete. {} DIPs downloaded and processed ***'.format(package_count))

    def make_derivatives(self):
        """Create JPEG derivatives for every extracted object.

        Derivatives are written beside the source file, named
        <basename>_thumb.jpg, _thumb75.jpg, _thumb300.jpg and _thumbfb.jpg.
        Requires ImageMagick's `convert` (with Ghostscript for PDFs).
        """
        logging.debug("Creating derivatives for {}".format(self.uuid))
        # `obj` rather than `object` to avoid shadowing the builtin.
        for obj in self.objects:
            base = os.path.splitext(obj)[0]
            # Argument lists with shell=False instead of the original shell
            # strings: filenames extracted from an archive are untrusted and
            # must not be interpolated into a shell command line. The "[0]"
            # suffix selects the first page/frame of multi-page inputs.
            commands = (
                ('Thumbnail with a height of 100px',
                 ['convert', '{}[0]'.format(obj), '-thumbnail', 'x100', '{}_thumb.jpg'.format(base)]),
                ('Square thumbnail 75x75 px',
                 ['convert', '{}[0]'.format(obj), '-thumbnail', '75x75^', '-gravity', 'Center', '-crop', '75x75+0+0', '{}_thumb75.jpg'.format(base)]),
                ('Square thumbnail 300x300 px',
                 ['convert', '{}[0]'.format(obj), '-thumbnail', '300x300^', '-gravity', 'Center', '-crop', '300x300+0+0', '{}_thumb300.jpg'.format(base)]),
                ('File with proportions of 1.9w to 1h',
                 ['convert', '{}[0]'.format(obj), '-gravity', 'North', '-crop', '100%x53%+0+0', '{}_thumbfb.jpg'.format(base)]),
            )
            for description, cmd in commands:
                logging.debug(description)
                proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
                for line in proc.stdout:
                    logging.debug(line.decode("utf-8"))
                if proc.wait() != 0:
                    # The original's `if ecode != 0: continue` was a no-op
                    # that silently swallowed failures; at least log them.
                    logging.error("convert exited with a non-zero status for {}".format(obj))

    def move_files(self):
        """Move each object and all of its derivatives into the destination dir."""
        for obj in self.objects:
            # The glob picks up the original plus every <basename>_*.jpg derivative.
            for f in glob.glob("{}*".format(os.path.splitext(obj)[0])):
                logging.debug("Moving {} to {}".format(f, self.dest))
                os.rename(f, os.path.join(self.dest, os.path.basename(f)))

    def download_package(self, package_json):
        """Stream the package for self.uuid from the storage service into TMP_DIR.

        :param package_json: package description dict; its `current_path` is
            used to determine the file extension (defaulting to .tar).
        :returns: the local path of the downloaded file. (The original
            returned the already-closed file handle, which was unusable.)
        """
        logging.debug("Downloading {}".format(self.uuid))
        response = self.client.retrieve('/file/{}/download/'.format(self.uuid), stream=True)
        extension = os.path.splitext(package_json['current_path'])[1]
        if not extension:
            extension = '.tar'
        target = os.path.join(self.tmp, '{}{}'.format(self.uuid, extension))
        with open(target, "wb") as package:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:  # skip keep-alive chunks
                    package.write(chunk)
        return target

    def extract_objects(self, archive, dest):
        """Extract `archive` and flatten its objects/ members into `dest`.

        Populates self.objects with the flattened file paths.
        :raises DIPFetcherError: if the archive extension is not recognized.
        :returns: dest
        """
        logging.debug("Extracting {}".format(self.uuid))
        self.objects = []
        ext = os.path.splitext(archive)[1]
        if ext != '.tar':
            raise DIPFetcherError("Unrecognized archive extension", ext)
        # Context manager guarantees the tarfile is closed even on error
        # (the original leaked it if extraction raised).
        with tarfile.open(archive, 'r') as tf:
            # NOTE(review): extractall() trusts member paths; a malicious
            # archive could write outside `dest` (CVE-2007-4559) — consider
            # validating member names or using the `filter` argument (3.12+).
            tf.extractall(dest)
            for member in tf.members:
                if 'objects/' in member.name:
                    flattened = os.path.join(dest, os.path.basename(member.name))
                    os.rename(os.path.join(dest, member.name), flattened)
                    self.objects.append(flattened)
        return dest

    def cleanup(self):
        """Remove every file and directory from the tmp directory."""
        logging.debug("Cleaning up {}".format(self.tmp))
        for entry in os.listdir(self.tmp):
            path = os.path.join(self.tmp, entry)
            if os.path.isfile(path):
                os.remove(path)
            elif os.path.isdir(path):
                shutil.rmtree(path)
class ArchivematicaClient(object):
    """Minimal read-only client for the Archivematica Storage Service API."""

    def __init__(self):
        # All connection details come from module-level configuration.
        self.username = ARCHIVEMATICA_USERNAME
        self.api_key = ARCHIVEMATICA_API_KEY
        self.headers = ARCHIVEMATICA_HEADERS
        self.baseurl = ARCHIVEMATICA_BASEURL

    def retrieve(self, uri, *args, **kwargs):
        """GET a single resource under the base URL.

        Extra args/kwargs are passed through to requests.get.
        :raises ArchivematicaClientError: on a non-success HTTP status.
        """
        full_url = "/".join([self.baseurl.rstrip("/"), uri.lstrip("/")])
        response = requests.get(full_url, headers=self.headers, *args, **kwargs)
        # A requests.Response is falsy for 4xx/5xx statuses.
        if not response:
            raise ArchivematicaClientError("Could not return a valid response for {}".format(full_url))
        return response

    def retrieve_paged(self, uri, *args, limit=10, **kwargs):
        """Yield every object from a paged endpoint, fetching `limit` at a time.

        :raises ArchivematicaClientError: on auth failure or an unpaged response.
        """
        full_url = "/".join([self.baseurl.rstrip("/"), uri.lstrip("/")])
        params = {"limit": limit, "offset": 0}
        # Merge caller-supplied query params into the pagination params.
        extra_params = kwargs.pop('params', None)
        if extra_params:
            params.update(**extra_params)
        current_page = requests.get(full_url, params=params, headers=self.headers, **kwargs)
        if not current_page:
            raise ArchivematicaClientError("Authentication error while retrieving {}".format(full_url))
        current_json = current_page.json()
        # Guard clause: only responses with a tastypie-style 'meta' block are paged.
        if not current_json.get('meta'):
            raise ArchivematicaClientError("retrieve_paged doesn't know how to handle {}".format(full_url))
        while current_json['meta']['offset'] <= current_json['meta']['total_count']:
            for obj in current_json['objects']:
                yield obj
            if not current_json['meta']['next']:
                break
            params['offset'] += limit
            current_page = requests.get(full_url, params=params, headers=self.headers, **kwargs)
            current_json = current_page.json()
if __name__ == '__main__':
    # Only run the routine when executed as a script, not on import.
    DIPFetcher().run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment