tgherzog/terre-biodiv.py

## terre-biodiv.py

"""
Upload files to a terre-biodiv s3 bucket

The first form uploads files from a local directory to an S3 bucket

The second form uses an S3 bucket as the source

Either BUCKET or SRCBUCKET can include a path prefix to control the
copy operation, e.g., wbg-terre-biodiv/data

Usage:
  terre-biodiv.py [--config=YAML] [--test] [--report] [--no-warnings] [--profile=NAME] [--root=DIRECTORY] BUCKET
  terre-biodiv.py [--config=YAML] [--test] [--report] [--no-warnings] [--profile=NAME] SRCBUCKET BUCKET

Options:
  --config=YAML      Config file [default: terre-biodiv.yaml]
  --test             Report operations only: don't upload
  --no-warnings      Don't warn if non-matching files are encountered
  --profile=NAME     AWS profile from .aws/credentials [default: default]
  --root=DIRECTORY   Root directory to traverse [default: .]
  --report           Provide a detailed report

"""

import os
import yaml
import boto3
import re
import sys
from docopt import docopt

# Parse the command line against the usage text in the module docstring.
# This runs at import time; the resulting dict is read (and, for the
# BUCKET/SRCBUCKET entries, rewritten by main()) as a module-level global.
config = docopt(__doc__)

# file_pattern_screen determines which files/keys are recognized and copied.
# Groups: (1) three-character code reported as the country id, (2) three-digit
# segment, (3) descriptive name looked up in the YAML map, (4) optional "_N"
# suffix. The dot before "tif" is escaped so that it only matches a literal
# "." instead of any character.
file_pattern_screen = r'^(\w{3}) (\d{3}) ([^_]+)(_\d+)?\.tif$'

def main():
    """Copy or upload recognized .tif files into the destination bucket.

    Loads the filename remapping table from the YAML file named by
    --config, then either copies matching objects out of SRCBUCKET (when
    given) or walks --root and uploads matching local files. With --test
    the operations are only reported. A summary line is always printed and
    --report adds the per-country file counts.
    """
    global config, filemap

    # first load a filename remapping matrix
    # safe_load is sufficient for a plain key/value map and never builds
    # arbitrary objects; an empty file yields an empty map instead of None.
    with open(config['--config'], 'r') as fd:
        filemap = yaml.safe_load(fd) or {}

    # convert to bucket name and path prefix
    config['BUCKET'] = bucket_info(config['BUCKET'])
    if config['SRCBUCKET']:
        config['SRCBUCKET'] = bucket_info(config['SRCBUCKET'])

    # A dry run over a local directory never talks to S3; a dry run over a
    # source bucket still needs a client to list that bucket.
    if config['--test'] and not config['SRCBUCKET']:
        s3 = None
    else:
        session = boto3.session.Session(profile_name=config['--profile'])
        s3 = session.client('s3')

    status = {'files': 0, 'size': 0, 'transferred': 0, 'errors': 0, 'countries': {}}
    dest_bucket = config['BUCKET']['bucket']

    # NOTE: every print() below takes a single argument so the output is
    # the same under Python 2 and Python 3.
    if config['SRCBUCKET']:
        # scan a bucket
        src_bucket = config['SRCBUCKET']['bucket']
        params = {'Bucket': src_bucket, 'Prefix': config['SRCBUCKET']['prefix'], 'MaxKeys': 100}
        while True:
            response = s3.list_objects_v2(**params)
            for elem in response.get('Contents') or []:
                key = elem['Key']
                filename = os.path.basename(key)
                aws_key = s3key(filename)
                if not aws_key:
                    continue

                srckey = '{}/{}'.format(src_bucket, key)
                status['files'] += 1
                status['size'] += elem['Size']
                print('Copying s3://{} to s3://{}/{}'.format(srckey, dest_bucket, aws_key))
                iso3 = country_id(filename)
                if config['--test']:
                    tracker(status, iso3)
                    continue

                try:
                    # The dict form of CopySource avoids the ambiguity of the
                    # "bucket/key" string form for keys containing "?".
                    s3.copy_object(Bucket=dest_bucket,
                                   CopySource={'Bucket': src_bucket, 'Key': key},
                                   Key=aws_key)
                except Exception as err:
                    sys.stderr.write(str(err) + '\n')
                    status['errors'] += 1
                else:
                    status['transferred'] += elem['Size']
                    tracker(status, iso3)

            # Only ask for another page when S3 says the listing was cut short.
            if not response.get('IsTruncated'):
                break
            params['ContinuationToken'] = response['NextContinuationToken']
    else:
        # scan a local directory
        for curdir, _subdirs, files in os.walk(config['--root']):
            for key in files:
                aws_key = s3key(key)
                if not aws_key:
                    continue

                fullpath = os.path.join(curdir, key)
                size = os.path.getsize(fullpath)
                status['files'] += 1
                status['size'] += size
                print('Uploading {} to s3://{}/{}'.format(fullpath, dest_bucket, aws_key))
                iso3 = country_id(key)
                if config['--test']:
                    tracker(status, iso3)
                elif s3:
                    try:
                        s3.upload_file(fullpath, dest_bucket, aws_key)
                    except boto3.exceptions.S3UploadFailedError as err:
                        sys.stderr.write(str(err) + '\n')
                        status['errors'] += 1
                    else:
                        status['transferred'] += size
                        tracker(status, iso3)

    # summary
    print('Done: files: {}, size: {}, transferred: {}, errors: {}'.format(
        status['files'], hsz(status['size']), hsz(status['transferred']), status['errors']))
    if config['--report']:
        print('File counts by country:')
        # sorted() works on the dict directly in both Python 2 and 3,
        # unlike keys().sort().
        for iso3 in sorted(status['countries']):
            print('  {} {}'.format(iso3, status['countries'][iso3]))

def hsz(size):
    """Return *size* (a byte count) as a human-readable string.

    The value is divided by 1024 until it drops below 1024, then formatted
    with one decimal place and the matching unit ('', 'Kb', 'Mb', 'Gb',
    'Tb', 'Pb'). Values too large for 'Pb' to bring below 1024 are still
    reported in 'Pb', so the result is always a string.
    """
    units = ['', 'Kb', 'Mb', 'Gb', 'Tb', 'Pb']
    value = float(size)
    for unit in units[:-1]:
        if abs(value) < 1024.0:
            return '{:.1f}{}'.format(value, unit)

        value /= 1024.0

    # Largest unit: format whatever is left rather than returning a raw number.
    return '{:.1f}{}'.format(value, units[-1])

def s3key(filename):
    '''Build the destination s3 key for filename.

    Returns None when the file should be skipped: its extension is not
    .tif, its name does not match file_pattern_screen, or the descriptive
    part of the name has no entry in filemap. The last two cases write a
    warning to stderr unless --no-warnings was given.
    '''

    extension = os.path.splitext(filename)[1]
    if extension.lower() != '.tif':
        return None

    # Example filename: "XXX YYY Amphibians Result-indstwisaac.tif"
    # or:               "XXX YYY Amphibians Result-indstwisaac_3.tif"
    # XXX and YYY should be a 3-character ASCII and numeric segment respectively
    # the remainder must match a pattern in the yaml file

    match = re.match(file_pattern_screen, filename)
    mapped = filemap.get(match.group(3).lower()) if match else None
    if not mapped:
        if not config['--no-warnings']:
            sys.stderr.write('Unrecognized file name pattern: {}\n'.format(filename))

        return None

    # Key layout: <prefix><XXX>/<YYY>-<mapped name><optional _N>.tif
    return '{}{}/{}-{}{}.tif'.format(
        config['BUCKET']['prefix'],
        match.group(1),
        match.group(2),
        mapped,
        match.group(4) or '',
    )

def country_id(filename):
    """Return the leading three-character segment of filename.

    The segment is the first group of file_pattern_screen; None is
    returned when filename does not match that pattern.
    """
    match = re.match(file_pattern_screen, filename)
    if not match:
        return None

    return match.group(1)


def tracker(status, iso3):
    """Add one to the file count kept for iso3 in status['countries'].

    A missing (or zero) entry is started at 1.
    """
    per_country = status['countries']
    per_country[iso3] = (per_country.get(iso3) or 0) + 1

def bucket_info(bucket):
    """Split a 'name[/prefix]' argument into its bucket name and key prefix.

    Returns {'bucket': name, 'prefix': prefix}. The prefix is '' when no
    path was given; otherwise it always ends with exactly the trailing '/'
    needed to prepend it to a key.
    """
    name, _, prefix = bucket.partition('/')
    if prefix and not prefix.endswith('/'):
        prefix += '/'

    return {'bucket': name, 'prefix': prefix}


# Start the transfer only when this file is executed as a script.
if __name__=='__main__':
    main()

## terre-biodiv.yaml
all specie total count: allspecies-totalcount
amphibians.total count: amphibians-totalcount
birds.total count: birds-totalcount
mammals.total count: mammals-totalcount
reptiles.total count: reptiles-totalcount

all specie encr count: allspecies-encrcount
amphibians.encr count: amphibians-encrcount
birds.encr count: birds-encrcount
mammals.encr count: mammals-encrcount
reptiles.encr count: reptiles-encrcount

result-indstwisaac: allspecies-extisaac
amphibians result-indstwisaac: amphibians-extisaac
birds result-indstwisaac: birds-extisaac
mammals result-indstwisaac: mammals-extisaac
reptiles result-indstwisaac: reptiles-extisaac

result-indstwi50: allspecies-extmooers50
amphibians result-indstwi50: amphibians-extmooers50
birds result-indstwi50: birds-extmooers50
mammals result-indstwi50: mammals-extmooers50
reptiles result-indstwi50: reptiles-extmooers50

result-indstwi100: allspecies-extmooers100
amphibians result-indstwi100: amphibians-extmooers100
birds result-indstwi100: birds-extmooers100
mammals result-indstwi100: mammals-extmooers100
reptiles result-indstwi100: reptiles-extmooers100

result-indstwi500: allspecies-extmooers500
amphibians result-indstwi500: amphibians-extmooers500
birds result-indstwi500: birds-extmooers500
mammals result-indstwi500: mammals-extmooers500
reptiles result-indstwi500: reptiles-extmooers500

all specie total endem: allspecies-endemicity
amphibians.total endem: amphibians-endemicity
birds.total endem: birds-endemicity
mammals.total endem: mammals-endemicity
reptiles.total endem: reptiles-endemicity

ecoregions.total endem: ecoregion-vulnerability

	"""
	Upload files to a terre-biodiv s3 bucket

	The first form uploads files from a local directory to an S3 bucket

	The second form uses an S3 bucket as the source

	Either BUCKET or SRCBUCKET can include a path prefix to control the
	copy operation, e.g., wbg-terre-biodiv/data

	Usage:
	terre-biodiv.py [--config=YAML] [--test] [--report] [--no-warnings] [--profile=NAME] [--root=DIRECTORY] BUCKET
	terre-biodiv.py [--config=YAML] [--test] [--report] [--no-warnings] [--profile=NAME] SRCBUCKET BUCKET

	Options:
	--config=YAML Config file [default: terre-biodiv.yaml]
	--test Report operations only: don't upload
	--no-warnings Don't warn if non-matching files are encountered
	--profile=NAME AWS profile from .aws/credentials [default: default]
	--root=DIRECTORY Root directory to traverse [default: .]
	--report Provide a detailed report

	"""

	import os
	import yaml
	import boto3
	import re
	import sys
	from docopt import docopt

	# Parse the command line against the usage text in the module docstring.
	config = docopt(__doc__)

	# file_pattern_screen determines which files/keys are recognized and copied.
	# Groups: (1) three-character code reported as the country id, (2) three-digit
	# segment, (3) descriptive name looked up in the YAML map, (4) optional "_N"
	# suffix. The dot before "tif" is escaped so that it only matches a literal
	# "." instead of any character.
	file_pattern_screen = r'^(\w{3}) (\d{3}) ([^_]+)(_\d+)?\.tif$'

	def main():
	    """Copy or upload recognized .tif files into the destination bucket.

	    Loads the filename remapping table from the YAML file named by
	    --config, then either copies matching objects out of SRCBUCKET (when
	    given) or walks --root and uploads matching local files. With --test
	    the operations are only reported. A summary line is always printed and
	    --report adds the per-country file counts.
	    """
	    global config, filemap

	    # first load a filename remapping matrix
	    # safe_load is sufficient for a plain key/value map and never builds
	    # arbitrary objects; an empty file yields an empty map instead of None.
	    with open(config['--config'], 'r') as fd:
	        filemap = yaml.safe_load(fd) or {}

	    # convert to bucket name and path prefix
	    config['BUCKET'] = bucket_info(config['BUCKET'])
	    if config['SRCBUCKET']:
	        config['SRCBUCKET'] = bucket_info(config['SRCBUCKET'])

	    # A dry run over a local directory never talks to S3; a dry run over a
	    # source bucket still needs a client to list that bucket.
	    if config['--test'] and not config['SRCBUCKET']:
	        s3 = None
	    else:
	        session = boto3.session.Session(profile_name=config['--profile'])
	        s3 = session.client('s3')

	    status = {'files': 0, 'size': 0, 'transferred': 0, 'errors': 0, 'countries': {}}
	    dest_bucket = config['BUCKET']['bucket']

	    # NOTE: every print() below takes a single argument so the output is
	    # the same under Python 2 and Python 3.
	    if config['SRCBUCKET']:
	        # scan a bucket
	        src_bucket = config['SRCBUCKET']['bucket']
	        params = {'Bucket': src_bucket, 'Prefix': config['SRCBUCKET']['prefix'], 'MaxKeys': 100}
	        while True:
	            response = s3.list_objects_v2(**params)
	            for elem in response.get('Contents') or []:
	                key = elem['Key']
	                filename = os.path.basename(key)
	                aws_key = s3key(filename)
	                if not aws_key:
	                    continue

	                srckey = '{}/{}'.format(src_bucket, key)
	                status['files'] += 1
	                status['size'] += elem['Size']
	                print('Copying s3://{} to s3://{}/{}'.format(srckey, dest_bucket, aws_key))
	                iso3 = country_id(filename)
	                if config['--test']:
	                    tracker(status, iso3)
	                    continue

	                try:
	                    # The dict form of CopySource avoids the ambiguity of the
	                    # "bucket/key" string form for keys containing "?".
	                    s3.copy_object(Bucket=dest_bucket,
	                                   CopySource={'Bucket': src_bucket, 'Key': key},
	                                   Key=aws_key)
	                except Exception as err:
	                    sys.stderr.write(str(err) + '\n')
	                    status['errors'] += 1
	                else:
	                    status['transferred'] += elem['Size']
	                    tracker(status, iso3)

	            # Only ask for another page when S3 says the listing was cut short.
	            if not response.get('IsTruncated'):
	                break
	            params['ContinuationToken'] = response['NextContinuationToken']
	    else:
	        # scan a local directory
	        for curdir, _subdirs, files in os.walk(config['--root']):
	            for key in files:
	                aws_key = s3key(key)
	                if not aws_key:
	                    continue

	                fullpath = os.path.join(curdir, key)
	                size = os.path.getsize(fullpath)
	                status['files'] += 1
	                status['size'] += size
	                print('Uploading {} to s3://{}/{}'.format(fullpath, dest_bucket, aws_key))
	                iso3 = country_id(key)
	                if config['--test']:
	                    tracker(status, iso3)
	                elif s3:
	                    try:
	                        s3.upload_file(fullpath, dest_bucket, aws_key)
	                    except boto3.exceptions.S3UploadFailedError as err:
	                        sys.stderr.write(str(err) + '\n')
	                        status['errors'] += 1
	                    else:
	                        status['transferred'] += size
	                        tracker(status, iso3)

	    # summary
	    print('Done: files: {}, size: {}, transferred: {}, errors: {}'.format(
	        status['files'], hsz(status['size']), hsz(status['transferred']), status['errors']))
	    if config['--report']:
	        print('File counts by country:')
	        # sorted() works on the dict directly in both Python 2 and 3,
	        # unlike keys().sort().
	        for iso3 in sorted(status['countries']):
	            print('  {} {}'.format(iso3, status['countries'][iso3]))

	def hsz(size):
	    """Return *size* (a byte count) as a human-readable string.

	    The value is divided by 1024 until it drops below 1024, then formatted
	    with one decimal place and the matching unit ('', 'Kb', 'Mb', 'Gb',
	    'Tb', 'Pb'). Values too large for 'Pb' to bring below 1024 are still
	    reported in 'Pb', so the result is always a string.
	    """
	    units = ['', 'Kb', 'Mb', 'Gb', 'Tb', 'Pb']
	    value = float(size)
	    for unit in units[:-1]:
	        if abs(value) < 1024.0:
	            return '{:.1f}{}'.format(value, unit)

	        value /= 1024.0

	    # Largest unit: format whatever is left rather than returning a raw number.
	    return '{:.1f}{}'.format(value, units[-1])

	def s3key(filename):
	    '''Build the destination s3 key for filename.

	    Returns None when the file should be skipped: its extension is not
	    .tif, its name does not match file_pattern_screen, or the descriptive
	    part of the name has no entry in filemap. The last two cases write a
	    warning to stderr unless --no-warnings was given.
	    '''

	    extension = os.path.splitext(filename)[1]
	    if extension.lower() != '.tif':
	        return None

	    # Example filename: "XXX YYY Amphibians Result-indstwisaac.tif"
	    # or: "XXX YYY Amphibians Result-indstwisaac_3.tif"
	    # XXX and YYY should be a 3-character ASCII and numeric segment respectively
	    # the remainder must match a pattern in the yaml file

	    match = re.match(file_pattern_screen, filename)
	    mapped = filemap.get(match.group(3).lower()) if match else None
	    if not mapped:
	        if not config['--no-warnings']:
	            sys.stderr.write('Unrecognized file name pattern: {}\n'.format(filename))

	        return None

	    # Key layout: <prefix><XXX>/<YYY>-<mapped name><optional _N>.tif
	    return '{}{}/{}-{}{}.tif'.format(
	        config['BUCKET']['prefix'],
	        match.group(1),
	        match.group(2),
	        mapped,
	        match.group(4) or '',
	    )

	def country_id(filename):
	    """Return the leading three-character segment of filename.

	    The segment is the first group of file_pattern_screen; None is
	    returned when filename does not match that pattern.
	    """
	    match = re.match(file_pattern_screen, filename)
	    if not match:
	        return None

	    return match.group(1)


	def tracker(status, iso3):
	    """Add one to the file count kept for iso3 in status['countries'].

	    A missing (or zero) entry is started at 1.
	    """
	    per_country = status['countries']
	    per_country[iso3] = (per_country.get(iso3) or 0) + 1

	def bucket_info(bucket):
	    """Split a 'name[/prefix]' argument into its bucket name and key prefix.

	    Returns {'bucket': name, 'prefix': prefix}. The prefix is '' when no
	    path was given; otherwise it always ends with exactly the trailing '/'
	    needed to prepend it to a key.
	    """
	    name, _, prefix = bucket.partition('/')
	    if prefix and not prefix.endswith('/'):
	        prefix += '/'

	    return {'bucket': name, 'prefix': prefix}



	# Start the transfer only when this file is executed as a script.
	if __name__=='__main__':
	    main()
	all specie total count: allspecies-totalcount
	amphibians.total count: amphibians-totalcount
	birds.total count: birds-totalcount
	mammals.total count: mammals-totalcount
	reptiles.total count: reptiles-totalcount

	all specie encr count: allspecies-encrcount
	amphibians.encr count: amphibians-encrcount
	birds.encr count: birds-encrcount
	mammals.encr count: mammals-encrcount
	reptiles.encr count: reptiles-encrcount

	result-indstwisaac: allspecies-extisaac
	amphibians result-indstwisaac: amphibians-extisaac
	birds result-indstwisaac: birds-extisaac
	mammals result-indstwisaac: mammals-extisaac
	reptiles result-indstwisaac: reptiles-extisaac

	result-indstwi50: allspecies-extmooers50
	amphibians result-indstwi50: amphibians-extmooers50
	birds result-indstwi50: birds-extmooers50
	mammals result-indstwi50: mammals-extmooers50
	reptiles result-indstwi50: reptiles-extmooers50

	result-indstwi100: allspecies-extmooers100
	amphibians result-indstwi100: amphibians-extmooers100
	birds result-indstwi100: birds-extmooers100
	mammals result-indstwi100: mammals-extmooers100
	reptiles result-indstwi100: reptiles-extmooers100

	result-indstwi500: allspecies-extmooers500
	amphibians result-indstwi500: amphibians-extmooers500
	birds result-indstwi500: birds-extmooers500
	mammals result-indstwi500: mammals-extmooers500
	reptiles result-indstwi500: reptiles-extmooers500

	all specie total endem: allspecies-endemicity
	amphibians.total endem: amphibians-endemicity
	birds.total endem: birds-endemicity
	mammals.total endem: mammals-endemicity
	reptiles.total endem: reptiles-endemicity

	ecoregions.total endem: ecoregion-vulnerability