Skip to content

Instantly share code, notes, and snippets.

@tgherzog
Last active May 22, 2019 21:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tgherzog/57f316b1ad01021d2d0777a739105bd8 to your computer and use it in GitHub Desktop.
Save tgherzog/57f316b1ad01021d2d0777a739105bd8 to your computer and use it in GitHub Desktop.
terre-biodiv data uploader
"""
Upload files to a terre-biodiv s3 bucket
The first form uploads files from a local directory to an S3 bucket
The second form uses an S3 bucket as the source
Either BUCKET or SRCBUCKET can include a path prefix to control the
copy operation, e.g., wbg-terre-biodiv/data
Usage:
terre-biodiv.py [--config=YAML] [--test] [--report] [--no-warnings] [--profile=NAME] [--root=DIRECTORY] BUCKET
terre-biodiv.py [--config=YAML] [--test] [--report] [--no-warnings] [--profile=NAME] SRCBUCKET BUCKET
Options:
--config=YAML Config file [default: terre-biodiv.yaml]
--test Report operations only: don't upload
--no-warnings Don't warn if non-matching files are encountered
--profile=NAME AWS profile from .aws/credentials [default: default]
--root=DIRECTORY Root directory to traverse [default: .]
--report Provide a detailed report
"""
import os
import yaml
import boto3
import re
import sys
from docopt import docopt
# Parse the command line against the usage text in the module docstring.
# This runs at module level, so the resulting options dict is a global that
# main() and s3key() read (and main() updates) later on.
config = docopt(__doc__)
# file_pattern_screen determines which files/keys are recognized and copied.
# Capture groups: (1) three word characters, (2) three digits, (3) the
# descriptive name (anything up to the first underscore), (4) an optional
# "_<digits>" suffix. The dot before "tif" is escaped so that only a literal
# ".tif" extension matches, rather than any character followed by "tif".
file_pattern_screen = r'^(\w{3}) (\d{3}) ([^_]+)(_\d+)?\.tif$'
def main():
    """Upload or copy recognized .tif files into the destination bucket.

    Loads the filename remapping table from the YAML file named by
    ``--config``, splits BUCKET (and SRCBUCKET, when given) into a bucket
    name and key prefix, then either copies matching keys from SRCBUCKET or
    walks ``--root`` and uploads matching local files. With ``--test``
    nothing is transferred; the operations are only reported. A one-line
    summary is always printed, followed by per-country file counts when
    ``--report`` is set.
    """
    global config, filemap

    # first load a filename remapping matrix
    # safe_load is used because the config is a plain mapping; yaml.load()
    # without an explicit Loader is unsafe and is rejected by PyYAML >= 6.
    # An empty file yields None, so fall back to an empty mapping.
    with open(config['--config'], 'r') as fd:
        filemap = yaml.safe_load(fd) or {}

    # convert to bucket name and path prefix
    config['BUCKET'] = bucket_info(config['BUCKET'])
    if config['SRCBUCKET']:
        config['SRCBUCKET'] = bucket_info(config['SRCBUCKET'])

    # A dry run over a local directory never talks to S3, so no client is
    # created; a bucket-to-bucket dry run still needs one to list the source.
    if config['--test'] and not config['SRCBUCKET']:
        s3 = None
    else:
        session = boto3.session.Session(profile_name=config['--profile'])
        s3 = session.client('s3')

    status = {'files': 0, 'size': 0, 'transferred': 0, 'errors': 0, 'countries': {}}

    if config['SRCBUCKET']:
        _copy_from_bucket(s3, status)
    else:
        _upload_from_directory(s3, status)

    _print_summary(status)


def _copy_from_bucket(s3, status):
    """Copy every recognized key under the SRCBUCKET prefix to BUCKET, updating status."""
    source = config['SRCBUCKET']
    target_bucket = config['BUCKET']['bucket']
    params = {'Bucket': source['bucket'], 'Prefix': source['prefix'], 'MaxKeys': 100}

    while True:
        response = s3.list_objects_v2(**params)
        for elem in response.get('Contents') or []:
            key = elem['Key']
            filename = os.path.basename(key)
            aws_key = s3key(filename)
            if not aws_key:
                continue

            srckey = '{}/{}'.format(source['bucket'], key)
            status['files'] += 1
            status['size'] += elem['Size']
            print('Copying s3://{} to s3://{}/{}'.format(srckey, target_bucket, aws_key))
            iso3 = country_id(filename)

            if config['--test']:
                tracker(status, iso3)
                continue

            # Best effort per object: report the failure and keep going.
            try:
                s3.copy_object(Bucket=target_bucket, CopySource=srckey, Key=aws_key)
            except Exception as err:
                sys.stderr.write(str(err) + '\n')
                status['errors'] += 1
            else:
                status['transferred'] += elem['Size']
                tracker(status, iso3)

        # Only request another page when the listing says there is one.
        if not response.get('IsTruncated'):
            break
        params['ContinuationToken'] = response['NextContinuationToken']


def _upload_from_directory(s3, status):
    """Walk --root and upload every recognized file to BUCKET, updating status."""
    target_bucket = config['BUCKET']['bucket']

    for curdir, _subdirs, files in os.walk(config['--root']):
        for key in files:
            aws_key = s3key(key)
            if not aws_key:
                continue

            fullpath = os.path.join(curdir, key)
            size = os.path.getsize(fullpath)
            status['files'] += 1
            status['size'] += size
            print('Uploading {} to s3://{}/{}'.format(fullpath, target_bucket, aws_key))
            iso3 = country_id(key)

            if config['--test']:
                tracker(status, iso3)
            elif s3:
                try:
                    s3.upload_file(fullpath, target_bucket, aws_key)
                except boto3.exceptions.S3UploadFailedError as err:
                    sys.stderr.write(str(err) + '\n')
                    status['errors'] += 1
                else:
                    status['transferred'] += size
                    tracker(status, iso3)


def _print_summary(status):
    """Print the totals line and, when --report is set, the per-country file counts."""
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print('Done: files: {}, size: {}, transferred: {}, errors: {}'.format(
        status['files'], hsz(status['size']), hsz(status['transferred']), status['errors']))

    if config['--report']:
        print('File counts by country:')
        # sorted() replaces keys().sort(), which only works on Python 2 lists.
        for iso3 in sorted(status['countries']):
            print(' {} {}'.format(iso3, status['countries'][iso3]))
def hsz(size):
    """Return size (a byte count) as a human-readable string.

    The value is divided by 1024 until its magnitude drops below 1024 and is
    then formatted with one decimal place and the matching unit label; plain
    bytes carry no label (e.g. ``'512.0'``). Values too large for the ``Pb``
    label are reported in ``Eb`` so the result is always a string.
    """
    for unit in ['', 'Kb', 'Mb', 'Gb', 'Tb', 'Pb']:
        if abs(size) < 1024.0:
            return '{:.1f}{}'.format(size, unit)
        size /= 1024.0
    # Six divisions by 1024 have been applied at this point, so the remaining
    # value is expressed in units of 1024**6 bytes.
    return '{:.1f}{}'.format(size, 'Eb')
def s3key(filename):
    """Map a file name to its destination S3 key.

    Returns None when the file should be skipped: either its extension is
    not .tif (any case), or its name does not match file_pattern_screen, or
    the descriptive part of the name has no entry in the YAML mapping. In
    the latter two cases a warning goes to stderr unless --no-warnings is set.
    """
    extension = os.path.splitext(filename)[1]
    if extension.lower() != '.tif':
        return None

    # Example filename: "XXX YYY Amphibians Result-indstwisaac.tif"
    # or: "XXX YYY Amphibians Result-indstwisaac_3.tif"
    # XXX is a 3-character segment, YYY a 3-digit segment; the remainder
    # (lower-cased) must be a key in the yaml file.
    match = re.match(file_pattern_screen, filename)
    mapped = filemap.get(match.group(3).lower()) if match else None
    if not mapped:
        if not config['--no-warnings']:
            sys.stderr.write('Unrecognized file name pattern: {}\n'.format(filename))
        return None

    code, number, _, suffix = match.groups()
    return '{}{}/{}-{}{}.tif'.format(config['BUCKET']['prefix'], code, number, mapped, suffix or '')
def country_id(filename):
    """Return the leading 3-character segment of filename, or None.

    The segment is the first capture group of file_pattern_screen; callers
    use it as the per-country key when tallying files. None is returned when
    the name does not match the pattern.
    """
    match = re.match(file_pattern_screen, filename)
    if match is None:
        return None
    return match.group(1)
def tracker(status, iso3):
    """Increment the file count stored for iso3 in status['countries'].

    A key that is missing (or holds a falsy count) starts at 1.
    """
    counts = status['countries']
    counts[iso3] = (counts.get(iso3) or 0) + 1
def bucket_info(bucket):
    """Split "name/path" into {'bucket': name, 'prefix': path}.

    Everything before the first slash is the bucket name. The prefix is the
    remainder, given a trailing slash when non-empty, or '' when absent.
    """
    name, _, prefix = bucket.partition('/')
    if prefix and not prefix.endswith('/'):
        prefix += '/'
    return {'bucket': name, 'prefix': prefix}
# Run the uploader only when this file is executed as a script.
if __name__=='__main__':
    main()
all specie total count: allspecies-totalcount
amphibians.total count: amphibians-totalcount
birds.total count: birds-totalcount
mammals.total count: mammals-totalcount
reptiles.total count: reptiles-totalcount
all specie encr count: allspecies-encrcount
amphibians.encr count: amphibians-encrcount
birds.encr count: birds-encrcount
mammals.encr count: mammals-encrcount
reptiles.encr count: reptiles-encrcount
result-indstwisaac: allspecies-extisaac
amphibians result-indstwisaac: amphibians-extisaac
birds result-indstwisaac: birds-extisaac
mammals result-indstwisaac: mammals-extisaac
reptiles result-indstwisaac: reptiles-extisaac
result-indstwi50: allspecies-extmooers50
amphibians result-indstwi50: amphibians-extmooers50
birds result-indstwi50: birds-extmooers50
mammals result-indstwi50: mammals-extmooers50
reptiles result-indstwi50: reptiles-extmooers50
result-indstwi100: allspecies-extmooers100
amphibians result-indstwi100: amphibians-extmooers100
birds result-indstwi100: birds-extmooers100
mammals result-indstwi100: mammals-extmooers100
reptiles result-indstwi100: reptiles-extmooers100
result-indstwi500: allspecies-extmooers500
amphibians result-indstwi500: amphibians-extmooers500
birds result-indstwi500: birds-extmooers500
mammals result-indstwi500: mammals-extmooers500
reptiles result-indstwi500: reptiles-extmooers500
all specie total endem: allspecies-endemicity
amphibians.total endem: amphibians-endemicity
birds.total endem: birds-endemicity
mammals.total endem: mammals-endemicity
reptiles.total endem: reptiles-endemicity
ecoregions.total endem: ecoregion-vulnerability
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment