Skip to content

Instantly share code, notes, and snippets.

@davidejones
Last active April 13, 2022 16:27
Show Gist options
  • Save davidejones/9ea69eb94650fbb6697f2f1904946d11 to your computer and use it in GitHub Desktop.
Save davidejones/9ea69eb94650fbb6697f2f1904946d11 to your computer and use it in GitHub Desktop.
Updates s3 metadata to have an md5chksum
import argparse
import hashlib
import logging
import tempfile
import boto3
def md5(file_handle, block_size=4096):
    """
    Compute the md5 hex digest of an already-open binary file handle.
    Reads in fixed-size chunks so large files never have to fit in memory.
    :param file_handle: binary file-like object, positioned where hashing should start
    :param block_size: number of bytes to read per chunk
    returns md5 hash of file
    """
    digest = hashlib.md5()
    while True:
        chunk = file_handle.read(block_size)
        # b"" is the EOF sentinel for binary reads — stop exactly there.
        if chunk == b"":
            break
        digest.update(chunk)
    return digest.hexdigest()
def update_md5_meta(args, client, key):
    """
    Downloads the file from s3 and generates a md5 hash
    Then updates the same file in s3 to have the md5sum as meta data
    Note that updating using copy_object with replace directive has the side effect of fixing the ETag anyway
    :param args: cli args bucket name, dryrun etc.
    :param client: s3 boto3 client
    :param key: name of s3 key we are interacting with
    """
    obj = client.head_object(Bucket=args.bucket, Key=key)
    current_etag = obj['ETag']
    metadata = obj["Metadata"]
    # Guard clause: in dry-run mode we only report, never download or copy.
    if args.dryrun:
        logging.info(f"Skipping '{key}' copy object in dry run")
        return
    # TemporaryFile cleans itself up even if the download or hashing raises.
    with tempfile.TemporaryFile(mode='w+b') as file_handle:
        client.download_fileobj(args.bucket, key, file_handle)
        # download_fileobj leaves the handle at EOF; rewind before hashing.
        file_handle.seek(0)
        metadata["md5chksum"] = md5(file_handle)
    # Use the dict form of CopySource: the "bucket/key" string form requires the
    # key to be URL-encoded and silently breaks for keys containing spaces, '+',
    # or non-ASCII characters. boto3 encodes the dict form correctly.
    # NOTE(review): copy_object only supports objects up to 5 GB — larger objects
    # would need a multipart copy; confirm against the bucket's contents.
    response = client.copy_object(Bucket=args.bucket, Key=key,
                                  CopySource={'Bucket': args.bucket, 'Key': key},
                                  Metadata=metadata, ContentType=obj['ContentType'],
                                  MetadataDirective='REPLACE')
    etag = response.get('CopyObjectResult', {}).get('ETag', '')
    logging.info(f"Updated '{key}' md5 meta to {metadata['md5chksum']} ETag changed from {current_etag} to {etag}")
def main(args):
    """
    Finds all files in bucket that are uploaded in multipart and updates their metadata to have an md5sum
    :param args: cli args bucket name, dryrun etc.
    """
    s3 = boto3.client('s3')
    pages = s3.get_paginator('list_objects').paginate(Bucket=args.bucket, Prefix=args.prefix)
    # If the entity tag is not an MD5 digest of the object data, it will contain one or more
    # nonhexadecimal characters and/or will consist of less than 32 or more than 32 hexadecimal digits.
    # Multipart-upload ETags contain a '-' separator; the JMESPath filter keeps only those.
    multipart_objects = pages.search("Contents[?contains(ETag, '-')]")
    for entry in multipart_objects:
        update_md5_meta(args, s3, entry['Key'])
    logging.info("DONE!")
if __name__ == '__main__':
    # CLI entry point: parse arguments, configure logging, then run.
    arg_parser = argparse.ArgumentParser(description='Updates md5sum metadata on s3 objects')
    arg_parser.add_argument('bucket', help='name of bucket')
    arg_parser.add_argument('-p', '--prefix', default="", help='prefix path in bucket to work on')
    arg_parser.add_argument('--dryrun', action='store_true', help='dry run mode is read only no modifications')
    cli_args = arg_parser.parse_args()
    # Prefix every log line in dry-run mode so the output is unmistakable.
    if cli_args.dryrun:
        log_format = f"[DRYRUN] {logging.BASIC_FORMAT}"
    else:
        log_format = logging.BASIC_FORMAT
    logging.basicConfig(level=logging.INFO, format=log_format)
    main(cli_args)
@davidejones
Copy link
Author

Usage

Running on all files in a bucket

aws-vault exec some-account -- python3 ./update_metadata.py my-bucket-name

Running on all files in a bucket under a certain subdirectory

aws-vault exec some-account -- python3 ./update_metadata.py my-bucket-name --prefix website-hugo/video/

Running on a single file

aws-vault exec some-account -- python3 ./update_metadata.py my-bucket-name --prefix website-hugo/video/my-vid.mp4

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment