Created
July 19, 2022 00:54
-
-
Save metadaddy/1cd517b9cd7b5380a6298455e7188955 to your computer and use it in GitHub Desktop.
Delete all files in a destination bucket that do not exist in a source bucket
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import boto3 | |
from botocore.config import Config | |
from dotenv import load_dotenv | |
# Dry-run mode: when the script is invoked with exactly one argument, '-d',
# files that would be deleted are only listed, never removed.
dry_run = sys.argv[1:] == ['-d']

# All settings come from the process environment, optionally populated from a
# .env file. Credentials must never be hard-coded in this script!
load_dotenv()
_env = os.environ

# Source bucket settings (a missing variable raises KeyError)
SRC_ENDPOINT_URL = _env['SRC_ENDPOINT_URL']
SRC_ACCESS_KEY = _env['SRC_ACCESS_KEY']
SRC_SECRET_KEY = _env['SRC_SECRET_KEY']
SRC_BUCKET = _env['SRC_BUCKET']

# Destination bucket settings (a missing variable raises KeyError)
DST_ENDPOINT_URL = _env['DST_ENDPOINT_URL']
DST_ACCESS_KEY = _env['DST_ACCESS_KEY']
DST_SECRET_KEY = _env['DST_SECRET_KEY']
DST_BUCKET = _env['DST_BUCKET']
def _make_s3_resource(endpoint_url, access_key, secret_key):
    """Return a boto3 S3 resource for the given endpoint and credentials.

    Requests are signed with signature version 's3v4'.
    """
    signing_config = Config(signature_version='s3v4')
    return boto3.resource(
        service_name='s3',
        endpoint_url=endpoint_url,
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key,
        config=signing_config,
    )


# One resource per side, since source and destination may use different
# endpoints and credentials
src_b2 = _make_s3_resource(SRC_ENDPOINT_URL, SRC_ACCESS_KEY, SRC_SECRET_KEY)
dst_b2 = _make_s3_resource(DST_ENDPOINT_URL, DST_ACCESS_KEY, DST_SECRET_KEY)

# Bucket handles used for listing and deleting
src_bucket = src_b2.Bucket(SRC_BUCKET)
dst_bucket = dst_b2.Bucket(DST_BUCKET)
# Collect every key in the source bucket.
#
# The whole listing is held in memory, so a bucket with a very large number of
# files could conceivably exhaust memory!
#
# A set is used because membership tests are MUCH faster than on a list.
src_files = {src_obj.key for src_obj in src_bucket.objects.all()}

# Collect every key in the destination bucket.
#
# This listing is only iterated, so a plain list is fine.
dst_files = [dst_obj.key for dst_obj in dst_bucket.objects.all()]

# Everything present in the destination but absent from the source is
# scheduled for deletion, in the {'Key': ...} form that delete_objects expects.
objects_to_delete = [
    {'Key': key}
    for key in dst_files
    if key not in src_files
]
# The S3 DeleteObjects API accepts at most 1000 keys per request, so the
# deletions must be sent in batches; a single oversized request is rejected.
MAX_KEYS_PER_DELETE = 1000

if dry_run:
    # Just display names of files we would delete
    for file in objects_to_delete:
        print(file['Key'])
else:
    # Delete the files, one batch at a time (no requests are made when there
    # is nothing to delete)
    for start in range(0, len(objects_to_delete), MAX_KEYS_PER_DELETE):
        batch = objects_to_delete[start:start + MAX_KEYS_PER_DELETE]
        response = dst_bucket.delete_objects(
            Delete={
                'Objects': batch,
                # Quiet=False asks the service to report successful deletions too
                'Quiet': False
            }
        )
        # Report on each file: successes to stdout, failures to stderr
        for obj in response.get('Deleted', []):
            print(obj['Key'])
        for obj in response.get('Errors', []):
            print(f'Error deleting {obj["Key"]}: {obj["Message"]}', file=sys.stderr)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment