Skip to content

Instantly share code, notes, and snippets.

@metadaddy
Created July 19, 2022 00:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save metadaddy/1cd517b9cd7b5380a6298455e7188955 to your computer and use it in GitHub Desktop.
Save metadaddy/1cd517b9cd7b5380a6298455e7188955 to your computer and use it in GitHub Desktop.
Delete all files in a destination bucket that do not exist in a source bucket
import os
import sys
import boto3
from botocore.config import Config
from dotenv import load_dotenv
# Dry-run mode: when -d is the one and only command line argument, the script
# just lists the files it would delete instead of deleting them
dry_run = sys.argv[1:] == ['-d']
# Every setting is read from the environment, optionally populated from a
# .env file, so that credentials never have to appear in the code.
# A missing variable raises KeyError.
load_dotenv()

# Where to read the reference listing from
SRC_ENDPOINT_URL, SRC_ACCESS_KEY, SRC_SECRET_KEY, SRC_BUCKET = (
    os.environ[name]
    for name in (
        'SRC_ENDPOINT_URL',
        'SRC_ACCESS_KEY',
        'SRC_SECRET_KEY',
        'SRC_BUCKET',
    )
)

# Where surplus files get deleted from
DST_ENDPOINT_URL, DST_ACCESS_KEY, DST_SECRET_KEY, DST_BUCKET = (
    os.environ[name]
    for name in (
        'DST_ENDPOINT_URL',
        'DST_ACCESS_KEY',
        'DST_SECRET_KEY',
        'DST_BUCKET',
    )
)
# Both connections are configured identically apart from endpoint and
# credentials, so a single signing configuration serves both
s3v4_config = Config(signature_version='s3v4')

# Connection to the source account and a handle on its bucket
src_b2 = boto3.resource(
    service_name='s3',
    endpoint_url=SRC_ENDPOINT_URL,
    aws_access_key_id=SRC_ACCESS_KEY,
    aws_secret_access_key=SRC_SECRET_KEY,
    config=s3v4_config,
)
src_bucket = src_b2.Bucket(SRC_BUCKET)

# Connection to the destination account and a handle on its bucket
dst_b2 = boto3.resource(
    service_name='s3',
    endpoint_url=DST_ENDPOINT_URL,
    aws_access_key_id=DST_ACCESS_KEY,
    aws_secret_access_key=DST_SECRET_KEY,
    config=s3v4_config,
)
dst_bucket = dst_b2.Bucket(DST_BUCKET)
# Collect the key of every object in the source bucket.
#
# The complete listing is held in memory, so a bucket with a huge number of
# files could conceivably run the process out of memory.
#
# A set is used because membership tests against it are far faster than
# scanning a list.
src_files = {item.key for item in src_bucket.objects.all()}

# Collect the key of every object in the destination bucket; this listing is
# only iterated, so a plain list is sufficient
dst_files = [item.key for item in dst_bucket.objects.all()]

# Every destination key that the source lacks is slated for deletion, in the
# {'Key': ...} form that delete_objects expects
objects_to_delete = [{'Key': key} for key in dst_files if key not in src_files]
# The S3 DeleteObjects operation accepts at most 1000 keys per request, so a
# larger deletion has to be split across several calls; sending everything in
# one request is rejected by the service.
MAX_KEYS_PER_DELETE = 1000

if dry_run:
    # Just display names of files we would delete
    for file in objects_to_delete:
        print(file['Key'])
else:
    # Delete the files one batch at a time; an empty list yields no batches,
    # so nothing is sent when there is nothing to delete
    for start in range(0, len(objects_to_delete), MAX_KEYS_PER_DELETE):
        batch = objects_to_delete[start:start + MAX_KEYS_PER_DELETE]
        response = dst_bucket.delete_objects(
            Delete={
                'Objects': batch,
                # Quiet=False asks for successful deletions to be reported
                # as well as failures
                'Quiet': False
            }
        )
        # Report on each file in this batch: successes to stdout, failures
        # to stderr. Either list may be absent from the response.
        for obj in response.get('Deleted', []):
            print(obj['Key'])
        for obj in response.get('Errors', []):
            print(f'Error deleting {obj["Key"]}: {obj["Message"]}', file=sys.stderr)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment