Skip to content

Instantly share code, notes, and snippets.

@tomkinsc
Last active March 27, 2023 11:11
Show Gist options
  • Save tomkinsc/d5e5f311d4d4ebfd726b to your computer and use it in GitHub Desktop.
Save tomkinsc/d5e5f311d4d4ebfd726b to your computer and use it in GitHub Desktop.
This is a quick-and-dirty script to compare two different S3-compatible buckets. Just drop in the bucket names and credentials, and optionally change the endpoint hosts.
#!/usr/bin/python
import boto, json
from boto.s3.connection import S3Connection
from boto.gs.connection import GSConnection
def compare_buckets(bucket_one_bucket_name,
                    bucket_two_bucket_name,
                    bucket_one_access_key_id,
                    bucket_one_secret_access_key,
                    bucket_two_access_key_id,
                    bucket_two_secret_access_key,
                    bucket_one_host="s3.amazonaws.com",
                    bucket_two_host="s3.amazonaws.com",
                    print_status=False,
                    confirm_matches=False):
    '''
    Compare the file contents of two S3-compatible buckets.

    Objects are compared by etag (an MD5 hash for simple uploads).
    Multipart uploads are skipped because their etags are not plain MD5
    hashes and cannot be compared directly. "Directory" placeholder keys
    (names ending in "/") are ignored. It may be helpful for migrating
    from S3 to GS.

    Returns a dict of lists of key names:
        "present"            -- etag matches in both buckets
        "missing"            -- key absent from the second bucket
        "different"          -- etags differ between the buckets
        "skipped_bucket_one" -- multipart upload in bucket one
        "skipped_bucket_two" -- multipart upload in bucket two
    '''
    bucket_one_conn = S3Connection(aws_access_key_id=bucket_one_access_key_id, aws_secret_access_key=bucket_one_secret_access_key, host=bucket_one_host)
    bucket_two_conn = S3Connection(aws_access_key_id=bucket_two_access_key_id, aws_secret_access_key=bucket_two_secret_access_key, host=bucket_two_host)
    bucket_one_bucket = bucket_one_conn.get_bucket(bucket_one_bucket_name)
    bucket_two_bucket = bucket_two_conn.get_bucket(bucket_two_bucket_name)

    results = {
        "present": [],
        "missing": [],
        "different": [],
        "skipped_bucket_one": [],
        "skipped_bucket_two": [],
    }

    bucket_one_keys = bucket_one_bucket.get_all_keys()
    number_of_keys = len(bucket_one_keys)  # fixed typo: was "numer_of_keys"
    for i, bucket_one_key in enumerate(bucket_one_keys):
        if print_status:
            print( "Processing. {percent:.2%} complete.".format(percent=float(i+1)/float(number_of_keys)) )

        # Skip "directory" placeholder keys.
        if bucket_one_key.name.endswith("/"):
            continue

        # A "-" in the etag marks a multipart upload; the etag is then not
        # a plain MD5 of the content, so it cannot be compared.
        if "-" in str(bucket_one_key.etag):
            if print_status:
                print("multipart upload for file, skipping:", bucket_one_key.name)
            results["skipped_bucket_one"].append(bucket_one_key.name)
            continue

        bucket_two_key = bucket_two_bucket.get_key(bucket_one_key.name)
        if not bucket_two_key:
            if print_status:
                print("object does not exist in GS:", bucket_one_key.name)
            results["missing"].append(bucket_one_key.name)
            continue

        if "-" in str(bucket_two_key.etag):
            if print_status:
                print("multipart upload for file, skipping:", bucket_two_key.name)
            results["skipped_bucket_two"].append(bucket_one_key.name)
            continue

        if bucket_two_key.etag == bucket_one_key.etag:
            if print_status and confirm_matches:
                print("etag matches for:", bucket_one_key.name, bucket_one_key.etag, "==", bucket_two_key.etag)
            results["present"].append(bucket_one_key.name)
        else:
            if print_status:
                # fixed: this message previously printed "==" between two
                # etags it had just determined were unequal
                print("etag differs for:", bucket_one_key.name, bucket_one_key.etag, "!=", bucket_two_key.etag)
            results["different"].append(bucket_one_key.name)
    return results
if __name__ == "__main__":
    # Compare the same-named bucket on AWS S3 (default host) and on
    # Google Cloud Storage, then dump the comparison report as JSON.
    shared_bucket_name = "bucket-name"
    comparison = compare_buckets(
        bucket_one_bucket_name=shared_bucket_name,
        bucket_two_bucket_name=shared_bucket_name,
        bucket_one_access_key_id="",
        bucket_one_secret_access_key="",
        bucket_two_access_key_id="",
        bucket_two_secret_access_key="",
        bucket_two_host="storage.googleapis.com",
        print_status=False,
        confirm_matches=False,
    )
    print(json.dumps(comparison, sort_keys=True, indent=4, separators=(',', ': ')))
@akshat0047
Copy link

This script is cool, but it could be improved for JSON files, since the etag changes when the order of the keys inside the JSON changes.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment