Skip to content

Instantly share code, notes, and snippets.

@HariSekhon
Forked from tomkinsc/compare_buckets.py
Created October 18, 2019 14:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save HariSekhon/e0a9732ce6d84839606525b2c5517b68 to your computer and use it in GitHub Desktop.
Save HariSekhon/e0a9732ce6d84839606525b2c5517b68 to your computer and use it in GitHub Desktop.
This is a quick and dirty script to compare two different S3-compatible buckets: just drop in the bucket names and credentials, and optionally change the endpoint host.
#!/usr/bin/python
import boto, json
from boto.s3.connection import S3Connection
from boto.gs.connection import GSConnection
def compare_buckets(bucket_one_bucket_name,
                    bucket_two_bucket_name,
                    bucket_one_access_key_id,
                    bucket_one_secret_access_key,
                    bucket_two_access_key_id,
                    bucket_two_secret_access_key,
                    bucket_one_host="s3.amazonaws.com",
                    bucket_two_host="s3.amazonaws.com",
                    print_status=False,
                    confirm_matches=False):
    '''
    Compare the objects of bucket one against bucket two by ETag.

    Every key listed in bucket one whose name does not end in "/" is
    looked up by name in bucket two and filed into exactly one result
    list. An ETag containing "-" is treated as a multi-part upload and
    the object is skipped instead of compared. The comparison is
    one-directional: objects that exist only in bucket two are not
    reported. It may be helpful for migrating from S3 to GS.

    Parameters:
        bucket_one_bucket_name / bucket_two_bucket_name: bucket names.
        bucket_one_access_key_id / bucket_one_secret_access_key:
            credentials used to connect to bucket one.
        bucket_two_access_key_id / bucket_two_secret_access_key:
            credentials used to connect to bucket two.
        bucket_one_host / bucket_two_host: endpoint host for each
            connection (both default to "s3.amazonaws.com").
        print_status: when true, print a progress line per key and a
            line for each differing, missing or skipped object.
        confirm_matches: when true (and print_status is true), also
            print a line for each object whose ETags match.

    Returns:
        dict mapping each of "present", "missing", "different",
        "skipped_bucket_one" and "skipped_bucket_two" to a list of
        key names taken from bucket one.

    Errors raised by the connection and bucket calls are not caught
    here and propagate to the caller.
    '''
    bucket_one_conn = S3Connection(
        aws_access_key_id=bucket_one_access_key_id,
        aws_secret_access_key=bucket_one_secret_access_key,
        host=bucket_one_host)
    bucket_two_conn = S3Connection(
        aws_access_key_id=bucket_two_access_key_id,
        aws_secret_access_key=bucket_two_secret_access_key,
        host=bucket_two_host)
    bucket_one_bucket = bucket_one_conn.get_bucket(bucket_one_bucket_name)
    bucket_two_bucket = bucket_two_conn.get_bucket(bucket_two_bucket_name)

    results = {
        "present": [],
        "missing": [],
        "different": [],
        "skipped_bucket_one": [],
        "skipped_bucket_two": [],
    }

    # Bucket.list() follows the listing pagination for us; get_all_keys()
    # returns a single page only, which silently truncated large buckets.
    # The listing is materialized so the total is known for the progress line.
    bucket_one_keys = list(bucket_one_bucket.list())
    number_of_keys = len(bucket_one_keys)

    for i, bucket_one_key in enumerate(bucket_one_keys):
        name = bucket_one_key.name
        if print_status:
            print("Processing. {percent:.2%} complete.".format(
                percent=float(i + 1) / float(number_of_keys)))

        # Names ending in "/" are ignored entirely (not recorded anywhere).
        if name.endswith("/"):
            continue

        # A "-" in the ETag marks a multi-part upload; such ETags are not
        # compared, so the object is only recorded as skipped.
        if "-" in str(bucket_one_key.etag):
            if print_status:
                print("multipart upload for file, skipping:", name)
            results["skipped_bucket_one"].append(name)
            continue

        bucket_two_key = bucket_two_bucket.get_key(name)
        if not bucket_two_key:
            if print_status:
                print("object does not exist in GS:", name)
            results["missing"].append(name)
            continue

        if "-" in str(bucket_two_key.etag):
            if print_status:
                print("multipart upload for file, skipping:", bucket_two_key.name)
            results["skipped_bucket_two"].append(name)
            continue

        if bucket_two_key.etag == bucket_one_key.etag:
            if print_status and confirm_matches:
                print("etag matches for:", name, bucket_one_key.etag, "==", bucket_two_key.etag)
            results["present"].append(name)
        else:
            if print_status:
                # The two ETags differ here, so show "!=" between them.
                print("etag differs for:", name, bucket_one_key.etag, "!=", bucket_two_key.etag)
            results["different"].append(name)

    return results
if __name__ == "__main__":
    # Placeholder configuration: fill in the bucket name and both sets of
    # credentials before running. The same bucket name is used on both
    # sides; bucket one keeps compare_buckets' default host while bucket
    # two is pointed at storage.googleapis.com.
    bucket_name = "bucket-name"
    comparison_args = {
        "bucket_one_bucket_name": bucket_name,
        "bucket_two_bucket_name": bucket_name,
        "bucket_one_access_key_id": "",
        "bucket_one_secret_access_key": "",
        "bucket_two_access_key_id": "",
        "bucket_two_secret_access_key": "",
        "bucket_two_host": "storage.googleapis.com",
        "print_status": False,
        "confirm_matches": False,
    }
    results = compare_buckets(**comparison_args)

    # Emit the result lists as pretty-printed JSON with sorted keys.
    report = json.dumps(results, sort_keys=True, indent=4, separators=(',', ': '))
    print(report)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment