Last active
January 14, 2022 15:22
-
-
Save harrybiddle/bf9dc92788f49a7dea758304dd5d951a to your computer and use it in GitHub Desktop.
Compute an AWS S3 ETag locally and check if it matches the remote object
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This script fetches the ETag of an AWS S3 object. It then computes | |
the ETag locally to see if it matches. | |
Adapted from https://teppen.io/2018/10/23/aws_s3_verify_etags/
""" | |
from typing import Optional | |
from hashlib import md5 | |
import boto3 | |
ONE_MEGABYTE = 1048576


def _md5_of_next_part(buffer, part_size: int) -> Optional[bytes]:
    """Return the binary MD5 digest of the next *part_size* bytes of *buffer*.

    The part is consumed in blocks of at most one megabyte so that a large
    part never has to be held in memory at once. Returns ``None`` when the
    buffer is already exhausted, i.e. there is no further part to hash.
    """
    digest = md5()
    remaining = part_size
    while remaining > 0:
        block = buffer.read(min(remaining, ONE_MEGABYTE))
        if not block:
            break
        digest.update(block)
        remaining -= len(block)
    if remaining == part_size:
        # nothing could be read: end of buffer
        return None
    return digest.digest()


def etag_matches_buffer(
    buffer, client, bucket: str, key: str, version_id: Optional[str] = None
) -> bool:
    """Check whether the content of *buffer* hashes to the ETag of an S3 object.

    The remote ETag is fetched with ``client.head_object``. An ETag without a
    ``-`` is compared against the plain MD5 hex digest of the buffer. An ETag
    of the form ``<hex>-<N>`` (multi-part upload) is compared against the MD5
    of the concatenated per-part MD5 digests, for several candidate part
    sizes, because the part size used by the uploader is not known.

    NOTE(review): this assumes the ETag is MD5-based; objects encrypted with
    SSE-KMS or SSE-C are documented to carry non-MD5 ETags -- confirm before
    relying on a ``False`` result for such objects.

    Args:
        buffer: Seekable binary file-like object, read from its current
            position to its end. The position is restored before returning,
            whether or not a match was found.
        client: Object exposing ``head_object`` like a boto3 S3 client.
        bucket: Name of the bucket holding the object.
        key: Key of the object inside the bucket.
        version_id: Optional object version; omitted from the request when
            ``None``.

    Returns:
        ``True`` if a locally computed ETag equals the remote one.
    """
    # get the remote etag
    extra_args = {} if version_id is None else {"VersionId": version_id}
    response = client.head_object(Bucket=bucket, Key=key, **extra_args)
    remote_etag = response["ETag"].strip('"')

    # store location of buffer so that we can restore it afterwards
    start = buffer.tell()
    try:
        if "-" not in remote_etag:
            # e.g. "bb01862f54c5347bc6be623f237836d5": not a multi-part
            # upload, so the ETag is the MD5 of the whole content
            digest = md5()
            for block in iter(lambda: buffer.read(ONE_MEGABYTE), b""):
                digest.update(block)
            return digest.hexdigest() == remote_etag

        # e.g. "bb01862f54c5347bc6be623f237836d5-2": the suffix is the
        # number of parts of the multi-part upload
        number_parts = int(remote_etag.rpartition("-")[2])
        content_length = int(response["ContentLength"])
        # smallest possible part size (ceiling division, integers only)
        bytes_per_part = -(-content_length // number_parts)

        # clients will upload files in chunks aligned to 1MB, or maybe 2MB,
        # or 5MB... We don't know which, so we try various different options
        tried_part_sizes = set()
        for multiplier in (1, 2, 5, 8, 16):
            alignment = multiplier * ONE_MEGABYTE
            # round up to the next multiple of the alignment; a value that
            # is already aligned must be kept as is, not bumped further
            part_size = max(1, -(-bytes_per_part // alignment)) * alignment
            if part_size in tried_part_sizes:
                continue
            tried_part_sizes.add(part_size)

            buffer.seek(start)
            part_digests = []
            while True:
                part_digest = _md5_of_next_part(buffer, part_size)
                if part_digest is None:
                    break
                part_digests.append(part_digest)

            combined = md5(b"".join(part_digests)).hexdigest()
            if remote_etag == combined + "-" + str(len(part_digests)):
                return True
        return False
    finally:
        # always leave the buffer where the caller had it
        buffer.seek(start)
# Example usage: compare a local file against the copy stored in S3.
client = boto3.client("s3")

# open in binary mode: the hash is computed over the raw bytes
with open("my_file.txt", "rb") as file:
    matches = etag_matches_buffer(
        buffer=file,
        client=client,
        bucket="my_bucket",
        key="path/to/my_file.txt",
    )

print(matches)  # False/True
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment