Skip to content

Instantly share code, notes, and snippets.

@harrybiddle
Last active January 14, 2022 15:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save harrybiddle/bf9dc92788f49a7dea758304dd5d951a to your computer and use it in GitHub Desktop.
Save harrybiddle/bf9dc92788f49a7dea758304dd5d951a to your computer and use it in GitHub Desktop.
Compute an AWS S3 ETag locally and check if it matches the remote object
"""
This script fetches the ETag of an AWS S3 object. It then computes
the ETag locally to see if it matches.
Adapted from https://teppen.io/2018/10/23/aws_s3_verify_etags/
"""
from typing import Optional
from hashlib import md5
import boto3
ONE_MEGABYTE = 1048576


def etag_matches_buffer(
    buffer, client, bucket: str, key: str, version_id: Optional[str] = None
) -> bool:
    """Return True if the bytes in ``buffer`` reproduce the remote object's ETag.

    The remote ETag is fetched with ``client.head_object``. Two ETag shapes
    are handled:

    * no ``-`` suffix (e.g. ``"bb01862f54c5347bc6be623f237836d5"``): the local
      candidate is the plain MD5 hex digest of the buffer contents;
    * ``<hex>-<n>`` (e.g. ``"bb01862f54c5347bc6be623f237836d5-2"``): the local
      candidate is the MD5 of the concatenated per-part MD5 digests, followed
      by ``-`` and the number of parts. The part size used by the uploader is
      unknown, so several sizes aligned to 1, 2, 5, 8 and 16 MiB are tried.

    Args:
        buffer: Binary file-like object supporting ``read``, ``tell`` and
            ``seek``. It is read from its current position, and that position
            is restored before returning (also when an exception propagates).
        client: Object exposing ``head_object(Bucket=..., Key=..., ...)`` and
            returning a mapping with ``"ETag"`` and ``"ContentLength"``.
        bucket: Passed to ``head_object`` as ``Bucket``.
        key: Passed to ``head_object`` as ``Key``.
        version_id: Passed to ``head_object`` as ``VersionId`` when not None.

    Returns:
        True if any locally computed ETag equals the remote one, else False.

    Raises:
        ValueError: If the part-count suffix of the ETag is not an integer.
    """
    # get the remote etag
    response = client.head_object(
        Bucket=bucket,
        Key=key,
        **({} if version_id is None else {"VersionId": version_id}),
    )
    remote_etag = response["ETag"].strip('"')

    # store location of buffer so that we can restore it on every exit path
    start = buffer.tell()
    try:
        if "-" not in remote_etag:
            # not a multi-part ETag: compare against the plain MD5 of the
            # content, hashed incrementally to keep memory use bounded
            # NOTE(review): objects encrypted with SSE-KMS / SSE-C reportedly
            # get non-MD5 ETags and will simply not match here - confirm
            digest = md5()
            for chunk in iter(lambda: buffer.read(ONE_MEGABYTE), b""):
                digest.update(chunk)
            return digest.hexdigest() == remote_etag

        # e.g. "bb01862f54c5347bc6be623f237836d5-2" -> 2 parts
        number_parts = int(remote_etag.rpartition("-")[2])
        content_length = int(response["ContentLength"])

        # clients will upload files in chunks aligned to 1MB, or maybe 2MB,
        # or 5MB... We don't know which, so we try various different options
        tried_part_sizes = set()
        for multiplier in (1, 2, 5, 8, 16):
            alignment = multiplier * ONE_MEGABYTE
            # smallest multiple of `alignment` that is >= the average part
            # size (content_length / number_parts), in pure integer
            # arithmetic: ceil(content_length / (number_parts * alignment))
            part_size = -(-content_length // (number_parts * alignment)) * alignment
            if part_size in tried_part_sizes:
                # a different multiplier already produced this part size
                continue
            tried_part_sizes.add(part_size)

            # calculate the ETag for this candidate part size
            buffer.seek(start)
            part_digests = [
                md5(chunk).digest()
                for chunk in iter(lambda: buffer.read(part_size), b"")
            ]
            combined = md5(b"".join(part_digests)).hexdigest()
            local_etag = f"{combined}-{len(part_digests)}"
            if local_etag == remote_etag:
                return True
        return False
    finally:
        # restore the caller's position whether we matched, failed or raised
        buffer.seek(start)
if __name__ == "__main__":
    # example usage: compare a local file against an object in S3.
    # Guarded so that importing this module does not create a client,
    # open a file or issue a request.
    client = boto3.client("s3")
    with open("my_file.txt", "rb") as file:
        matches = etag_matches_buffer(
            file,
            client,
            "my_bucket",
            "path/to/my_file.txt",
        )
    print(matches)  # False/True
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment