Last active
May 16, 2024 15:55
-
-
Save clayg/2bf9dd63d55c1864a2aa8bbee062bc97 to your computer and use it in GitHub Desktop.
Create an AWS BadDigest exception with boto using a contrived file-like object
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from argparse import ArgumentParser | |
import sys | |
import json | |
import datetime | |
import os | |
import logging | |
import hashlib | |
import boto3 | |
from botocore.config import Config | |
# Module-level setup: debug logging, the command line definition, and the two
# 1 MiB payload buffers that reads are sliced from.
logging.basicConfig(level=logging.DEBUG)

parser = ArgumentParser()

# Positional arguments: where the object is uploaded to.
parser.add_argument('bucket', help='bucket to make the upload')
parser.add_argument('key', help='key name for large object')

# Knobs controlling the synthetic upload body and the request.
parser.add_argument(
    '--size',
    default=4,
    type=int,
    help='number of MiB to upload',
)
parser.add_argument(
    '--break-after-seek-to-begining',
    action='extend',
    nargs='+',
    type=int,
    default=[],
    help=(
        "which iterations of reads on the file will start returning "
        "invalid data. It's been observed that aws-cli likes to read the "
        "local file twice (once for md5 and also for sha256) before it "
        "reads the file again to upload."
    ),
)
parser.add_argument(
    '--remove-content-md5',
    action='store_true',
    help=(
        'normally boto will send a Content-MD5 hash TOO, but we can '
        'remove it using event hooks'
    ),
)
parser.add_argument(
    '--attempts',
    type=int,
    default=0,
    help='you should try adding some retires!',
)

# Connection settings handed to the boto client.
parser.add_argument(
    '--endpoint-url',
    default='http://saio:8080',
    help='boto client endpoint_url',
)
parser.add_argument(
    '--aws-access-key-id',
    default='test:tester',
    help='boto client aws_access_key_id',
)
parser.add_argument(
    '--aws-secret-access-key',
    default='testing',
    help='boto client aws_secret_access_key',
)

# One MiB of 0x00 bytes and one MiB of 0x01 bytes.
EXPECTED_BUFFER = bytes(2 ** 20)
INVALID_BUFFER = bytes([1]) * (2 ** 20)
def dumps(data):
    """Print *data* to stdout as JSON indented by two spaces.

    ``date`` and ``datetime`` values are written as ISO-8601 strings. Any
    other value the JSON encoder cannot handle natively is written as
    ``null`` because the fallback encoder returns ``None`` for it.
    """
    date_types = (datetime.date, datetime.datetime)

    def encode_fallback(value):
        # Only dates get a real encoding; everything else becomes None.
        return value.isoformat() if isinstance(value, date_types) else None

    rendered = json.dumps(data, indent=2, default=encode_fallback)
    print(rendered)
class CountingSeekable(object):
    """Synthetic, seekable file-like body that records how it is consumed.

    The object pretends to be a file of ``opts.size`` MiB. Every ``read``,
    ``tell``, ``seek`` and ``close`` call is appended to ``_calls``. The bytes
    handed out since the last rewind are fed into an MD5 and a SHA-256
    hasher; each ``seek(0, 0)`` and each ``close()`` stores the current
    ``(md5, sha256)`` hex digests in ``_collected_hashes`` and prints them.

    The number of ``seek(0, 0)`` calls seen so far identifies the current
    read pass. Passes whose number is listed in
    ``opts.break_after_seek_to_begining`` serve ``INVALID_BUFFER`` data; all
    other passes serve ``EXPECTED_BUFFER`` data.
    """

    def __init__(self, opts):
        """Store the parsed options and reset all bookkeeping.

        :param opts: object with an integer ``size`` (MiB) attribute and a
                     ``break_after_seek_to_begining`` container of ints.
        """
        self.opts = opts
        self._cur_pos = 0
        self._read_amount = 0
        self._body_size = opts.size * 2 ** 20
        self._calls = []
        self._number_of_seek_to_begining = 0
        self._reset_hasher()
        self._collected_hashes = []

    def _reset_hasher(self):
        """Start fresh MD5 and SHA-256 hashers for the next read pass."""
        self._md5 = hashlib.md5()
        self._sha256 = hashlib.sha256()

    def _update_hasher(self, content):
        """Feed *content* into both running hashers."""
        for hasher in (self._md5, self._sha256):
            hasher.update(content)

    def _collect_hashes(self):
        """Record and print the digests of the data served this pass."""
        md5 = self._md5.hexdigest()
        sha256 = self._sha256.hexdigest()
        self._collected_hashes.append((md5, sha256))
        print('md5: %s' % md5)
        print('sha256: %s' % sha256)

    def read(self, amount):
        """Return up to *amount* bytes of synthetic data.

        The result is a slice of a fixed module-level buffer, so a single
        call never returns more than that buffer holds, nor more than is
        left before the end of the pretend file. A negative *amount* means
        "whatever is left" (still bounded by the buffer length).

        :param amount: maximum number of bytes wanted.
        :returns: ``bytes``; empty once the position is at or past the end.
        """
        self._calls.append(('read', amount))
        # Passes listed in the option are the ones that get corrupted data.
        if (self._number_of_seek_to_begining in
                self.opts.break_after_seek_to_begining):
            buffer = INVALID_BUFFER
        else:
            buffer = EXPECTED_BUFFER
        # Clamp at zero: a negative slice bound would count from the END of
        # the buffer and hand back almost all of it instead of nothing.
        remaining = max(0, self._body_size - self._cur_pos)
        length = remaining if amount < 0 else min(amount, remaining)
        rv = buffer[:length]
        self._cur_pos += len(rv)
        self._read_amount += len(rv)
        print('boto be read(%r) => %r %r/%r' % (
            amount, len(rv),
            self._read_amount, self._body_size))
        self._update_hasher(rv)
        return rv

    def tell(self, *args, **kwargs):
        """Record the call and return the current position."""
        self._calls.append(('tell', (args, kwargs)))
        return self._cur_pos

    def seek(self, offset, whence=0):
        """Move the current position and return the new position.

        ``seek(0, 0)`` is treated as the start of a new read pass: the pass
        counter is incremented, the digests of the previous pass are
        collected and the hashers are reset.

        :param offset: offset relative to the point selected by *whence*.
        :param whence: ``os.SEEK_SET``, ``os.SEEK_CUR`` or ``os.SEEK_END``.
        :raises ValueError: for any other *whence* value.
        """
        self._calls.append(('seek', (offset, whence)))
        if whence == os.SEEK_SET:
            self._read_amount = 0
            self._cur_pos = offset
        elif whence == os.SEEK_CUR:
            self._cur_pos += offset
        elif whence == os.SEEK_END:
            # Offsets are relative to the end, so they are added (callers
            # pass zero or a negative value to land inside the file).
            self._cur_pos = self._body_size + offset
        else:
            raise ValueError('this is all jank')
        if (offset, whence) == (0, 0):
            self._number_of_seek_to_begining += 1
            self._collect_hashes()
            self._reset_hasher()
        return self._cur_pos

    def close(self):
        """Collect the digests of the final pass and record the call."""
        self._collect_hashes()
        self._calls.append(('close',))
def remove_content_md5_header(request, **kwargs):
    """Event hook that strips the ``Content-MD5`` header from *request*.

    The current headers are printed first. A request that carries no
    ``Content-MD5`` header is left untouched instead of raising.

    :param request: object exposing a mutable, mapping-like ``headers``.
    :param kwargs: extra keyword arguments passed by the event system;
                   ignored.
    """
    print(dict(request.headers))
    try:
        del request.headers['Content-MD5']
    except KeyError:
        # Nothing to remove; some header mappings raise on a missing key.
        pass
def _print_call_summary(calls):
    """Print recorded calls, collapsing each run of reads into one line.

    Consecutive ``('read', amount)`` entries are summed and printed as a
    single ``read <total>`` line; every other entry is printed as-is.

    :param calls: sequence of call tuples whose first item is the call name.
    :returns: number of read runs that were followed by a non-read call.
    """
    amt_read = 0
    read_count = 0
    for c in calls:
        if c[0] == 'read':
            amt_read += c[1]
            continue
        if amt_read > 0:
            print('read %s' % amt_read)
            amt_read = 0
            read_count += 1
        print(c)
    # A trailing run of reads is reported but not counted.
    if amt_read > 0:
        print('read %s' % amt_read)
    return read_count


def main():
    """Upload a synthetic object and report how the client consumed it.

    Parses the command line, creates the bucket (tolerating one this account
    already owns), optionally registers the hook that removes the
    ``Content-MD5`` header, uploads a ``CountingSeekable`` body and then
    prints the recorded calls and the digests of the most recent read
    passes. If the upload fails, the error is logged, a debugger session is
    opened and the exception is re-raised.
    """
    opts = parser.parse_args()
    config = Config(retries={'max_attempts': opts.attempts})
    client = boto3.client('s3', config=config, endpoint_url=opts.endpoint_url,
                          aws_access_key_id=opts.aws_access_key_id,
                          aws_secret_access_key=opts.aws_secret_access_key)
    try:
        client.create_bucket(Bucket=opts.bucket)
    except client.exceptions.BucketAlreadyOwnedByYou:
        print('bucket exists')
    if opts.remove_content_md5:
        client.meta.events.register('before-sign.s3.PutObject',
                                    remove_content_md5_header)
    fileobj = CountingSeekable(opts)
    try:
        upload_resp = client.upload_fileobj(fileobj, opts.bucket, opts.key)
    except Exception:
        # logging.exception already attaches the active traceback; passing
        # the exception as a format argument with no placeholder in the
        # message would make the logging call itself fail to format.
        logging.exception('unable to upload')
        # Deliberate: drop into the debugger to inspect the failed upload.
        import pdb
        pdb.set_trace()
        raise
    dumps(upload_resp)
    read_count = _print_call_summary(fileobj._calls)
    # Guard the zero case: a slice of [-0:] would select every entry.
    if read_count:
        for hashes in fileobj._collected_hashes[-read_count:]:
            print('hashes (%s, %s)' % hashes)
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment