Skip to content

Instantly share code, notes, and snippets.

@clayg
Last active May 16, 2024 15:55
Show Gist options
  • Save clayg/2bf9dd63d55c1864a2aa8bbee062bc97 to your computer and use it in GitHub Desktop.
Save clayg/2bf9dd63d55c1864a2aa8bbee062bc97 to your computer and use it in GitHub Desktop.
Create an aws BadDigest exception with boto using a contrived file-like
from argparse import ArgumentParser
import sys
import json
import datetime
import os
import logging
import hashlib
import boto3
from botocore.config import Config
# Debug-level logging shows every request/response boto makes, which is the
# whole point of this repro script.
logging.basicConfig(level=logging.DEBUG)

parser = ArgumentParser()
parser.add_argument('bucket', help='bucket to make the upload')
parser.add_argument('key', help='key name for large object')
parser.add_argument('--size', type=int, default=4,
                    help='number of MiB to upload')
# NOTE: the flag spelling "begining" (sic) is preserved because it is the
# public CLI interface; fixing it would break existing invocations.
parser.add_argument('--break-after-seek-to-begining',
                    type=int, nargs='+', action='extend', default=[],
                    help="which iterations of reads on the file "
                         "will start returning invalid data. It's been observed "
                         "that aws-cli likes to read the local file twice "
                         "(once for md5 and also for sha256) before it reads "
                         "the file again to upload.")
parser.add_argument('--remove-content-md5', action='store_true',
                    help='normally boto will send a Content-MD5 hash TOO, '
                         'but we can remove it using event hooks')
parser.add_argument('--attempts', default=0, type=int,
                    help='you should try adding some retries!')
parser.add_argument('--endpoint-url', default='http://saio:8080',
                    help='boto client endpoint_url')
parser.add_argument('--aws-access-key-id', default='test:tester',
                    help='boto client aws_access_key_id')
parser.add_argument('--aws-secret-access-key', default='testing',
                    help='boto client aws_secret_access_key')
# Two distinct 1 MiB fill patterns handed out by CountingSeekable.read().
# NOTE(review): read() returns INVALID_BUFFER by default and EXPECTED_BUFFER
# only on iterations listed in --break-after-seek-to-begining, which reads
# as the opposite of what the names suggest -- confirm intent against read();
# either way, any mismatch between passes is enough to provoke BadDigest.
EXPECTED_BUFFER = b'\x00' * 1048576
INVALID_BUFFER = b'\x01' * 1048576
def dumps(data):
    """Pretty-print *data* as indented JSON to stdout.

    date/datetime values are serialized as ISO-8601 strings.  Any other
    non-JSON-serializable value now raises TypeError; the original helper
    fell through returning None, which json silently emitted as ``null``.
    """
    def default(o):
        # json.dumps() calls this only for objects it can't serialize itself.
        if isinstance(o, (datetime.date, datetime.datetime)):
            return o.isoformat()
        raise TypeError('Object of type %s is not JSON serializable'
                        % type(o).__name__)
    print(json.dumps(data, indent=2, default=default))
class CountingSeekable(object):
    """Contrived file-like object used to provoke S3 BadDigest errors.

    Presents a body of ``opts.size`` MiB to boto3's upload machinery while
    logging every read/tell/seek/close call in ``_calls``.  Each full pass
    over the pretend file is hashed locally (md5 + sha256) and the digests
    are snapshotted into ``_collected_hashes`` whenever the caller seeks back
    to offset 0.  Passes whose seek-to-beginning count appears in
    ``opts.break_after_seek_to_begining`` return different bytes from the
    other passes, so the digest boto computed up front no longer matches the
    body it ultimately uploads.
    """

    def __init__(self, opts):
        self.opts = opts
        self._cur_pos = 0        # file offset we report to callers
        self._read_amount = 0    # bytes handed out since the last rewind
        self._body_size = opts.size * 2 ** 20  # MiB -> bytes
        self._calls = []         # chronological log of (method, args)
        # how many times the caller has done seek(0, SEEK_SET) so far;
        # compared against opts.break_after_seek_to_begining in read()
        self._number_of_seek_to_begining = 0
        self._reset_hasher()
        self._collected_hashes = []  # one (md5, sha256) pair per pass

    def _reset_hasher(self):
        # Fresh digests for the next pass over the body.
        self._md5 = hashlib.md5()
        self._sha256 = hashlib.sha256()

    def _update_hasher(self, content):
        for hasher in (self._md5, self._sha256):
            hasher.update(content)

    def _collect_hashes(self):
        # Snapshot and announce the digests of the pass that just finished.
        md5 = self._md5.hexdigest()
        sha256 = self._sha256.hexdigest()
        self._collected_hashes.append((md5, sha256))
        print('md5: %s' % md5)
        print('sha256: %s' % sha256)

    def read(self, amount):
        self._calls.append(('read', amount))
        # NOTE(review): passes listed in --break-after-seek-to-begining get
        # the zero-filled EXPECTED_BUFFER while every other pass gets the
        # one-filled INVALID_BUFFER; the names read as if they were swapped,
        # but any mismatch between passes is enough to trigger BadDigest.
        if (self._number_of_seek_to_begining in
                self.opts.break_after_seek_to_begining):
            buffer = EXPECTED_BUFFER
        else:
            buffer = INVALID_BUFFER
        # Never hand back more than remains of the pretend body.
        rv = buffer[:min(amount, self._body_size - self._cur_pos)]
        self._cur_pos += len(rv)
        self._read_amount += len(rv)
        print('boto be read(%r) => %r %r/%r' % (
            amount, len(rv),
            self._read_amount, self._body_size))
        self._update_hasher(rv)
        return rv

    def tell(self, *args, **kwargs):
        self._calls.append(('tell', (args, kwargs)))
        return self._cur_pos

    def seek(self, offset, whence=0):
        self._calls.append(('seek', (offset, whence)))
        if whence == os.SEEK_SET:
            self._read_amount = 0
            self._cur_pos = offset
        elif whence == os.SEEK_CUR:
            self._cur_pos += offset
        elif whence == os.SEEK_END:
            # BUG FIX: SEEK_END positions relative to end-of-file with a
            # (typically negative) offset, i.e. size + offset.  The original
            # used ``size - offset``, which was only correct for the
            # offset == 0 call boto makes to discover the file size.
            self._cur_pos = self._body_size + offset
        else:
            raise ValueError('this is all jank')
        if (offset, whence) == (0, 0):
            # A rewind marks the end of one full pass: record its hashes
            # and start hashing the next pass from scratch.
            self._number_of_seek_to_begining += 1
            self._collect_hashes()
            self._reset_hasher()
        return self._cur_pos

    def close(self):
        # Collect whatever was hashed since the last rewind before logging
        # the close, so the final pass is accounted for.
        self._collect_hashes()
        self._calls.append(('close',))
def remove_content_md5_header(request, **kwargs):
    """Event hook that strips the Content-MD5 header before signing.

    Registered by main() for ``before-sign.s3.PutObject`` when
    ``--remove-content-md5`` is passed; prints the headers first so the
    debug output shows what boto was about to send.
    """
    print(dict(request.headers))
    # EAFP: the header is normally present on PutObject, but don't blow up
    # the request pipeline with a KeyError if some code path didn't add it.
    try:
        del request.headers['Content-MD5']
    except KeyError:
        pass
def main():
    """Upload a deliberately inconsistent file-like object and report.

    Creates the bucket (tolerating BucketAlreadyOwnedByYou), optionally
    registers the Content-MD5-stripping hook, uploads a CountingSeekable,
    then replays the recorded call log and the per-pass hashes.  On upload
    failure it logs the traceback, drops into pdb, and re-raises.
    """
    opts = parser.parse_args()
    config = Config(retries={'max_attempts': opts.attempts})
    client = boto3.client('s3', config=config, endpoint_url=opts.endpoint_url,
                          aws_access_key_id=opts.aws_access_key_id,
                          aws_secret_access_key=opts.aws_secret_access_key)
    try:
        client.create_bucket(Bucket=opts.bucket)
    except client.exceptions.BucketAlreadyOwnedByYou:
        print('bucket exists')
    if opts.remove_content_md5:
        # strip Content-MD5 just before each PutObject is signed
        client.meta.events.register('before-sign.s3.PutObject',
                                    remove_content_md5_header)
    fileobj = CountingSeekable(opts)
    try:
        upload_resp = client.upload_fileobj(fileobj, opts.bucket, opts.key)
    except Exception:
        # BUG FIX: logging.exception() treats extra positional args as
        # %-format arguments; passing the exception with no placeholder in
        # the message raised a formatting error inside logging.  The
        # traceback is appended automatically, so no extra arg is needed.
        logging.exception('unable to upload')
        # deliberate: drop into the debugger to poke at fileobj state
        import pdb
        pdb.set_trace()
        raise
    # upload_fileobj() returns None, so this just prints "null"
    dumps(upload_resp)
    # Replay the call log, coalescing consecutive reads into one line.
    amt_read = 0
    read_count = 0
    for c in fileobj._calls:
        if c[0] == 'read':
            amt_read += c[1]
        else:
            if amt_read > 0:
                print('read %s' % amt_read)
                amt_read = 0
                read_count += 1
            print(c)
    if amt_read > 0:
        print('read %s' % amt_read)
    for hashes in fileobj._collected_hashes[-read_count:]:
        print('hashes (%s, %s)' % hashes)
if __name__ == "__main__":
    # main() returns None, so the process exits with status 0 on success.
    raise SystemExit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment