Skip to content

Instantly share code, notes, and snippets.

@h5rdly
Last active July 7, 2020 16:31
Show Gist options
  • Save h5rdly/30e3dbb28f46ca5fdc404155438017c9 to your computer and use it in GitHub Desktop.
Save h5rdly/30e3dbb28f46ca5fdc404155438017c9 to your computer and use it in GitHub Desktop.
Download a file from S3 using "vanilla" standard library Python
import hashlib, hmac, socket, ssl
from datetime import datetime
try:
from urlparse import urlsplit
except:
from urllib.parse import urlsplit
# Algorithm identifier used in the string-to-sign and the Authorization header.
ALGORTHM = 'AWS4-HMAC-SHA256'


def sign(key, msg):
    """Return the raw HMAC-SHA256 digest of text ``msg`` keyed with bytes ``key``."""
    return hmac.new(key, msg.encode('utf-8'), hashlib.sha256).digest()
def sign_headers(headers, url, access_key, secret_key, region = 'us-east-1'):
    """Sign a GET request to S3 using AWS Signature Version 4.

    The payload is declared as ``UNSIGNED-PAYLOAD``, so only the request
    line and the headers are covered by the signature.

    Args:
        headers: dict of request headers to sign. It is updated in place with
            ``Host``, ``X-Amz-Date``, ``X-Amz-Content-Sha256`` and
            ``Authorization``. Every header present in the dict is signed, so
            each one must be sent with exactly the same value.
        url: full URL of the request. Its path and query string are used
            as-is, so they must already be URI-encoded (and the query
            parameters sorted) the way they will appear on the wire.
        access_key: AWS access key id.
        secret_key: AWS secret access key.
        region: AWS region used in the credential scope.

    Returns:
        The same ``headers`` dict, now containing the signature headers.
    """
    method = 'GET'
    payload_hash = u'UNSIGNED-PAYLOAD'

    # Host, canonical path and the timestamp strings used by AWS
    parsed_url = urlsplit(url)
    host = parsed_url.netloc
    # SigV4 requires '/' as the canonical URI when the URL has no path
    canonical_path = parsed_url.path or '/'
    now = datetime.utcnow()
    aws_datetime = now.strftime("%Y%m%dT%H%M%SZ")
    aws_date = now.strftime("%Y%m%d")

    # Scope, scoped credential string, and the derived signing key
    scope = '/'.join([aws_date, region, 's3', 'aws4_request'])
    credential = '/'.join([access_key, scope])
    signing_key = ('AWS4' + secret_key).encode('utf-8')
    for scope_part in (aws_date, region, 's3', 'aws4_request'):
        signing_key = sign(signing_key, scope_part)

    # Drop a signature left over from a previous call so it is not signed itself
    headers.pop('Authorization', None)

    # Fill in all headers except 'Authorization'
    headers['Host'] = host
    headers['X-Amz-Date'] = aws_datetime
    headers['X-Amz-Content-Sha256'] = payload_hash

    # Canonical headers: lowercase names, sorted by that lowercase name,
    # values trimmed with runs of whitespace collapsed to one space
    canonical_headers = sorted(
        (name.lower().strip(), ' '.join(str(value).split()))
        for name, value in headers.items()
    )
    signed_headers = ';'.join(name for name, _ in canonical_headers)
    canonical_header_lines = [name + ':' + value for name, value in canonical_headers]

    # Generate the canonical request and the string to be signed
    canonical_req = '\n'.join(
        [method, canonical_path, parsed_url.query]
        + canonical_header_lines
        + ['', signed_headers, payload_hash]  # '' leaves a blank line after the headers
    )
    hashed_req = hashlib.sha256(canonical_req.encode('utf-8')).hexdigest()
    string_to_sign = '\n'.join([ALGORTHM, aws_datetime, scope, hashed_req])
    signature = hmac.new(signing_key, string_to_sign.encode('utf-8'), hashlib.sha256).hexdigest()

    # Finally generate the Authorization header carrying the signature
    headers['Authorization'] = (
        ALGORTHM + ' Credential=' + credential
        + ', ' + 'SignedHeaders=' + signed_headers
        + ', ' + 'Signature=' + signature
    )
    return headers
def download_s3_chunk(bucket, key, start, end, access_key, secret_key, endpoint = 'https://s3.amazonaws.com', region = 'us-east-1'):
    """Download part of an S3 stored file using vanilla Python.

    Sends one signed, path-style ``GET /<bucket>/<key>`` request with a
    ``Range`` header over TLS and reads the response until the server closes
    the connection.

    Args:
        bucket: bucket name.
        key: object key; it is URI-encoded before being signed and sent.
        start: first byte offset of the requested range.
        end: last byte offset of the requested range.
        access_key: AWS access key id.
        secret_key: AWS secret access key.
        endpoint: base URL of the S3 endpoint. The connection always uses TLS,
            on the URL's port or 443 when none is given.
        region: AWS region used for signing.

    Returns:
        A ``(header, body)`` tuple of bytes: the raw status line and response
        headers, and everything after the blank line that ends them. The
        status is not checked here; inspect ``header`` for errors. The body is
        returned undecoded (no chunked or gzip decoding is done).
    """
    try:
        from urllib import quote  # Python 2
    except ImportError:
        from urllib.parse import quote  # Python 3

    # Sign the URL of the object itself so the canonical path matches the
    # request line; '~' is kept unescaped as SigV4 treats it as unreserved
    url = endpoint.rstrip('/') + '/' + bucket + '/' + quote(key, safe='/~')
    parsed_url = urlsplit(url)
    host = parsed_url.hostname
    port = parsed_url.port or 443

    headers = {'Range': 'bytes={}-{}'.format(start, end), 'User-Agent': 'ssup'}
    headers = sign_headers(headers, url, access_key, secret_key, region)

    # Raw message to send via socket. Signed headers are taken from the
    # signed dict so the wire values match what was signed.
    s3_message_parts = ['GET {} HTTP/1.1'.format(parsed_url.path),
                        'Host: {}'.format(headers['Host']),
                        # 'close' lets the read loop below stop at end of stream
                        'Connection: close',
                        'Accept-Encoding: gzip, deflate',
                        'Accept: */*',
                        'User-Agent: {}'.format(headers['User-Agent']),
                        'X-Amz-Content-Sha256: {}'.format(headers['X-Amz-Content-Sha256']),
                        'Range: {}'.format(headers['Range']),
                        'X-Amz-Date: {}'.format(headers['X-Amz-Date']),
                        'Authorization: {}'.format(headers['Authorization']),
                        '\r\n']
    s3_download_message = '\r\n'.join(s3_message_parts)

    # Default context verifies the server certificate and hostname
    context = ssl.create_default_context()
    s = socket.create_connection((host, port))
    try:
        s = context.wrap_socket(s, server_hostname=host)
        s.sendall(s3_download_message.encode('utf-8'))
        received = []
        while True:
            data = s.recv(65536)
            if not data:
                break
            received.append(data)
    finally:
        s.close()

    header, _, body = b''.join(received).partition(b'\r\n\r\n')
    return header, body
if __name__ == '__main__':
    # TODO: take these values from the command line instead of hard-coding them
    from sys import argv as args

    # Placeholder credentials
    access_key, secret_key = 'access', 'secret'

    # Which object to read
    bucket, key = 'my_bucket', 'my_key'

    # Byte range of the object to request
    start, end = 20, 100

    header, chunk = download_s3_chunk(
        bucket,
        key,
        start,
        end,
        access_key=access_key,
        secret_key=secret_key,
    )
@azzamsa
Copy link

azzamsa commented Sep 26, 2019

Why don't you use `requests` instead?

@h5rdly
Copy link
Author

h5rdly commented May 1, 2020

Not sure I understand, are you talking about using boto3?

I wanted to understand how the communication with S3 works conceptually, and boto was a bit heavy.

@h5rdly
Copy link
Author

h5rdly commented Jul 7, 2020

Sorry, I understand now you were referring to using socket. I wanted to go vanilla both to understand the exact mechanics, and to make sure it runs anywhere with Python. Perhaps urllib can be used here to save some code while not requiring anything outside of Python.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment