Last active
March 28, 2024 04:18
-
-
Save mrtj/df487b92aa7d70f05c9fdf78b0fc80f3 to your computer and use it in GitHub Desktop.
Download a file and stream it directly to an #aws S3 bucket without saving it locally
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import boto3 | |
import requests | |
import tqdm | |
from urllib.parse import urlparse | |
def stream_to_s3(
    source_url,
    target_url,
    http_method='GET',
    multipart_threshold=1024**2,
    max_concurrency=10,
    **kwargs
):
    """Download ``source_url`` and stream the body straight into an S3 object.

    The response body is never written to local disk: the raw HTTP stream is
    handed to ``upload_fileobj`` as the file object to upload, and a tqdm
    progress bar is advanced from the upload callback.

    Args:
        source_url: HTTP(S) URL to download from.
        target_url: Destination in the form ``s3://<bucket>/<key>``.
        http_method: HTTP method used for the download request.
        multipart_threshold: Passed to ``TransferConfig``; size in bytes
            above which the upload is performed as a multipart upload.
        max_concurrency: Passed to ``TransferConfig``; maximum number of
            concurrent transfer threads.
        **kwargs: Forwarded unchanged to ``requests.Session.request``
            (e.g. ``headers``, ``auth``, ``timeout``).

    Raises:
        ValueError: If ``target_url`` does not contain both a bucket and a key.
        requests.HTTPError: If the source server answers with a 4xx/5xx status.
    """
    # Validate the destination first so that a malformed target fails fast,
    # before any network connection is opened.
    parsed = urlparse(target_url)
    target_bucket = parsed.hostname
    target_key = parsed.path.lstrip('/')
    if not target_bucket or not target_key:
        raise ValueError(
            "target_url must look like 's3://<bucket>/<key>', got %r" % (target_url,)
        )
    target_filename = target_key.split('/')[-1]

    # Import the submodule explicitly instead of relying on
    # ``boto3.s3.transfer`` happening to be loaded already.
    from boto3.s3.transfer import TransferConfig

    conf = TransferConfig(
        multipart_threshold=multipart_threshold,
        max_concurrency=max_concurrency
    )
    s3 = boto3.client('s3')

    # Both the session and the response are context managers; using them as
    # such releases the pooled connection even if the upload fails.
    with requests.Session() as session:
        with session.request(http_method, source_url, stream=True, **kwargs) as response:
            # Without this check an HTTP error page would be uploaded to S3
            # as if it were the requested file.
            response.raise_for_status()

            # A missing (or zero) Content-Length means the size is unknown;
            # ``total=None`` makes tqdm show a plain byte counter.
            # NOTE(review): for compressed responses Content-Length is
            # presumably the encoded size, so the bar may overshoot - confirm.
            object_size = int(response.headers.get('content-length', 0)) or None

            # Ask urllib3 to undo any Content-Encoding (gzip/deflate) while
            # the raw stream is being read, so the decoded bytes are stored.
            response.raw.decode_content = True

            with tqdm.tqdm(
                total=object_size,
                unit="B",
                unit_scale=True,
                desc=target_filename
            ) as pbar:
                s3.upload_fileobj(
                    response.raw,
                    target_bucket,
                    target_key,
                    Config=conf,
                    Callback=pbar.update
                )
if __name__ == '__main__':
    # Command-line entry point: takes the source and target URLs as two
    # positional arguments and hands them to stream_to_s3.
    import argparse

    cli = argparse.ArgumentParser(
        description="Stream a file from http download to S3"
    )
    positional_args = (
        ("source_url", "source http url"),
        ("target_url", "target s3 url"),
    )
    for arg_name, arg_help in positional_args:
        cli.add_argument(arg_name, type=str, help=arg_help)

    options = cli.parse_args()
    stream_to_s3(options.source_url, options.target_url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This script downloads a file from a web server and streams it directly to an AWS S3 bucket without saving it locally. It is best suited to environments such as AWS CloudShell or Lambda functions.
The code was adapted from https://gist.github.com/amalgjose/9007f5aac9e9751d595a5232fa3dd6bf#file-stream_to_s3-py.
It depends on a few well-known libraries that you can install with `pip install boto3 requests tqdm`.