import argparse
import os

import boto3


class S3MultipartUpload(object):
    # AWS throws EntityTooSmall error for parts smaller than 5 MB
    PART_MINIMUM = int(5e6)

    def __init__(self,
                 bucket,
                 key,
                 local_path,
                 part_size=int(15e6),
                 profile_name=None,
                 region_name="eu-west-1",
                 verbose=False):
        self.bucket = bucket
        self.key = key
        self.path = local_path
        self.total_bytes = os.stat(local_path).st_size
        self.part_bytes = part_size
        assert part_size > self.PART_MINIMUM
        assert (self.total_bytes % part_size == 0
                or self.total_bytes % part_size > self.PART_MINIMUM)
        self.s3 = boto3.session.Session(
            profile_name=profile_name, region_name=region_name).client("s3")
        if verbose:
            boto3.set_stream_logger(name="botocore")
    def abort_all(self):
        mpus = self.s3.list_multipart_uploads(Bucket=self.bucket)
        aborted = []
        uploads = mpus.get("Uploads", [])
        print("Aborting", len(uploads), "uploads")
        for u in uploads:
            aborted.append(
                self.s3.abort_multipart_upload(
                    Bucket=self.bucket, Key=u["Key"], UploadId=u["UploadId"]))
        return aborted
    def create(self):
        mpu = self.s3.create_multipart_upload(Bucket=self.bucket, Key=self.key)
        mpu_id = mpu["UploadId"]
        return mpu_id

    def upload(self, mpu_id):
        parts = []
        uploaded_bytes = 0
        with open(self.path, "rb") as f:
            i = 1
            while True:
                data = f.read(self.part_bytes)
                if not len(data):
                    break
                part = self.s3.upload_part(
                    Body=data, Bucket=self.bucket, Key=self.key, UploadId=mpu_id, PartNumber=i)
                parts.append({"PartNumber": i, "ETag": part["ETag"]})
                uploaded_bytes += len(data)
                print("{0} of {1} uploaded ({2:.3f}%)".format(
                    uploaded_bytes, self.total_bytes,
                    as_percent(uploaded_bytes, self.total_bytes)))
                i += 1
        return parts

    def complete(self, mpu_id, parts):
        result = self.s3.complete_multipart_upload(
            Bucket=self.bucket,
            Key=self.key,
            UploadId=mpu_id,
            MultipartUpload={"Parts": parts})
        return result


# Helper
def as_percent(num, denom):
    return float(num) / float(denom) * 100.0


def parse_args():
    parser = argparse.ArgumentParser(description='Multipart upload')
    parser.add_argument('--bucket', required=True)
    parser.add_argument('--key', required=True)
    parser.add_argument('--path', required=True)
    parser.add_argument('--region', default="eu-west-1")
    parser.add_argument('--profile', default=None)
    return parser.parse_args()


def main():
    args = parse_args()
    mpu = S3MultipartUpload(
        args.bucket,
        args.key,
        args.path,
        profile_name=args.profile,
        region_name=args.region)
    # abort all multipart uploads for this bucket (optional, for starting over)
    mpu.abort_all()
    # create new multipart upload
    mpu_id = mpu.create()
    # upload parts
    parts = mpu.upload(mpu_id)
    # complete multipart upload
    print(mpu.complete(mpu_id, parts))


if __name__ == "__main__":
    main()
We are working off your code for a Lambda function that pulls data from an FTP site, caches it in memory, and uploads the chunks as a multipart upload. We notice that for large files (1 GB, for example) the upload process repeats. We are thinking maybe a part fails?
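If a single part does fail (a transient network error inside Lambda, for instance), one option is to retry just that part instead of restarting the whole upload. A rough sketch, assuming the same boto3 client the gist builds and an arbitrary retry count and backoff:

import time
import botocore.exceptions

def upload_part_with_retry(s3, bucket, key, mpu_id, part_number, data, attempts=3):
    # Retry one part a few times so a transient failure does not force
    # the whole multipart upload to start over.
    for attempt in range(1, attempts + 1):
        try:
            part = s3.upload_part(
                Body=data, Bucket=bucket, Key=key,
                UploadId=mpu_id, PartNumber=part_number)
            return {"PartNumber": part_number, "ETag": part["ETag"]}
        except botocore.exceptions.ClientError:
            if attempt == attempts:
                raise
            time.sleep(2 ** attempt)  # simple backoff between attempts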
Thanks a lot for this gist, it has been a fantastic resource for me.
Great job! It will be very helpful to me.
super, A+++
Thanks a lot, this has been most useful! I have created a modified version able to resume the upload after a failure, useful if the network fails or your session credentials expire.
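For anyone looking for the general shape of a resumable upload: S3 can be asked which parts it already holds for an upload id, and a resumed run only sends the rest. A minimal sketch of that listing step (pagination handled via PartNumberMarker), illustrating the idea rather than the linked version itself:

def get_uploaded_parts(s3, bucket, key, mpu_id):
    # List the parts S3 already has for this upload id so a resumed run
    # can skip them and only upload the missing parts.
    parts = []
    kwargs = dict(Bucket=bucket, Key=key, UploadId=mpu_id)
    while True:
        resp = s3.list_parts(**kwargs)
        for p in resp.get("Parts", []):
            parts.append({"PartNumber": p["PartNumber"], "ETag": p["ETag"]})
        if not resp.get("IsTruncated"):
            return parts
        kwargs["PartNumberMarker"] = resp["NextPartNumberMarker"]

In the upload loop you would then seek the file past len(parts) * part_size bytes and continue from PartNumber len(parts) + 1.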
This is super helpful and very clean, thanks.
How would this be modified to generate a presigned URL? I'm able to generate one, but it has a signature verification error, so I'm thinking that I'm missing something that sets the algorithm / version. Here are details if anyone can help! I'm trying to use the s3 boto3 client with a minio server for multipart upload with a presigned url, because minio-py doesn't support that.

Update - I think I figured out how to add the key - the config parameter below is newly added:

from botocore.client import Config
...
s3_external = session.client(
    "s3",
    use_ssl=MINIO_SSL,
    region_name=MINIO_REGION,
    endpoint_url=MINIO_HTTP_PREFIX + MINIO_EXTERNAL_SERVER,
    verify=False,
    config=Config(signature_version='s3v4'),
)

The signed url generated now has the (previously missing) algorithm, etc. headers, however the signature doesn't match, so I'm wondering if the key generated by the client (Singularity / Sylabs scs-library-client) is different than what I am specifying - that almost must be it...

Update: I think the issue is that the signature includes the host, which is different inside (minio:9000) as opposed to outside (127.0.0.1:9000) the container, reading this post: boto/boto3#1982 (comment)
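For anyone else chasing presigned multipart uploads: the per-part URLs can be presigned with generate_presigned_url against the upload_part method. A sketch with placeholder bucket/key names, assuming mpu_id came from create_multipart_upload as in the gist:

# Presign a single upload_part call; the caller then PUTs the raw bytes
# of that part to the returned URL.
url = s3_external.generate_presigned_url(
    ClientMethod="upload_part",
    Params={
        "Bucket": "my-bucket",   # placeholder
        "Key": "my-key",         # placeholder
        "UploadId": mpu_id,
        "PartNumber": 1,
    },
    ExpiresIn=3600,
)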
Nice brother, great job
@balukrishnans are you talking to me or @teasherm? To follow up with my question above for future lurkers, it was a non-trivial thing that wound up needing a PR to the Minio Python client. Details about my particular implementation are here. And if you are referencing @teasherm, I agree, great job and thank you for posting this!
How do you get the content type of the object after the complete action?
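The content type is whatever was set when the multipart upload was created (S3 falls back to binary/octet-stream if nothing was set). A small sketch against the gist's client: pass ContentType to create_multipart_upload inside the create method, then read it back with head_object once the upload has completed ("text/csv" here is just an example value):

# Set the content type when the upload is created ...
mpu = self.s3.create_multipart_upload(
    Bucket=self.bucket, Key=self.key, ContentType="text/csv")

# ... and read it back after complete_multipart_upload has run.
head = self.s3.head_object(Bucket=self.bucket, Key=self.key)
print(head["ContentType"])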
Thank you for writing/posting this. I'm pretty sure this is the only way to nicely do a multipart upload and also have the ability to have Amazon verify the md5-sum (if you add that bit to the upload, that is). One point: the assertion that the final leftover part must also exceed the minimum isn't quite right, though, as the last part can certainly be under the AWS minimum part size; you can see that the CLI does this often by checking the ETag against the combined md5 of each part.
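For reference, that combined-md5 ETag can be reproduced locally: it is the MD5 of the concatenated binary MD5 digests of each part, with "-<part count>" appended. A small sketch, assuming the same part size the uploader used:

import hashlib

def multipart_etag(path, part_bytes):
    # Reproduce S3's multipart ETag: md5 over the concatenation of each
    # part's binary md5 digest, suffixed with "-<number of parts>".
    digests = []
    with open(path, "rb") as f:
        while True:
            data = f.read(part_bytes)
            if not data:
                break
            digests.append(hashlib.md5(data).digest())
    combined = hashlib.md5(b"".join(digests))
    return "{0}-{1}".format(combined.hexdigest(), len(digests))

Comparing that value against the ETag returned by complete_multipart_upload (which S3 wraps in quotes) is a cheap end-to-end integrity check.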
This is a gem. It's crazy how there's barely any documentation on this stuff.
Had to roll a custom multipart upload (awscli erroring out on a long upload over a faulty network), and found boto3 multipart upload poorly documented, so I'm storing example code here.