Skip to content

Instantly share code, notes, and snippets.

@amacal
Last active November 11, 2020 08:32
Show Gist options
  • Save amacal/4a8a2661308df55fce63b97b1d3c948d to your computer and use it in GitHub Desktop.
Save amacal/4a8a2661308df55fce63b97b1d3c948d to your computer and use it in GitHub Desktop.
import boto3
from ftplib import FTP
# --- Transfer state shared with the callbacks defined below ---------------
# Size of each S3 part, and the block size requested from the FTP server.
chunksize = 32 * 1024 * 1024
# Name of the remote file; also used as the S3 object key.
filename = 'enwiki-20201020-langlinks.sql.gz'
# ETag / PartNumber records collected for the final "complete" call.
parts = []
# Number assigned to the next uploaded part (S3 part numbers start at 1).
part = 1
# Bytes received from FTP that have not been sent to S3 yet.
buffer = bytearray()

# --- Remote endpoints ------------------------------------------------------
s3client = boto3.client('s3')

# login() is called without credentials, then we move to the dump directory.
ftp = FTP('ftp.acc.umu.se')
ftp.login()
ftp.cwd('mirror/wikimedia.org/dumps/enwiki/20201020/')

# Open the multipart upload; its UploadId ties all later part calls together.
upload = s3client.create_multipart_upload(
    Bucket='la-labs-279215538049',
    Key=filename,
)
print(f'Upload started {upload["UploadId"]} ...')
def append_chunk(chunk):
    """Buffer one block received from FTP and flush complete parts to S3.

    Passed to ``ftp.retrbinary`` as the data callback, so it is invoked once
    per block read from the data connection.

    Args:
        chunk: Bytes of the remote file delivered by the FTP transfer.
    """
    global buffer
    buffer += chunk
    # Drain every complete part, not just one: this keeps the buffer below
    # ``chunksize`` even if a delivered block is larger than a single part.
    # Each upload_chunk() call removes ``chunksize`` bytes from the buffer.
    while len(buffer) >= chunksize:
        upload_chunk()
def upload_chunk():
    """Upload the leading ``chunksize`` bytes of the buffer as the next part.

    Sends the data with ``upload_part``, records the returned ETag together
    with the part number in ``parts``, drops the uploaded bytes from the
    buffer and advances the part counter. When fewer than ``chunksize`` bytes
    are buffered, everything that is buffered is sent.
    """
    global part, buffer
    payload = buffer[:chunksize]
    response = s3client.upload_part(
        Bucket='la-labs-279215538049',
        Key=filename,
        UploadId=upload['UploadId'],
        PartNumber=part,
        Body=payload,
    )
    # The ETag/PartNumber pairs are what complete_multipart_upload receives.
    parts.append({'ETag': response['ETag'], 'PartNumber': part})
    print(f'Part {part} completed with {response["ETag"]}')
    # Keep only the bytes that were not part of this upload.
    buffer = buffer[chunksize:]
    part += 1
# Stream the remote file through append_chunk, then finish the S3 upload.
try:
    ftp.retrbinary(f'RETR {filename}', append_chunk, blocksize=chunksize)
    # Whatever is left in the buffer is shorter than a full part; send it as
    # the final part.
    if buffer:
        upload_chunk()
    s3client.complete_multipart_upload(
        Bucket='la-labs-279215538049',
        Key=filename,
        UploadId=upload['UploadId'],
        MultipartUpload={'Parts': parts},
    )
except BaseException:
    # Abort on any failure (including Ctrl-C during the long transfer) so the
    # parts uploaded so far are not left behind in the bucket, then re-raise
    # the original error.
    s3client.abort_multipart_upload(
        Bucket='la-labs-279215538049',
        Key=filename,
        UploadId=upload['UploadId'],
    )
    print(f'Upload aborted {upload["UploadId"]}')
    raise
finally:
    # Release the FTP control connection whether or not the transfer worked.
    ftp.close()
print(f'Upload completed {upload["UploadId"]}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment