Skip to content

Instantly share code, notes, and snippets.

@PsycheShaman
Created September 13, 2021 06:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save PsycheShaman/8f40077aebfd3c7cbb9fd0f04dbae31c to your computer and use it in GitHub Desktop.
Save PsycheShaman/8f40077aebfd3c7cbb9fd0f04dbae31c to your computer and use it in GitHub Desktop.
Python function to unzip every .zip archive stored under an S3 prefix into a specified destination folder in the same S3 bucket.
import boto3
import io
import zipfile
def unzip_s3_source_data(
    s3_source_bucket: str,
    s3_source_folder: str,
    s3_destination_folder: str
) -> None:
    """
    Unzip every ``.zip`` archive found under an S3 prefix.

    Lists the non-empty objects under ``s3_source_folder`` in
    ``s3_source_bucket``, downloads each key ending in ``.zip`` fully into
    memory, and uploads every file member of the archive to
    ``{s3_destination_folder}/{member name}`` in the SAME bucket. Progress
    is reported with ``print``.

    Members that share a name across archives map to the same destination
    key, so a later upload replaces an earlier one.

    Args:
        s3_source_bucket: Bucket that holds the archives and also receives
            the extracted files.
        s3_source_folder: Key prefix searched for ``.zip`` objects.
        s3_destination_folder: Key prefix for the extracted files. Trailing
            slashes are ignored; an empty value writes members directly
            under their own names.

    Raises:
        Exception: Any error raised while reading or uploading an archive
            member is printed and then re-raised unchanged. Errors from
            listing, downloading or opening an archive propagate without
            extra output.
    """
    print('*'*100)
    print(f'Listing files in s3://{s3_source_bucket}\n')
    print('='*100)
    s3 = boto3.client('s3')
    # A single list_objects_v2 call returns at most 1000 keys, so walk every
    # page. A listing with no matches has no 'Contents' entry at all, hence
    # the .get() with an empty default.
    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(
        Bucket=s3_source_bucket,
        Prefix=s3_source_folder,
    )
    s3_files = []
    for page in pages:
        for obj in page.get('Contents', []):
            # Zero-byte objects are skipped, as in the original listing.
            if obj['Size'] != 0:
                s3_files.append(obj['Key'])
    s3_source_files = '\n'.join(s3_files)
    print(f'Source files in s3://{s3_source_bucket} are: {s3_source_files}\n')
    print('='*100)
    # Normalise once so 'folder/' does not yield 'folder//member' keys.
    destination_prefix = s3_destination_folder.rstrip('/')
    for raw_data_filename in s3_files:
        if not raw_data_filename.endswith('.zip'):
            continue
        print(f'Downloading {raw_data_filename}\n')
        print('='*100)
        # ZipFile needs a seekable file object, so the whole archive is
        # buffered in memory before it is opened.
        _bytes = io.BytesIO(
            s3.get_object(
                Bucket=s3_source_bucket,
                Key=raw_data_filename
            )['Body'].read()
        )
        with zipfile.ZipFile(_bytes) as _zip:
            # Directory entries carry no data; uploading them would only
            # create empty placeholder objects.
            zip_file_names = [
                info.filename
                for info in _zip.infolist()
                if not info.is_dir()
            ]
            filelist = '\n'.join(zip_file_names)
            print('Zip archive contains:\n')
            print(f'{filelist}\n')
            print('='*100)
            for f in zip_file_names:
                if destination_prefix:
                    destination_key = f'{destination_prefix}/{f}'
                else:
                    destination_key = f
                print(f'Uploading unzipped file {f} to {destination_key}\n')
                print('='*100)
                try:
                    s3.put_object(
                        Body=_zip.read(f),
                        Bucket=s3_source_bucket,
                        Key=destination_key,
                    )
                except Exception as e:
                    print(f'Uncaught exception in file upload: {e}\n')
                    print('!*'*100)
                    # Bare raise keeps the original traceback intact.
                    raise
    print('Success!')
    print('*'*100)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment