Skip to content

Instantly share code, notes, and snippets.

@alukach
Last active October 6, 2023 18:40
Show Gist options
  • Save alukach/1a2b8b6366410fb94fa5cee7f72ee304 to your computer and use it in GitHub Desktop.
Save alukach/1a2b8b6366410fb94fa5cee7f72ee304 to your computer and use it in GitHub Desktop.
Parsing S3 Inventory results in Python
#! /usr/bin/env python3
"""
A utility to stream records from one or many S3 Inventory reports, with a progress bar.
./parse-inventory-progress s3://my-bucket/path/to/my/inventory/2019-12-15T00-00Z/manifest.json > out.csv
"""
import json
import csv
import gzip
import sys
import urllib.parse
import boto3
from tqdm import tqdm
# Module-level S3 resource handle; list_keys() uses it for every object fetch.
s3 = boto3.resource('s3')
def list_keys(bucket, manifest_key):
    """Yield every CSV row from the inventory data files listed in a manifest.

    The manifest is fetched from ``bucket``/``manifest_key`` and each entry in
    its ``files`` array is read from the same ``bucket``, gunzipped on the fly
    and parsed with :mod:`csv`. Rows are streamed lazily; nothing is buffered
    beyond what gzip/csv need internally.

    Args:
        bucket: Name of the bucket holding both the manifest and the data files.
        manifest_key: Object key of the inventory ``manifest.json``.

    Yields:
        One list of string fields per CSV record, exactly as parsed.

    NOTE(review): this assumes a CSV-format inventory. AWS documents that the
    key column of CSV inventories is URL-encoded; fields are yielded as-is and
    are not decoded here.
    """
    manifest_body = s3.Object(bucket, manifest_key).get()['Body']
    try:
        manifest = json.load(manifest_body)
    finally:
        # Release the HTTP stream even if the manifest is not valid JSON.
        manifest_body.close()

    for entry in manifest['files']:
        data_body = s3.Object(bucket_name=bucket, key=entry['key']).get()['Body']
        try:
            # Explicit UTF-8 avoids depending on the locale's default encoding;
            # newline='' lets the csv module handle newlines inside quoted
            # fields itself.
            with gzip.open(
                data_body, mode='rt', encoding='utf-8', newline=''
            ) as text:
                yield from csv.reader(text)
        finally:
            # Closing the gzip wrapper does not close the underlying stream,
            # so close it explicitly to avoid leaking one connection per file.
            data_body.close()
if __name__ == '__main__':
    # Call with one or more S3 URLs pointing at inventory manifests, e.g.
    #   ./parse-inventory s3://my-bucket/path/to/my/inventory/2019-12-15T00-00Z/manifest.json
    #
    # Records are written to stdout as real CSV (quoted where necessary) so
    # that redirecting to a .csv file stays parseable even when a field
    # contains spaces, commas or quotes. The tqdm progress bar is left on its
    # default output stream.
    writer = csv.writer(sys.stdout, lineterminator='\n')
    for s3_url in sys.argv[1:]:
        url = urllib.parse.urlparse(s3_url)
        rows = tqdm(
            # The URL host is the bucket; the path minus its leading slash is
            # the manifest's object key.
            list_keys(url.hostname, url.path.lstrip('/')),
            desc=s3_url,
            dynamic_ncols=True,
        )
        # Write rows whole instead of unpacking `bucket, key, *rest`, which
        # raised ValueError on any record with fewer than two fields.
        writer.writerows(rows)
#! /usr/bin/env python3
"""
A utility to stream records from one or many S3 Inventory reports.
./parse-inventory s3://my-bucket/path/to/my/inventory/2019-12-15T00-00Z/manifest.json > out.csv
"""
import json
import csv
import gzip
import sys
import urllib.parse
import boto3
# Module-level S3 resource handle; list_keys() uses it for every object fetch.
s3 = boto3.resource('s3')
def list_keys(bucket, manifest_key):
    """Stream the CSV records of every data file referenced by a manifest.

    Loads the JSON manifest at ``bucket``/``manifest_key``, then for each
    entry of its ``files`` array opens the gzip-compressed object of that key
    in the same ``bucket`` and parses it with :mod:`csv`, yielding rows one at
    a time.

    Args:
        bucket: Name of the bucket holding both the manifest and the data files.
        manifest_key: Object key of the inventory ``manifest.json``.

    Yields:
        One list of string fields per CSV record, exactly as parsed.

    NOTE(review): this assumes a CSV-format inventory. AWS documents that the
    key column of CSV inventories is URL-encoded; fields are yielded as-is and
    are not decoded here.
    """
    manifest_body = s3.Object(bucket, manifest_key).get()['Body']
    try:
        manifest = json.load(manifest_body)
    finally:
        # Release the HTTP stream even if the manifest is not valid JSON.
        manifest_body.close()

    for entry in manifest['files']:
        data_body = s3.Object(bucket_name=bucket, key=entry['key']).get()['Body']
        try:
            # Explicit UTF-8 avoids depending on the locale's default encoding;
            # newline='' lets the csv module handle newlines inside quoted
            # fields itself.
            with gzip.open(
                data_body, mode='rt', encoding='utf-8', newline=''
            ) as text:
                yield from csv.reader(text)
        finally:
            # Closing the gzip wrapper does not close the underlying stream,
            # so close it explicitly to avoid leaking one connection per file.
            data_body.close()
if __name__ == '__main__':
    # Each command-line argument is an s3:// URL of an inventory manifest.
    # Records are written to stdout as real CSV (quoted where necessary) so
    # that redirecting to a .csv file stays parseable even when a field
    # contains spaces, commas or quotes.
    writer = csv.writer(sys.stdout, lineterminator='\n')
    for s3_url in sys.argv[1:]:
        url = urllib.parse.urlparse(s3_url)
        # The URL host is the bucket; the path minus its leading slash is the
        # manifest's object key. Rows are written whole instead of unpacking
        # `bucket, key, *rest`, which raised ValueError on short records.
        writer.writerows(list_keys(url.hostname, url.path.lstrip('/')))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment