Skip to content

Instantly share code, notes, and snippets.

@gurchik
Last active June 2, 2023 02:59
Show Gist options
  • Save gurchik/071ef68d52d64dea08dae4f1afd168a0 to your computer and use it in GitHub Desktop.
Save gurchik/071ef68d52d64dea08dae4f1afd168a0 to your computer and use it in GitHub Desktop.
Analyze S3 Usage Report

Analyze S3 Usage Report

Format an AWS Usage Report for S3 charges to make it simpler to compare your usage to the public S3 Pricing page.

For example, the pricing page lists Tier 1 (i.e. PUT, COPY, POST, and LIST) requests at one cost and Tier 2 (GET, SELECT, and all others) requests at a different cost. However, the Usage Report lists requests by API call (e.g. GetObject, ListBucket, etc). This is a bit annoying, so this script parses them into their respective billing tiers.

Instructions

  1. Navigate to AWS Billing. Select "Cost & usage reports" then "Create a usage report"
  2. Enter "Amazon Simple Storage Service" as the Service, "All usage types" for the Usage Type, "All Operations" for the Operations. Then select a time period and granularity.
  3. Download the report as a CSV.
  4. Run `tail -1 <FILE>`. If the output is "The report for the period and values you specified was too large..." then you must select a shorter time period or a larger granularity (or both) and re-download the file.
  5. Run `python3 analyze_s3_usage_report.py <FILE>`
import sys
import csv
from collections import defaultdict
import re
# Path to the usage-report CSV file, taken from the first command-line
# argument. Evaluated at import time, so running without an argument
# raises IndexError here.
CSV_PATH = sys.argv[1]
def load_csv(path=None):
    """Load the AWS usage-report CSV and return the billable rows.

    Args:
        path: Path to the CSV file. Defaults to CSV_PATH (the first
            command-line argument) so existing callers are unaffected.

    Returns:
        A list of dicts with keys "bucket", "usage_type" and "usage_value",
        keeping only rows that have a Resource (bucket name) and whose
        UsageType is either an Out-Bytes (data transfer) or a Requests entry.

    NOTE: the column keys carry a leading space (" Resource", ...) —
    presumably the report's header line has a space after each comma.
    """
    if path is None:
        path = CSV_PATH
    # (Fixed: the original also assigned an unused `path = sys.argv[1]`
    # local that duplicated CSV_PATH.)
    rows = []
    # newline="" is the csv-module-recommended way to open CSV files.
    with open(path, newline="") as file:
        for row in csv.DictReader(file):
            parsed = {
                "bucket": row[" Resource"],
                "usage_type": row[" UsageType"],
                "usage_value": int(row[" UsageValue"]),
            }
            # Skip rows with no bucket in the Resource column.
            if not parsed["bucket"]:
                continue
            # Keep only data-transfer (Out-Bytes) and request-count rows;
            # everything else (e.g. TimedStorage-ByteHrs) is ignored.
            if "Out-Bytes" not in parsed["usage_type"] and "Requests" not in parsed["usage_type"]:
                continue
            rows.append(parsed)
    return rows
def parse_usage_type(usage):
    """Map a raw UsageType string to a human-friendly billing category.

    Based on:
    https://docs.aws.amazon.com/AmazonS3/latest/userguide/aws-usage-report-understand.html

    Raises an Exception for any UsageType not covered by the rule table.
    """
    # Ordered (regex, category-builder) rules; the first match wins.
    rules = (
        # The amount of data transferred from Amazon S3 to the internet.
        (r'(\w+-)?DataTransfer-Out-Bytes',
         lambda m: "DataTransfer-ToInternet"),
        # The amount of data transferred from AWS Region1 to AWS Region2.
        (r'(\w+-)?(\w+)-AWS-Out-Bytes',
         lambda m: f"DataTransfer-To{m.group(2)}"),
        # The amount of data transferred from Amazon S3 to Amazon EC2
        # within the same AWS Region.
        (r'(\w+-)?C3DataTransfer-Out-Bytes',
         lambda m: "DataTransfer-ToEC2-NoCharge"),
        # The amount of data transferred from an AWS Region to a
        # CloudFront distribution.
        (r'(\w+-)?CloudFront-Out-Bytes',
         lambda m: "DataTransfer-ToCloudFront-NoCharge"),
        # Request counts, bucketed by billing tier.
        (r'(\w+-)?Requests-(\w+)',
         lambda m: f"Requests-{m.group(2)}"),
        # Undocumented. From S3 Support: "Traffic went to an AWS IP, but the
        # IP is not included in any region's ranges. Because the destination
        # region is unknown, there is no charge."
        (r'(\w+-)?AMZN-Out-Bytes',
         lambda m: "DataTransfer-ToAWS-NoCharge"),
    )
    for pattern, build in rules:
        match = re.match(pattern, usage)
        if match:
            return build(match)
    raise Exception(f"Unknown usage type '{usage}'")
def parse_report(csv):
    """Aggregate parsed usage rows into {bucket: {billing_category: total}}.

    Each input row is a dict with "bucket", "usage_type" and "usage_value"
    keys (as produced by load_csv); usage values for the same bucket and
    billing category are summed.
    """
    totals = defaultdict(lambda: defaultdict(int))
    for entry in csv:
        category = parse_usage_type(entry["usage_type"])
        totals[entry["bucket"]][category] += entry["usage_value"]
    return totals
def print_report(report):
    """Write the aggregated report to stdout as CSV (one line per
    bucket/usage-type pair, preceded by a header line)."""
    print("Bucket,UsageType,UsageAmount")
    lines = (
        f"{bucket},{usage_type},{amount}"
        for bucket, usages in report.items()
        for usage_type, amount in usages.items()
    )
    for line in lines:
        print(line)
def _main():
    """Script entry point: load the usage CSV, aggregate it, print the summary."""
    # (Fixed: the original bound the loaded rows to a module-level name
    # `csv`, shadowing the imported csv module.)
    rows = load_csv()
    report = parse_report(rows)
    print_report(report)


if __name__ == "__main__":
    _main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment