@ewels
Last active March 19, 2023 16:15
Analyse @nf-core AWS S3 storage
aws s3 ls s3://nf-core-awsmegatests --recursive > files.txt
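The script below relies on the fixed layout of aws s3 ls --recursive output: a 19-character timestamp, a right-aligned size in bytes, then the object key after a single space. A couple of illustrative lines (hypothetical paths, not real bucket contents):

2022-02-14 17:45:22  205718877 rnaseq/results-0123abcd/multiqc/multiqc_report.html
2022-02-14 17:45:22       1024 work/ab/0123abcdef/command.log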
#!/usr/bin/env python
from collections import defaultdict
import requests
import re
from rich import print
from rich.table import Table
from rich.console import Console

console = Console(record=True)

# Get pipelines
pipelines_r = requests.get("https://nf-co.re/pipelines.json")
pipelines = {}
for pipeline in pipelines_r.json()["remote_workflows"]:
    pipelines[pipeline["name"]] = pipeline
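# NOTE (assumption): each remote_workflows entry is expected to provide at
# least a "name" plus a "releases" list whose items carry a "tag_sha", since
# those are the only fields used below, e.g. something like:
#   {"name": "rnaseq", "releases": [{"tag_sha": "0123abcd..."}, ...]}
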
pipeline_inputs = defaultdict(int)
pipeline_results_expected = defaultdict(int)
pipeline_results_expected_per_release = defaultdict(lambda: defaultdict(int))
pipeline_results_unexpected = defaultdict(int)
pipeline_results_unexpected_paths = set()
work_dir = defaultdict(int)
pipeline_other = defaultdict(int)
pipeline_other_paths_sizes = defaultdict(lambda: defaultdict(int))
other_toplevel = defaultdict(int)
with open("files.txt") as fh:
for line in fh:
timestamp = line[:19] # 2022-02-14 17:45:22
filesize_col = re.search(r"\s+\d+", line[19:]).group() # 205718877
filesize = int(filesize_col.strip()) # 205718877
path = line[len(str(filesize_col)) + 20 :]
if filesize == 0:
next
root, dir1, *_ = path.split("/")
if root in pipelines:
if "input" in dir1:
pipeline_inputs[root] += filesize
elif "results-" in dir1:
if dir1[8:] in [
release["tag_sha"] for release in pipelines[root]["releases"]
]:
pipeline_results_expected[root] += filesize
pipeline_results_expected_per_release[root][dir1] += filesize
else:
pipeline_results_unexpected[root] += filesize
pipeline_results_unexpected_paths.add(f"{root}/{dir1}")
else:
pipeline_other[root] += filesize
pipeline_other_paths_sizes[root][dir1] += filesize
else:
if root == "work":
work_dir[dir1] += filesize
else:
other_toplevel[root] += filesize
pipeline_results_expected_avg_size = {}
pipeline_results_releases = {}
pipeline_results_releases_total = 0
for pipeline, releases in pipeline_results_expected_per_release.items():
    avg_size = sum(releases.values()) / len(releases)
    pipeline_results_expected_avg_size[pipeline] = f"{avg_size/1000000000:.2f}GB"
    pipeline_results_releases[pipeline] = len(releases)
    pipeline_results_releases_total += len(releases)
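# Worked example (illustrative figures only): a pipeline with two releases of
# 10 GB and 20 GB of results averages (10 + 20) / 2 = 15 GB, shown as "15.00GB".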

def print_table(
    title,
    data,
    extra_col=None,
    extra_col_title=None,
    extra_col_footer="",
    extra_extra_col=None,
    extra_extra_col_title=None,
    extra_extra_col_footer="",
):
    """Print one rich table of per-pipeline sizes and return the total in bytes."""
    table = Table(title=title)
    table.add_column("Pipeline", style="magenta")
    table.add_column("File size", justify="right", style="green")
    if extra_col:
        table.add_column(extra_col_title, justify="right", style="yellow")
    if extra_extra_col:
        table.add_column(extra_extra_col_title, justify="right", style="cyan")
    total = 0
    for k, v in sorted(data.items(), key=lambda x: x[1], reverse=True):
        row = [k, f"{(v/1000000000):.2f}GB"]
        if extra_col:
            row.append(extra_col.get(k))
        if extra_extra_col:
            row.append(str(extra_extra_col.get(k)))
        table.add_row(*row)
        total += v
    table.add_section()
    footer_row = [f"{len(data)} Pipelines", f"{(total/1000000000):.2f}GB"]
    if extra_col:
        footer_row.append(str(extra_col_footer))
    if extra_extra_col:
        footer_row.append(str(extra_extra_col_footer))
    table.add_row(*footer_row, style="bold")
    console.print(table)
    console.print("\n\n")
    return total

# Input data
grand_total = 0
grand_total += print_table("Pipeline Input Data", pipeline_inputs)
grand_total += print_table(
    "Results from Release Commits",
    pipeline_results_expected,
    pipeline_results_expected_avg_size,
    "Avg size per release",
    "",
    pipeline_results_releases,
    "Number of releases",
    pipeline_results_releases_total,
)
grand_total += print_table(
    "Results from unexpected commits", pipeline_results_unexpected
)
grand_total += print_table("Unexpected stuff in pipeline directories", pipeline_other)
for pipeline, size in pipeline_other.items():
    if size > 10000000000:  # only break down pipelines with more than 10 GB of extra data
        print_table(
            f"Unexpected stuff in {pipeline}",
            pipeline_other_paths_sizes[pipeline],
        )
grand_total += print_table("Work directory", work_dir)
grand_total += print_table("Other random pipeline data", other_toplevel)
console.print(f"GRAND TOTAL: {grand_total/1000000000:.2f}GB")
with open("unexpected_results.txt", "w") as fh:
fh.write("\n".join(sorted(pipeline_results_unexpected_paths)))
# console.save_html("report.html")
console.save_text("report.txt")