Skip to content

Instantly share code, notes, and snippets.

@scottstanie
Created December 4, 2023 21:03
Show Gist options
  • Save scottstanie/b1508ee1c24427ccdc0ac967aaab8569 to your computer and use it in GitHub Desktop.
Save scottstanie/b1508ee1c24427ccdc0ac967aaab8569 to your computer and use it in GitHub Desktop.
Pretty print the size on disk + compression ratio of HDF5 datasets with h5py and rich
#!/usr/bin/env python
# requirements: h5py, rich
import sys
import h5py
from rich.console import Console
from rich.table import Table
def _get_object_metadata(obj) -> tuple[str, float, float] | None:
    """Return size information for a filtered, chunked HDF5 dataset.

    Args:
        obj: Any object visited in an HDF5 file (group, dataset, ...).

    Returns:
        ``(obj.name, obj.nbytes, storage_size)`` when *obj* is an
        ``h5py.Dataset`` whose ``chunks`` is set, whose creation property
        list has at least one filter, and whose storage size is non-zero.
        ``None`` for everything else.
    """
    is_chunked_dataset = isinstance(obj, h5py.Dataset) and obj.chunks is not None
    if not is_chunked_dataset:
        return None

    # Datasets with an empty filter pipeline are skipped.
    if not obj.id.get_create_plist().get_nfilters():
        return None

    # A zero storage size would make the caller's ratio undefined, so such
    # datasets are reported as "nothing to show".
    on_disk = obj.id.get_storage_size()
    if on_disk == 0:
        return None

    return obj.name, obj.nbytes, on_disk
def display_compression_ratios(file_path):
    """Print a table of storage size and compression ratio per dataset.

    Opens the HDF5 file read-only, visits every object in it, and adds one
    row for each dataset that ``_get_object_metadata`` accepts. A final
    ``<total chunked datasets>`` row sums the accepted datasets. The table
    is rendered with a ``rich`` console.

    Args:
        file_path: Path of the HDF5 file to inspect.

    Notes:
        When no dataset is accepted, the total storage size is zero. The
        total ratio is then shown as ``n/a`` rather than dividing by zero.
    """
    bytes_per_mib = 2**20

    table = Table(title=f"Compression Ratios for HDF5 File: {file_path}")
    table.add_column("Dataset", justify="left")
    table.add_column("Storage Size (MiB)", justify="right")
    table.add_column("Compression Ratio", justify="right")

    total_stor_size = 0
    total_nbytes = 0

    def add_to_table(name, obj):
        # ``name`` is required by the ``visititems`` callback signature; the
        # row uses the name returned by ``_get_object_metadata`` instead.
        nonlocal total_stor_size, total_nbytes
        result = _get_object_metadata(obj)
        if result is None:
            return
        dataset_name, nbytes, stor_size = result
        # stor_size is never zero here: _get_object_metadata filters it out.
        ratio = float(nbytes) / float(stor_size)
        total_stor_size += stor_size
        total_nbytes += nbytes
        table.add_row(
            dataset_name, f"{stor_size / bytes_per_mib:.2f}", f"{ratio:.3f}"
        )

    with h5py.File(file_path, mode="r") as h5f:
        h5f.visititems(add_to_table)

    # Guard the division: a file without any accepted dataset would
    # otherwise raise ZeroDivisionError before anything is printed.
    if total_stor_size:
        total_ratio_text = f"{total_nbytes / total_stor_size:.3f}"
    else:
        total_ratio_text = "n/a"

    table.add_row(
        "<total chunked datasets>",
        f"{total_stor_size / bytes_per_mib:.2f}",
        total_ratio_text,
    )

    console = Console()
    console.print(table)
# Command-line entry: every argument is treated as an HDF5 file path.
# With no arguments, print a usage line and exit with an error message.
if len(sys.argv) < 2:
    print(f"Usage: {sys.argv[0]} filepath [filepath [...]]")
    raise SystemExit("No HDF5 file path given")

for file_path in sys.argv[1:]:
    display_compression_ratios(file_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment