Created
December 4, 2023 21:03
-
-
Save scottstanie/b1508ee1c24427ccdc0ac967aaab8569 to your computer and use it in GitHub Desktop.
Pretty print the size on disk + compression ratio of HDF5 datasets with h5py and rich
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# requirements: h5py, rich
import sys | |
import h5py | |
from rich.console import Console | |
from rich.table import Table | |
def _get_object_metadata(obj) -> tuple[str, float, float] | None:
    """Return ``(name, nbytes, storage_size)`` for a chunked, filtered dataset.

    ``None`` is returned for anything else: objects that are not an
    ``h5py.Dataset``, datasets without chunking, datasets whose creation
    property list reports zero filters, and datasets whose reported storage
    size is zero.
    """
    if not isinstance(obj, h5py.Dataset) or obj.chunks is None:
        return None
    if not obj.id.get_create_plist().get_nfilters():
        return None
    size_on_disk = obj.id.get_storage_size()
    if size_on_disk == 0:
        # Skipping zero-size storage keeps callers from dividing by zero.
        return None
    return obj.name, obj.nbytes, size_on_disk
def display_compression_ratios(file_path):
    """Print a table of storage size and compression ratio per dataset.

    Every object in the HDF5 file at ``file_path`` (opened read-only) is
    visited; each dataset for which ``_get_object_metadata`` returns a result
    gets one row showing its storage size in MiB and the ratio of its
    in-memory byte count to its storage size. A final
    ``<total chunked datasets>`` row aggregates all listed datasets.

    If no dataset qualifies, the totals row shows a storage size of 0.00 and
    ``N/A`` for the ratio instead of raising ``ZeroDivisionError``.

    The table is rendered to the terminal with a ``rich`` ``Console``.
    """
    bytes_per_mib = 2**20

    table = Table(title=f"Compression Ratios for HDF5 File: {file_path}")
    table.add_column("Dataset", justify="left")
    table.add_column("Storage Size (MiB)", justify="right")
    table.add_column("Compression Ratio", justify="right")

    total_stor_size = 0
    total_nbytes = 0

    with h5py.File(file_path, mode="r") as h5f:

        def add_to_table(name, obj):
            # Callback for visititems; returning None lets the walk continue.
            nonlocal total_stor_size, total_nbytes
            result = _get_object_metadata(obj)
            if result is None:
                return
            dataset_name, nbytes, stor_size = result
            total_stor_size += stor_size
            total_nbytes += nbytes
            table.add_row(
                dataset_name,
                f"{stor_size / bytes_per_mib:.2f}",
                f"{nbytes / stor_size:.3f}",
            )

        h5f.visititems(add_to_table)

    # With no qualifying datasets the total storage is 0, so the ratio is
    # undefined; report that explicitly rather than dividing by zero.
    if total_stor_size:
        total_ratio_text = f"{total_nbytes / total_stor_size:.3f}"
    else:
        total_ratio_text = "N/A"

    table.add_row(
        "<total chunked datasets>",
        f"{total_stor_size / bytes_per_mib:.2f}",
        total_ratio_text,
    )
    console = Console()
    console.print(table)
# Script entry: every command-line argument is treated as an HDF5 file path.
if not sys.argv[1:]:
    # No paths supplied: show usage, then exit with an error message.
    print(f"Usage: {sys.argv[0]} filepath [filepath [...]]")
    raise SystemExit("No HDF5 file path given")

for file_path in sys.argv[1:]:
    display_compression_ratios(file_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment