ajelenak/h5stat-extra.py

## h5stat-chunks.py
import argparse
import json
import operator
from collections import defaultdict
from dataclasses import dataclass
from functools import partial, reduce
from typing import Union

import h5py
import numpy as np
from tabulate import tabulate


if h5py.h5.get_libversion() < (1, 14, 3):
    raise RuntimeError("Requires HDF5 library 1.14.3 or later")
elif not h5py.h5.get_config().ros3:
    pass
    # raise RuntimeError('HDF5 library must be built with ROS3 virtual file driver')


def get_cli_args():
    """Command-line arguments."""
    parser = argparse.ArgumentParser(
        description="Provide collective dataset chunk stats that h5stat does not do.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("h5file", help="Input HDF5 file name.", type=str)
    parser.add_argument(
        "--show", help="Print individual dataset stats", action="store_true"
    )
    parser.add_argument(
        "--json", help="Format individual dataset stats in JSON", action="store_true"
    )
    return parser.parse_args()


@dataclass(slots=True, frozen=True)
class ChunkStats:
    """Various chunk statistics for one HDF5 dataset."""

    name: str
    num_stored: int
    size: int
    stor_size: int
    min_size: int
    max_size: int
    extent_ratio: float
    page_bins: dict
    page_spread_anomaly: int

    def __post_init__(self):
        if self.extent_ratio > 1:
            raise ValueError(f"Chunk shape ratio greater than 1 for {self.name}")
        if self.page_spread_anomaly < 0:
            raise ValueError(f"Chunk file page spread anomaly negative for {self.name}")

    def to_dict(self):
        d = {
            "dataset": self.name,
            "chunks_stored": self.num_stored,
            "chunk_size": self.size,
            "stored_size": self.stor_size,
            "min_stored_chunk_size": self.min_size,
            "max_stored_chunk_size": self.max_size,
            "chunk_shape_ratio": self.extent_ratio,
        }
        if len(self.page_bins):
            d.update(
                {
                    "file_pages": self.page_bins,
                    "page_spread_anomaly": self.page_spread_anomaly,
                }
            )
        return d


def chunk_to_shape_ratio(chunk: tuple, shape: tuple) -> float:
    """Ratio of chunk to dataset shape extent."""
    ratio = 1
    for c, s in zip(chunk, shape):
        try:
            ratio *= min(1, c / s)
        except ZeroDivisionError:
            # Deal with 1D datasets without data...
            continue
    return ratio


def chunk_size_minmax(dset: h5py.Dataset) -> tuple[int, int]:
    """Find the smallest and largest chunk size for one HDF5 dataset."""
    chunk_sizes = list()

    def chunk_info(chunk_stor):
        chunk_sizes.append(chunk_stor.size)

    dset.id.chunk_iter(chunk_info)

    return min(chunk_sizes), max(chunk_sizes)


def chunk2page(dset: h5py.Dataset, page_size: int) -> dict:
    """Determine file page for each chunk.

    Only for files with "PAGE" file space strategy.
    """
    stinfo = defaultdict(int)

    def chunk_info(chunk_stor):
        start_page = np.floor(chunk_stor.byte_offset / page_size).astype(int) + 1
        end_page = (
            np.floor((chunk_stor.byte_offset + chunk_stor.size - 1) / page_size).astype(
                int
            )
            + 1
        )
        if start_page != end_page:
            raise ValueError(f"Chunk crosses file page boundary: {chunk_stor}")
        stinfo[start_page] += 1

    dset.id.chunk_iter(chunk_info)

    return stinfo


def dset_stats(
    name: str,
    h5obj: Union[h5py.Group, h5py.Dataset],
    dset_list: list[ChunkStats],
    page_size: int = 0,
) -> None:
    if isinstance(h5obj, h5py.Dataset):
        chunk_shape = h5obj.chunks
        if chunk_shape:
            chunk_nelem = reduce(operator.mul, chunk_shape, 1)
            if page_size:
                chunk_page = chunk2page(h5obj, page_size)
                num_chunks = reduce(operator.add, chunk_page.values(), 0)
                stored_size = h5obj.id.get_storage_size()
                page_spread = len(chunk_page) - np.ceil(stored_size / page_size).astype(
                    int
                )
            else:
                num_chunks = h5obj.id.get_num_chunks()
                stored_size = h5obj.id.get_storage_size()
                chunk_page = dict()
                page_spread = 0
            min_size, max_size = chunk_size_minmax(h5obj)
            dset_list.append(
                ChunkStats(
                    name=h5obj.name,
                    num_stored=num_chunks,
                    extent_ratio=chunk_to_shape_ratio(chunk_shape, h5obj.shape),
                    stor_size=stored_size,
                    min_size=min_size,
                    max_size=max_size,
                    size=h5obj.id.get_type().get_size() * chunk_nelem,
                    page_bins=chunk_page,
                    page_spread_anomaly=page_spread,
                )
            )


def chunk_stats_table(
    bin_hdr: str,
    bins: list,
    bin_fmt: Union[str, list[str]],
    stats_hdr: str,
    data: np.ndarray,
) -> str:
    # Calculate the histograms...
    hist, bins_ = np.histogram(data, bins=bins)
    bin_prcnt = 100 * hist / np.sum(hist)
    bin_cumsum_prcnt = 100 * np.cumsum(hist) / np.sum(hist)

    # Headers...
    prcnt_hdr = "% of total\nchunk. datasets"
    cumcum_prcnt_hdr = "cusum % of total\nchunk. datasets"
    # headers = [bin_hdr, stats_hdr, prcnt_hdr, cumcum_prcnt_hdr]

    tablefmt = "grid"
    if isinstance(bin_fmt, list):
        return tabulate(
            {
                bin_hdr: bin_fmt,
                stats_hdr: hist,
                prcnt_hdr: np.round(bin_prcnt, decimals=2),
                cumcum_prcnt_hdr: np.round(bin_cumsum_prcnt, decimals=2),
            },
            headers="keys",
            tablefmt=tablefmt,
        )
    else:
        return tabulate(
            {
                bin_hdr: [
                    f"{bins_[i]:{bin_fmt}} ≤ # < {bins[i+1]:{bin_fmt}}"
                    for i in range(len(bins_) - 1)
                ],
                stats_hdr: hist,
                prcnt_hdr: np.round(bin_prcnt, decimals=2),
                cumcum_prcnt_hdr: np.round(bin_cumsum_prcnt, decimals=2),
            },
            headers="keys",
            tablefmt=tablefmt,
        )


# ---------------------------------------------------------------------------- #

cli = get_cli_args()

dset_info = list()
with h5py.File(cli.h5file, mode="r") as f:
    fcpl = f.id.get_create_plist()
    page = fcpl.get_file_space_strategy()[0] == h5py.h5f.FSPACE_STRATEGY_PAGE
    if page:
        page_size = fcpl.get_file_space_page_size()
    else:
        f.visititems(partial(dset_stats, dset_list=dset_info, page_size=0))

if page and page_size:
    with h5py.File(cli.h5file, mode="r", page_buf_size=4 * page_size) as f:
        f.visititems(partial(dset_stats, dset_list=dset_info, page_size=page_size))

if cli.show:
    if cli.json:
        print(
            json.dumps([_.to_dict() for _ in sorted(dset_info, key=lambda d: d.name)])
        )
    else:
        for _ in sorted(dset_info, key=lambda d: d.name):
            if page:
                print(
                    f"dataset {_.name} stored_size={_.stor_size} chunks_stored={_.num_stored}"
                    f" chunk_size={_.size} chunk_shape_ratio={_.extent_ratio:.6g}"
                    f" file_pages={len(_.page_bins)} page_spread_anomaly={_.page_spread_anomaly}"
                )
            else:
                print(
                    f"dataset {_.name} stored_size={_.stor_size} chunks_stored={_.num_stored}"
                    f" chunk_size={_.size} chunk_shape_ratio={_.extent_ratio:.6g}"
                )
    raise SystemExit()

print(f"\nDataset chunk statistics for {cli.h5file}:")
print(f"Chunked datasets in the file: {len(dset_info)}")
if page:
    print(f'"PAGE" file space strategy with page size of {page_size:,} bytes.')
print("\n")

print(
    chunk_stats_table(
        "Chunk size in bytes",
        [0, 10, 1000, 10000, 100_000, 1_000_000, 10_000_000, np.inf],
        ".0e",
        "# chunked\ndatasets",
        [_.size for _ in dset_info],
    ),
    end="\n\n\n",
)

print(
    chunk_stats_table(
        "Chunk to dataset\nshape ratio",
        [
            0,
            0.001,
            0.002,
            0.003,
            0.004,
            0.005,
            0.01,
            0.02,
            0.03,
            0.04,
            0.05,
            0.1,
            0.25,
            1,
        ],
        ".3f",
        "# chunked\ndatasets",
        [_.extent_ratio for _ in dset_info],
    ),
    end="\n\n\n",
)

print(
    chunk_stats_table(
        "Chunks stored",
        [0, 1, 2, 10, 100, 1000, 10000, 100_000, np.inf],
        [
            "No chunks",
            "1 chunk",
            "2-9 chunks",
            "10-99 chunks",
            "100-999 chunks",
            "1000-9999 chunks",
            "10,000-99,999 chunks",
            "100,000 or more chunks",
        ],
        "# chunked\ndatasets",
        [_.num_stored for _ in dset_info],
    ),
    end="\n\n\n",
)

MiB = 1024 * 1024
print(
    chunk_stats_table(
        "Chunk cache size",
        [0, 1 * MiB, 4 * MiB, 8 * MiB, 16 * MiB, np.inf],
        ["1 MiB", "4 MiB", "8 MiB", "16 MiB", "> 16 MiB"],
        "# chunked\ndatasets",
        [_.size * _.num_stored for _ in dset_info],
    ),
    end="\n\n\n",
)

if page:
    print(
        chunk_stats_table(
            "# of file pages\nholding all chunks",
            [1, 2, 3, 4, 5, 6, 10, 15, 20, 25, 30, np.inf],
            [
                "1 page",
                "2 pages",
                "3 pages",
                "4 pages",
                "5 pages",
                "6 - 9 pages",
                "10 - 14 pages",
                "15 - 19 pages",
                "20 - 24 pages",
                "25 - 29 pages",
                "30 or more pages",
            ],
            "# chunked\ndatasets",
            [len(_.page_bins) for _ in dset_info],
        ),
        end="\n\n\n",
    )

    print(
        chunk_stats_table(
            "# file pages anomaly",
            [0, 1, 2, 3, 4, 5, np.inf],
            [
                "No extra file pages",
                "1 extra file page",
                "2 extra file pages",
                "3 extra file pages",
                "4 extra file pages",
                "5 or more extra file pages",
            ],
            "# chunked\ndatasets",
            [_.page_spread_anomaly for _ in dset_info],
        ),
        end="\n\n\n",
    )

    print(
        chunk_stats_table(
            "Max % of chunks\nin one file page",
            [0, 20, 40, 60, 80, 100],
            ".0f",
            "# chunked\ndatasets",
            [
                max(map(lambda x: 100 * x / _.num_stored, _.page_bins.values()))
                for _ in dset_info
            ],
        ),
        end="\n\n\n",
    )
	import argparse
	import json
	import operator
	from collections import defaultdict
	from dataclasses import dataclass
	from functools import partial, reduce
	from typing import Union

	import h5py
	import numpy as np
	from tabulate import tabulate


	if h5py.h5.get_libversion() < (1, 14, 3):
	raise RuntimeError("Requires HDF5 library 1.14.3 or later")
	elif not h5py.h5.get_config().ros3:
	pass
	# raise RuntimeError('HDF5 library must be built with ROS3 virtual file driver')


	def get_cli_args():
	"""Command-line arguments."""
	parser = argparse.ArgumentParser(
	description="Provide collective dataset chunk stats that h5stat does not do.",
	formatter_class=argparse.ArgumentDefaultsHelpFormatter,
	)
	parser.add_argument("h5file", help="Input HDF5 file name.", type=str)
	parser.add_argument(
	"--show", help="Print individual dataset stats", action="store_true"
	)
	parser.add_argument(
	"--json", help="Format individual dataset stats in JSON", action="store_true"
	)
	return parser.parse_args()


	@dataclass(slots=True, frozen=True)
	class ChunkStats:
	"""Various chunk statistics for one HDF5 dataset."""

	name: str
	num_stored: int
	size: int
	stor_size: int
	min_size: int
	max_size: int
	extent_ratio: float
	page_bins: dict
	page_spread_anomaly: int

	def __post_init__(self):
	if self.extent_ratio > 1:
	raise ValueError(f"Chunk shape ratio greater than 1 for {self.name}")
	if self.page_spread_anomaly < 0:
	raise ValueError(f"Chunk file page spread anomaly negative for {self.name}")

	def to_dict(self):
	d = {
	"dataset": self.name,
	"chunks_stored": self.num_stored,
	"chunk_size": self.size,
	"stored_size": self.stor_size,
	"min_stored_chunk_size": self.min_size,
	"max_stored_chunk_size": self.max_size,
	"chunk_shape_ratio": self.extent_ratio,
	}
	if len(self.page_bins):
	d.update(
	{
	"file_pages": self.page_bins,
	"page_spread_anomaly": self.page_spread_anomaly,
	}
	)
	return d


	def chunk_to_shape_ratio(chunk: tuple, shape: tuple) -> float:
	"""Ratio of chunk to dataset shape extent."""
	ratio = 1
	for c, s in zip(chunk, shape):
	try:
	ratio *= min(1, c / s)
	except ZeroDivisionError:
	# Deal with 1D datasets without data...
	continue
	return ratio


	def chunk_size_minmax(dset: h5py.Dataset) -> tuple[int, int]:
	"""Find the smallest and largest chunk size for one HDF5 dataset."""
	chunk_sizes = list()

	def chunk_info(chunk_stor):
	chunk_sizes.append(chunk_stor.size)

	dset.id.chunk_iter(chunk_info)

	return min(chunk_sizes), max(chunk_sizes)


	def chunk2page(dset: h5py.Dataset, page_size: int) -> dict:
	"""Determine file page for each chunk.

	Only for files with "PAGE" file space strategy.
	"""
	stinfo = defaultdict(int)

	def chunk_info(chunk_stor):
	start_page = np.floor(chunk_stor.byte_offset / page_size).astype(int) + 1
	end_page = (
	np.floor((chunk_stor.byte_offset + chunk_stor.size - 1) / page_size).astype(
	int
	)
	+ 1
	)
	if start_page != end_page:
	raise ValueError(f"Chunk crosses file page boundary: {chunk_stor}")
	stinfo[start_page] += 1

	dset.id.chunk_iter(chunk_info)

	return stinfo


	def dset_stats(
	name: str,
	h5obj: Union[h5py.Group, h5py.Dataset],
	dset_list: list[ChunkStats],
	page_size: int = 0,
	) -> None:
	if isinstance(h5obj, h5py.Dataset):
	chunk_shape = h5obj.chunks
	if chunk_shape:
	chunk_nelem = reduce(operator.mul, chunk_shape, 1)
	if page_size:
	chunk_page = chunk2page(h5obj, page_size)
	num_chunks = reduce(operator.add, chunk_page.values(), 0)
	stored_size = h5obj.id.get_storage_size()
	page_spread = len(chunk_page) - np.ceil(stored_size / page_size).astype(
	int
	)
	else:
	num_chunks = h5obj.id.get_num_chunks()
	stored_size = h5obj.id.get_storage_size()
	chunk_page = dict()
	page_spread = 0
	min_size, max_size = chunk_size_minmax(h5obj)
	dset_list.append(
	ChunkStats(
	name=h5obj.name,
	num_stored=num_chunks,
	extent_ratio=chunk_to_shape_ratio(chunk_shape, h5obj.shape),
	stor_size=stored_size,
	min_size=min_size,
	max_size=max_size,
	size=h5obj.id.get_type().get_size() * chunk_nelem,
	page_bins=chunk_page,
	page_spread_anomaly=page_spread,
	)
	)


	def chunk_stats_table(
	bin_hdr: str,
	bins: list,
	bin_fmt: Union[str, list[str]],
	stats_hdr: str,
	data: np.ndarray,
	) -> str:
	# Calculate the histograms...
	hist, bins_ = np.histogram(data, bins=bins)
	bin_prcnt = 100 * hist / np.sum(hist)
	bin_cumsum_prcnt = 100 * np.cumsum(hist) / np.sum(hist)

	# Headers...
	prcnt_hdr = "% of total\nchunk. datasets"
	cumcum_prcnt_hdr = "cusum % of total\nchunk. datasets"
	# headers = [bin_hdr, stats_hdr, prcnt_hdr, cumcum_prcnt_hdr]

	tablefmt = "grid"
	if isinstance(bin_fmt, list):
	return tabulate(
	{
	bin_hdr: bin_fmt,
	stats_hdr: hist,
	prcnt_hdr: np.round(bin_prcnt, decimals=2),
	cumcum_prcnt_hdr: np.round(bin_cumsum_prcnt, decimals=2),
	},
	headers="keys",
	tablefmt=tablefmt,
	)
	else:
	return tabulate(
	{
	bin_hdr: [
	f"{bins_[i]:{bin_fmt}} ≤ # < {bins[i+1]:{bin_fmt}}"
	for i in range(len(bins_) - 1)
	],
	stats_hdr: hist,
	prcnt_hdr: np.round(bin_prcnt, decimals=2),
	cumcum_prcnt_hdr: np.round(bin_cumsum_prcnt, decimals=2),
	},
	headers="keys",
	tablefmt=tablefmt,
	)


	# ---------------------------------------------------------------------------- #

	cli = get_cli_args()

	dset_info = list()
	with h5py.File(cli.h5file, mode="r") as f:
	fcpl = f.id.get_create_plist()
	page = fcpl.get_file_space_strategy()[0] == h5py.h5f.FSPACE_STRATEGY_PAGE
	if page:
	page_size = fcpl.get_file_space_page_size()
	else:
	f.visititems(partial(dset_stats, dset_list=dset_info, page_size=0))

	if page and page_size:
	with h5py.File(cli.h5file, mode="r", page_buf_size=4 * page_size) as f:
	f.visititems(partial(dset_stats, dset_list=dset_info, page_size=page_size))

	if cli.show:
	if cli.json:
	print(
	json.dumps([_.to_dict() for _ in sorted(dset_info, key=lambda d: d.name)])
	)
	else:
	for _ in sorted(dset_info, key=lambda d: d.name):
	if page:
	print(
	f"dataset {_.name} stored_size={_.stor_size} chunks_stored={_.num_stored}"
	f" chunk_size={_.size} chunk_shape_ratio={_.extent_ratio:.6g}"
	f" file_pages={len(_.page_bins)} page_spread_anomaly={_.page_spread_anomaly}"
	)
	else:
	print(
	f"dataset {_.name} stored_size={_.stor_size} chunks_stored={_.num_stored}"
	f" chunk_size={_.size} chunk_shape_ratio={_.extent_ratio:.6g}"
	)
	raise SystemExit()

	print(f"\nDataset chunk statistics for {cli.h5file}:")
	print(f"Chunked datasets in the file: {len(dset_info)}")
	if page:
	print(f'"PAGE" file space strategy with page size of {page_size:,} bytes.')
	print("\n")

	print(
	chunk_stats_table(
	"Chunk size in bytes",
	[0, 10, 1000, 10000, 100_000, 1_000_000, 10_000_000, np.inf],
	".0e",
	"# chunked\ndatasets",
	[_.size for _ in dset_info],
	),
	end="\n\n\n",
	)

	print(
	chunk_stats_table(
	"Chunk to dataset\nshape ratio",
	[
	0,
	0.001,
	0.002,
	0.003,
	0.004,
	0.005,
	0.01,
	0.02,
	0.03,
	0.04,
	0.05,
	0.1,
	0.25,
	1,
	],
	".3f",
	"# chunked\ndatasets",
	[_.extent_ratio for _ in dset_info],
	),
	end="\n\n\n",
	)

	print(
	chunk_stats_table(
	"Chunks stored",
	[0, 1, 2, 10, 100, 1000, 10000, 100_000, np.inf],
	[
	"No chunks",
	"1 chunk",
	"2-9 chunks",
	"10-99 chunks",
	"100-999 chunks",
	"1000-9999 chunks",
	"10,000-99,999 chunks",
	"100,000 or more chunks",
	],
	"# chunked\ndatasets",
	[_.num_stored for _ in dset_info],
	),
	end="\n\n\n",
	)

	MiB = 1024 * 1024
	print(
	chunk_stats_table(
	"Chunk cache size",
	[0, 1 * MiB, 4 * MiB, 8 * MiB, 16 * MiB, np.inf],
	["1 MiB", "4 MiB", "8 MiB", "16 MiB", "> 16 MiB"],
	"# chunked\ndatasets",
	[_.size * _.num_stored for _ in dset_info],
	),
	end="\n\n\n",
	)

	if page:
	print(
	chunk_stats_table(
	"# of file pages\nholding all chunks",
	[1, 2, 3, 4, 5, 6, 10, 15, 20, 25, 30, np.inf],
	[
	"1 page",
	"2 pages",
	"3 pages",
	"4 pages",
	"5 pages",
	"6 - 9 pages",
	"10 - 14 pages",
	"15 - 19 pages",
	"20 - 24 pages",
	"25 - 29 pages",
	"30 or more pages",
	],
	"# chunked\ndatasets",
	[len(_.page_bins) for _ in dset_info],
	),
	end="\n\n\n",
	)

	print(
	chunk_stats_table(
	"# file pages anomaly",
	[0, 1, 2, 3, 4, 5, np.inf],
	[
	"No extra file pages",
	"1 extra file page",
	"2 extra file pages",
	"3 extra file pages",
	"4 extra file pages",
	"5 or more extra file pages",
	],
	"# chunked\ndatasets",
	[_.page_spread_anomaly for _ in dset_info],
	),
	end="\n\n\n",
	)

	print(
	chunk_stats_table(
	"Max % of chunks\nin one file page",
	[0, 20, 40, 60, 80, 100],
	".0f",
	"# chunked\ndatasets",
	[
	max(map(lambda x: 100 * x / _.num_stored, _.page_bins.values()))
	for _ in dset_info
	],
	),
	end="\n\n\n",
	)