Additional HDF5 dataset chunk statistics
import argparse
import math
import operator
from collections import defaultdict
from dataclasses import dataclass
from functools import partial, reduce
from typing import Union

import h5py
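
# Requires a recent h5py (at least version 3.9 is recommended) for
# h5py.h5d.DatasetID.chunk_iter().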

@dataclass(slots=True, frozen=True)
class ChunkStats:
    """Various chunk statistics for one HDF5 dataset."""
    name: str
    num_stored: int
    size: int
    stor_size: int
    extent_ratio: float
    page_bins: dict
    page_spread_anomaly: int

    def __post_init__(self):
        if self.extent_ratio > 1:
            raise ValueError(f'Chunk shape ratio greater than 1 for {self.name}')
        if self.page_spread_anomaly < 0:
            raise ValueError(f'Chunk file page spread anomaly negative for {self.name}')


def get_cli_args():
    parser = argparse.ArgumentParser(
        description='Report collective dataset chunk statistics that h5stat does not provide.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('h5file', help='Input HDF5 file name.', type=str)
    parser.add_argument('--show', help='Print individual dataset stats',
                        action='store_true')
    parser.add_argument('--json', help='Format individual dataset stats in JSON',
                        action='store_true')
    return parser.parse_args()


def chunk_to_shape_ratio(chunk: tuple, shape: tuple) -> float:
    """Ratio of chunk to dataset shape extent."""
    ratio = 1
    for c, s in zip(chunk, shape):
        try:
            ratio *= min(1, c / s)
        except ZeroDivisionError:
            # Skip zero-sized dataset dimensions (no data in that dimension).
            continue
    return ratio
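
# Worked example for chunk_to_shape_ratio: a (100, 100) dataset chunked as
# (10, 10) has an extent ratio of (10/100) * (10/100) = 0.01, i.e. each chunk
# covers 1% of the dataset.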


def chunk2page(dset: h5py.Dataset, page_size: int) -> dict:
    """Determine file page for each chunk.

    Only for files with "PAGE" file space strategy.
    """
    def chunk_info(chunk_stor, stinfo):
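        # File pages are numbered starting from 1. A stored chunk is assumed
        # to fit entirely within one file page; anything else is an error.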
        start_page = math.floor(chunk_stor.byte_offset / page_size) + 1
        end_page = math.floor((chunk_stor.byte_offset + chunk_stor.size - 1) / page_size) + 1
        if start_page != end_page:
            raise ValueError(f'Chunk crosses file page boundary: {chunk_stor}')
        stinfo[start_page] += 1

    stinfo = defaultdict(int)
    dset.id.chunk_iter(partial(chunk_info, stinfo=stinfo))
    return stinfo


def dset_stats(
    name: str,
    h5obj: Union[h5py.Group, h5py.Dataset],
    dset_list: list[ChunkStats],
    page_size: int = 0
) -> None:
    if isinstance(h5obj, h5py.Dataset):
        chunk_shape = h5obj.chunks
        if chunk_shape:
            chunk_nelem = reduce(operator.mul, chunk_shape, 1)
            if page_size:
                chunk_page = chunk2page(h5obj, page_size)
                num_chunks = reduce(operator.add, chunk_page.values(), 0)
                stored_size = h5obj.id.get_storage_size()
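                # Anomaly: file pages actually holding chunks minus the
                # minimum number of pages that could hold the stored bytes.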
                page_spread = len(chunk_page) - math.ceil(stored_size / page_size)
            else:
                num_chunks = h5obj.id.get_num_chunks()
                stored_size = h5obj.id.get_storage_size()
                chunk_page = dict()
                page_spread = 0
            dset_list.append(ChunkStats(
                name=h5obj.name,
                num_stored=num_chunks,
                extent_ratio=chunk_to_shape_ratio(chunk_shape, h5obj.shape),
                stor_size=stored_size,
                size=h5obj.id.get_type().get_size() * chunk_nelem,
                page_bins=chunk_page,
                page_spread_anomaly=page_spread
            ))


cli = get_cli_args()

num_chunks = [
    {'count': 0, 'text': '# of datasets with no chunks'},
    {'count': 0, 'text': '# of datasets with 1 chunk'},
    {'count': 0, 'text': '# of datasets with 2 - 10 chunks'},
    {'count': 0, 'text': '# of datasets with 11 - 100 chunks'},
    {'count': 0, 'text': '# of datasets with 101 - 1000 chunks'},
    {'count': 0, 'text': '# of datasets with 1001 - 10_000 chunks'},
    {'count': 0, 'text': '# of datasets with 10_001 - 100_000 chunks'},
    {'count': 0, 'text': '# of datasets with > 100_000 chunks'},
]

chunk_size = [
    {'count': 0, 'text': '# of datasets with chunk sizes 0 - 1000 bytes'},
    {'count': 0, 'text': '# of datasets with chunk sizes 1001 - 10_000 bytes'},
    {'count': 0, 'text': '# of datasets with chunk sizes 10_001 - 100_000 bytes'},
    {'count': 0, 'text': '# of datasets with chunk sizes 100_001 - 1_000_000 bytes'},
    {'count': 0, 'text': '# of datasets with chunk sizes 1_000_001 - 10_000_000 bytes'},
    {'count': 0, 'text': '# of datasets with chunk sizes > 10_000_000 bytes'},
]

chunk_ext = [
    {'count': 0, 'text': '# of datasets with ratio <= 0.1%', 'min_val': 0, 'max_val': 0.001},
    {'count': 0, 'text': '# of datasets with 0.1% < ratio <= 0.2%', 'min_val': 0.001, 'max_val': 0.002},
    {'count': 0, 'text': '# of datasets with 0.2% < ratio <= 0.3%', 'min_val': 0.002, 'max_val': 0.003},
    {'count': 0, 'text': '# of datasets with 0.3% < ratio <= 0.4%', 'min_val': 0.003, 'max_val': 0.004},
    {'count': 0, 'text': '# of datasets with 0.4% < ratio <= 0.5%', 'min_val': 0.004, 'max_val': 0.005},
    {'count': 0, 'text': '# of datasets with 0.5% < ratio <= 1%', 'min_val': 0.005, 'max_val': 0.01},
    {'count': 0, 'text': '# of datasets with 1% < ratio <= 2%', 'min_val': 0.01, 'max_val': 0.02},
    {'count': 0, 'text': '# of datasets with 2% < ratio <= 3%', 'min_val': 0.02, 'max_val': 0.03},
    {'count': 0, 'text': '# of datasets with 3% < ratio <= 4%', 'min_val': 0.03, 'max_val': 0.04},
    {'count': 0, 'text': '# of datasets with 4% < ratio <= 5%', 'min_val': 0.04, 'max_val': 0.05},
    {'count': 0, 'text': '# of datasets with 5% < ratio <= 10%', 'min_val': 0.05, 'max_val': 0.1},
    {'count': 0, 'text': '# of datasets with 10% < ratio <= 25%', 'min_val': 0.1, 'max_val': 0.25},
    {'count': 0, 'text': '# of datasets with ratio > 25%', 'min_val': 0.25, 'max_val': 1},
]

chunk_pages = [
    {'count': 0, 'text': '# of datasets with chunks in 1 file page'},
    {'count': 0, 'text': '# of datasets with chunks in 2 file pages'},
    {'count': 0, 'text': '# of datasets with chunks in 3 file pages'},
    {'count': 0, 'text': '# of datasets with chunks in 4 file pages'},
    {'count': 0, 'text': '# of datasets with chunks in 5 file pages'},
    {'count': 0, 'text': '# of datasets with chunks in >5 file pages'},
]

page_anomaly = [
    {'count': 0, 'text': '# of datasets entirely in 1 file page'},
    {'count': 0, 'text': '# of datasets in 1 extra file page'},
    {'count': 0, 'text': '# of datasets in 2 extra file pages'},
    {'count': 0, 'text': '# of datasets in 3 extra file pages'},
    {'count': 0, 'text': '# of datasets in >3 extra file pages'},
]
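
# Largest share of a dataset's stored chunks found in any single file page.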
most_page = [
    {'count': 0, 'text': '# of datasets in 0 - 10%'},
    {'count': 0, 'text': '# of datasets in 10 - 20%'},
    {'count': 0, 'text': '# of datasets in 20 - 30%'},
    {'count': 0, 'text': '# of datasets in 30 - 40%'},
    {'count': 0, 'text': '# of datasets in 40 - 50%'},
    {'count': 0, 'text': '# of datasets in 50 - 60%'},
    {'count': 0, 'text': '# of datasets in 60 - 70%'},
    {'count': 0, 'text': '# of datasets in 70 - 80%'},
    {'count': 0, 'text': '# of datasets in 80 - 90%'},
    {'count': 0, 'text': '# of datasets in 90 - 100%'},
]

dset_info = list()
with h5py.File(cli.h5file, mode='r') as f:
    fcpl = f.id.get_create_plist()
    page = fcpl.get_file_space_strategy()[0] == h5py.h5f.FSPACE_STRATEGY_PAGE
    if page:
        page_size = fcpl.get_file_space_page_size()
    else:
        f.visititems(partial(dset_stats, dset_list=dset_info, page_size=0))
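
# Paged files are reopened with a page buffer of four file pages for faster
# chunk iteration.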
if page and page_size:
    with h5py.File(cli.h5file, mode='r', page_buf_size=4 * page_size) as f:
        f.visititems(partial(dset_stats, dset_list=dset_info, page_size=page_size))

if cli.show:
    if cli.json:
        print('[')
        flag = False
        for info in sorted(dset_info, key=lambda d: d.name):
            if flag:
                print(',')
            else:
                flag = True
            if page:
                print(f'{{"dataset":"{info.name}","stored_size":{info.stor_size},"chunks_stored":{info.num_stored},'
                      f'"chunk_size":{info.size},"chunk_shape_ratio":{info.extent_ratio:.6g},'
                      f'"file_pages":{len(info.page_bins)},"page_spread_anomaly":{info.page_spread_anomaly}}}', end='')
            else:
                print(f'{{"dataset":"{info.name}","stored_size":{info.stor_size},"chunks_stored":{info.num_stored},'
                      f'"chunk_size":{info.size},"chunk_shape_ratio":{info.extent_ratio:.6g}}}', end='')
        print('\n]')
    else:
        for info in sorted(dset_info, key=lambda d: d.name):
            if page:
                print(f'dataset {info.name} stored_size={info.stor_size} chunks_stored={info.num_stored}'
                      f' chunk_size={info.size} chunk_shape_ratio={info.extent_ratio:.6g}'
                      f' file_pages={len(info.page_bins)} page_spread_anomaly={info.page_spread_anomaly}')
            else:
                print(f'dataset {info.name} stored_size={info.stor_size} chunks_stored={info.num_stored}'
                      f' chunk_size={info.size} chunk_shape_ratio={info.extent_ratio:.6g}')
    raise SystemExit()
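
# Bin each dataset by its number of stored chunks and its nominal chunk size
# using the order of magnitude: math.log10 raises ValueError for zero chunks
# (the first bin) and counts beyond the last bin land there via IndexError.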
for ds_info in dset_info:
    try:
        num_chunks[math.ceil(math.log10(ds_info.num_stored)) + 1]['count'] += 1
    except IndexError:
        num_chunks[-1]['count'] += 1
    except ValueError:
        num_chunks[0]['count'] += 1
    try:
        chunk_size[max(3, math.ceil(math.log10(ds_info.size))) - 3]['count'] += 1
    except IndexError:
        chunk_size[-1]['count'] += 1
    for ce in chunk_ext:
        if ce['min_val'] < ds_info.extent_ratio <= ce['max_val']:
            ce['count'] += 1
            break

if page:
    for ds_info in dset_info:
        if ds_info.num_stored == 0:
            # No stored chunks means no file pages to attribute them to.
            continue
        try:
            chunk_pages[len(ds_info.page_bins) - 1]['count'] += 1
        except IndexError:
            chunk_pages[-1]['count'] += 1
        try:
            page_anomaly[ds_info.page_spread_anomaly]['count'] += 1
        except IndexError:
            page_anomaly[-1]['count'] += 1
        max_prcnt = max(cnt / ds_info.num_stored for cnt in ds_info.page_bins.values())
        try:
            most_page[math.floor(max_prcnt * 10)]['count'] += 1
        except IndexError:
            most_page[-1]['count'] += 1

chunked_dsets = len(dset_info)
for stats in (num_chunks, chunk_size, chunk_ext):
    if sum(s['count'] for s in stats) != chunked_dsets:
        raise ValueError('Sanity check failed: bin counts do not sum to the number of chunked datasets')

print(f'Dataset chunk statistics for {cli.h5file}:')
if page:
    print(f'"PAGE" file space strategy with page size of {page_size:,} bytes.')
print(f'\tTotal chunked datasets: {chunked_dsets}')
print('\tChunk bins:')
for stat in num_chunks:
    print(f"\t\t{stat['text']}: {stat['count']}")
print('\tChunk size bins:')
for stat in chunk_size:
    print(f"\t\t{stat['text']}: {stat['count']}")
print('\tChunk to dataset shape ratio bins:')
for stat in chunk_ext:
    print(f"\t\t{stat['text']}: {stat['count']}")
if page:
    print('\tChunk file pages bins:')
    for stat in chunk_pages:
        print(f"\t\t{stat['text']}: {stat['count']}")
    print('\tChunk file page spread anomaly (extra file pages based on total dataset storage size):')
    for stat in page_anomaly:
        print(f"\t\t{stat['text']}: {stat['count']}")
    print('\tMaximum percentage of chunks in a single file page:')
    for stat in most_page:
        print(f"\t\t{stat['text']}: {stat['count']}")

Updated with three new stats about dataset chunks in files with the PAGE file space strategy.
Added JSON format output and a few bug fixes.
Fixed the JSON output to be compliant.

Requires a recent h5py; at least version 3.9 is recommended. Run the script with --help to see the available options.
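
For example (a minimal sketch; the file and script names below are made up for illustration), a small test file with the PAGE file space strategy can be created with h5py and then fed to the script:

import h5py
import numpy as np

# Create a file with the "PAGE" file space strategy and 4 KiB file pages.
with h5py.File('test_paged.h5', 'w', fs_strategy='page', fs_page_size=4096) as f:
    # One chunked dataset: 100 chunks of 10 x 10 elements each.
    f.create_dataset('data', data=np.arange(10_000).reshape(100, 100),
                     chunks=(10, 10))

# Then run, e.g.: python chunk_stats.py test_paged.h5 --show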