Additional HDF5 dataset chunk statistics
from collections import defaultdict
from typing import Union
from dataclasses import dataclass
import argparse
from functools import partial, reduce
import operator
import math
import h5py
@dataclass(slots=True, frozen=True)
class ChunkStats:
    """Various chunk statistics for one HDF5 dataset."""
    name: str
    num_stored: int
    size: int
    stor_size: int
    extent_ratio: float
    page_bins: dict
    page_spread_anomaly: int

    def __post_init__(self):
        if self.extent_ratio > 1:
            raise ValueError(f'Chunk shape ratio greater than 1 for {self.name}')
        if self.page_spread_anomaly < 0:
            raise ValueError(f'Chunk file page spread anomaly negative for {self.name}')
def get_cli_args():
    parser = argparse.ArgumentParser(
        description='Provide collective dataset chunk stats that h5stat does not do.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('h5file', help='Input HDF5 file name.', type=str)
    parser.add_argument('--show', help='Print individual dataset stats',
                        action='store_true')
    parser.add_argument('--json', help='Format individual dataset stats in JSON',
                        action='store_true')
    return parser.parse_args()
def chunk_to_shape_ratio(chunk: tuple, shape: tuple) -> float:
    """Ratio of chunk to dataset shape extent."""
    ratio = 1
    for c, s in zip(chunk, shape):
        try:
            ratio *= min(1, c / s)
        except ZeroDivisionError:
            # Skip dimensions with zero extent (datasets without data yet)...
            continue
    return ratio
def chunk2page(dset: h5py.Dataset, page_size: int) -> dict:
    """Determine file page for each chunk.

    Only for files with "PAGE" file space strategy.
    """
    def chunk_info(chunk_stor, stinfo):
        # File pages are numbered from 1.
        start_page = math.floor(chunk_stor.byte_offset / page_size) + 1
        end_page = math.floor((chunk_stor.byte_offset + chunk_stor.size - 1) / page_size) + 1
        if start_page != end_page:
            raise ValueError(f'Chunk crosses file page boundary: {chunk_stor}')
        stinfo[start_page] += 1

    stinfo = defaultdict(int)
    dset.id.chunk_iter(partial(chunk_info, stinfo=stinfo))
    return stinfo
def dset_stats(
        name: str,
        h5obj: Union[h5py.Group, h5py.Dataset],
        dset_list: list[ChunkStats],
        page_size: int = 0
) -> None:
    """Collect chunk stats for one dataset; an h5py.File.visititems() callback."""
    if isinstance(h5obj, h5py.Dataset):
        chunk_shape = h5obj.chunks
        if chunk_shape:
            chunk_nelem = reduce(operator.mul, chunk_shape, 1)
            if page_size:
                chunk_page = chunk2page(h5obj, page_size)
                num_chunks = reduce(operator.add, chunk_page.values(), 0)
                stored_size = h5obj.id.get_storage_size()
                page_spread = len(chunk_page) - math.ceil(stored_size / page_size)
            else:
                num_chunks = h5obj.id.get_num_chunks()
                stored_size = h5obj.id.get_storage_size()
                chunk_page = dict()
                page_spread = 0
            dset_list.append(ChunkStats(
                name=h5obj.name,
                num_stored=num_chunks,
                extent_ratio=chunk_to_shape_ratio(chunk_shape, h5obj.shape),
                stor_size=stored_size,
                size=h5obj.id.get_type().get_size() * chunk_nelem,
                page_bins=chunk_page,
                page_spread_anomaly=page_spread
            ))
cli = get_cli_args()
num_chunks = [
    {'count': 0, 'text': '# of datasets with no chunks'},
    {'count': 0, 'text': '# of datasets with 1 chunk'},
    {'count': 0, 'text': '# of datasets with chunks 2 - 10'},
    {'count': 0, 'text': '# of datasets with chunks 11 - 100'},
    {'count': 0, 'text': '# of datasets with chunks 101 - 1000'},
    {'count': 0, 'text': '# of datasets with chunks 1001 - 10_000'},
    {'count': 0, 'text': '# of datasets with chunks 10_001 - 100_000'},
    {'count': 0, 'text': '# of datasets with chunks > 100_000'},
]
chunk_size = [
    {'count': 0, 'text': '# of datasets with chunk sizes 0 - 1000 bytes'},
    {'count': 0, 'text': '# of datasets with chunk sizes 1001 - 10_000 bytes'},
    {'count': 0, 'text': '# of datasets with chunk sizes 10_001 - 100_000 bytes'},
    {'count': 0, 'text': '# of datasets with chunk sizes 100_001 - 1_000_000 bytes'},
    {'count': 0, 'text': '# of datasets with chunk sizes 1_000_001 - 10_000_000 bytes'},
    {'count': 0, 'text': '# of datasets with chunk sizes > 10_000_000 bytes'},
]
chunk_ext = [
    {'count': 0, 'text': '# of datasets with ratio <= 0.1%', 'min_val': 0, 'max_val': 0.001},
    {'count': 0, 'text': '# of datasets with 0.1% < ratio <= 0.2%', 'min_val': 0.001, 'max_val': 0.002},
    {'count': 0, 'text': '# of datasets with 0.2% < ratio <= 0.3%', 'min_val': 0.002, 'max_val': 0.003},
    {'count': 0, 'text': '# of datasets with 0.3% < ratio <= 0.4%', 'min_val': 0.003, 'max_val': 0.004},
    {'count': 0, 'text': '# of datasets with 0.4% < ratio <= 0.5%', 'min_val': 0.004, 'max_val': 0.005},
    {'count': 0, 'text': '# of datasets with 0.5% < ratio <= 1%', 'min_val': 0.005, 'max_val': 0.01},
    {'count': 0, 'text': '# of datasets with 1% < ratio <= 2%', 'min_val': 0.01, 'max_val': 0.02},
    {'count': 0, 'text': '# of datasets with 2% < ratio <= 3%', 'min_val': 0.02, 'max_val': 0.03},
    {'count': 0, 'text': '# of datasets with 3% < ratio <= 4%', 'min_val': 0.03, 'max_val': 0.04},
    {'count': 0, 'text': '# of datasets with 4% < ratio <= 5%', 'min_val': 0.04, 'max_val': 0.05},
    {'count': 0, 'text': '# of datasets with 5% < ratio <= 10%', 'min_val': 0.05, 'max_val': 0.1},
    {'count': 0, 'text': '# of datasets with 10% < ratio <= 25%', 'min_val': 0.1, 'max_val': 0.25},
    {'count': 0, 'text': '# of datasets with ratio > 25%', 'min_val': 0.25, 'max_val': 1},
]
chunk_pages = [
    {'count': 0, 'text': '# of datasets with chunks in 1 file page'},
    {'count': 0, 'text': '# of datasets with chunks in 2 file pages'},
    {'count': 0, 'text': '# of datasets with chunks in 3 file pages'},
    {'count': 0, 'text': '# of datasets with chunks in 4 file pages'},
    {'count': 0, 'text': '# of datasets with chunks in 5 file pages'},
    {'count': 0, 'text': '# of datasets with chunks in >5 file pages'},
]
page_anomaly = [
    {'count': 0, 'text': '# of datasets entirely in 1 file page'},
    {'count': 0, 'text': '# of datasets in 1 extra file page'},
    {'count': 0, 'text': '# of datasets in 2 extra file pages'},
    {'count': 0, 'text': '# of datasets in 3 extra file pages'},
    {'count': 0, 'text': '# of datasets in >3 extra file pages'},
]
most_page = [
    {'count': 0, 'text': '# of datasets in 0 - 10%'},
    {'count': 0, 'text': '# of datasets in 10 - 20%'},
    {'count': 0, 'text': '# of datasets in 20 - 30%'},
    {'count': 0, 'text': '# of datasets in 30 - 40%'},
    {'count': 0, 'text': '# of datasets in 40 - 50%'},
    {'count': 0, 'text': '# of datasets in 50 - 60%'},
    {'count': 0, 'text': '# of datasets in 60 - 70%'},
    {'count': 0, 'text': '# of datasets in 70 - 80%'},
    {'count': 0, 'text': '# of datasets in 80 - 90%'},
    {'count': 0, 'text': '# of datasets in 90 - 100%'},
]
dset_info = list()
with h5py.File(cli.h5file, mode='r') as f:
    fcpl = f.id.get_create_plist()
    page = fcpl.get_file_space_strategy()[0] == h5py.h5f.FSPACE_STRATEGY_PAGE
    if page:
        page_size = fcpl.get_file_space_page_size()
    else:
        f.visititems(partial(dset_stats, dset_list=dset_info, page_size=0))
if page and page_size:
    # Reopen "PAGE" files with a page buffer cache of four file pages for chunk iteration.
    with h5py.File(cli.h5file, mode='r', page_buf_size=4 * page_size) as f:
        f.visititems(partial(dset_stats, dset_list=dset_info, page_size=page_size))
if cli.show:
    if cli.json:
        print('[')
        flag = False
        for _ in sorted(dset_info, key=lambda d: d.name):
            if flag:
                print(',')
            else:
                flag = True
            if page:
                print(f'{{"dataset":"{_.name}","stored_size":{_.stor_size},"chunks_stored":{_.num_stored},'
                      f'"chunk_size":{_.size},"chunk_shape_ratio":{_.extent_ratio:.6g},'
                      f'"file_pages":{len(_.page_bins)},"page_spread_anomaly":{_.page_spread_anomaly}}}', end='')
            else:
                print(f'{{"dataset":"{_.name}","stored_size":{_.stor_size},"chunks_stored":{_.num_stored},'
                      f'"chunk_size":{_.size},"chunk_shape_ratio":{_.extent_ratio:.6g}}}', end='')
        print('\n]')
    else:
        for _ in sorted(dset_info, key=lambda d: d.name):
            if page:
                print(f'dataset {_.name} stored_size={_.stor_size} chunks_stored={_.num_stored}'
                      f' chunk_size={_.size} chunk_shape_ratio={_.extent_ratio:.6g}'
                      f' file_pages={len(_.page_bins)} page_spread_anomaly={_.page_spread_anomaly}')
            else:
                print(f'dataset {_.name} stored_size={_.stor_size} chunks_stored={_.num_stored}'
                      f' chunk_size={_.size} chunk_shape_ratio={_.extent_ratio:.6g}')
    raise SystemExit()
for ds_info in dset_info:
    # Bin by number of stored chunks; bin index follows log10 decades of the count.
    try:
        num_chunks[math.ceil(math.log10(ds_info.num_stored)) + 1]['count'] += 1
    except IndexError:
        num_chunks[-1]['count'] += 1
    except ValueError:
        # math.log10(0): dataset without any stored chunks.
        num_chunks[0]['count'] += 1
    # Bin by chunk size in bytes; decades starting at 1000 bytes.
    try:
        chunk_size[max(3, math.ceil(math.log10(ds_info.size))) - 3]['count'] += 1
    except IndexError:
        chunk_size[-1]['count'] += 1
    # Bin by chunk-to-dataset shape ratio.
    for ce in chunk_ext:
        if ce['min_val'] < ds_info.extent_ratio <= ce['max_val']:
            ce['count'] += 1
            break
if page:
    for ds_info in dset_info:
        try:
            chunk_pages[len(ds_info.page_bins) - 1]['count'] += 1
        except IndexError:
            chunk_pages[-1]['count'] += 1
        try:
            page_anomaly[ds_info.page_spread_anomaly]['count'] += 1
        except IndexError:
            page_anomaly[-1]['count'] += 1
        # Largest share of this dataset's chunks found in any single file page.
        max_prcnt = max([_ / ds_info.num_stored for _ in ds_info.page_bins.values()])
        try:
            most_page[math.floor(max_prcnt * 10)]['count'] += 1
        except IndexError:
            most_page[-1]['count'] += 1
chunked_dsets = len(dset_info)
for bins in (num_chunks, chunk_size, chunk_ext):
    if sum([_['count'] for _ in bins]) != chunked_dsets:
        raise ValueError('Sanity check failed: Number of chunked datasets different')
print(f'Dataset chunk statistics for {cli.h5file}:')
if page:
    print(f'"PAGE" file space strategy with page size of {page_size:,} bytes.')
print(f'\tTotal chunked datasets: {chunked_dsets}')
print('\tChunk bins:')
for _ in num_chunks:
    print(f"\t\t{_['text']}: {_['count']}")
print('\tChunk size bins:')
for _ in chunk_size:
    print(f"\t\t{_['text']}: {_['count']}")
print('\tChunk to dataset shape ratio bins:')
for _ in chunk_ext:
    print(f"\t\t{_['text']}: {_['count']}")
if page:
    print('\tChunk file pages bins:')
    for _ in chunk_pages:
        print(f"\t\t{_['text']}: {_['count']}")
    print('\tChunk file page spread anomaly (extra file pages based on total dataset storage size):')
    for _ in page_anomaly:
        print(f"\t\t{_['text']}: {_['count']}")
    print('\tMaximum percentage of chunks in a single file page:')
    for _ in most_page:
        print(f"\t\t{_['text']}: {_['count']}")

ajelenak commented Sep 5, 2023:

Requires a more recent h5py, recommend at least version 3.9. Run it with --help to see available options.
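
For reference, a typical session could look like the lines below; the script file name chunk_stats.py is only a placeholder for however you saved this gist, and myfile.h5 stands for any HDF5 file:

python -c "import h5py; print(h5py.__version__)"
python chunk_stats.py --help
python chunk_stats.py myfile.h5
python chunk_stats.py --show --json myfile.h5

The first command simply confirms the installed h5py version meets the requirement noted above.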

ajelenak commented:

Updated with three new stats about dataset chunks in files with PAGE file space strategy.
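
For anyone who wants to exercise these page statistics, a small test file using the "PAGE" file space strategy can be created with h5py roughly like this (the file name, page size, and dataset layout below are arbitrary examples, not part of the gist):

import numpy as np
import h5py

# Create a file with the "PAGE" file space strategy and 1 MiB file pages.
with h5py.File('paged-example.h5', 'w', fs_strategy='page', fs_page_size=1024**2) as f:
    f.create_dataset('x', data=np.random.random((1000, 1000)),
                     chunks=(100, 100), dtype='f8')

Running the script on such a file adds the file page bins, page spread anomaly, and single-page percentage sections to the report.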

ajelenak commented:

Added JSON format output and a few bug fixes.
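
The JSON output is convenient for ad hoc filtering. A minimal sketch, again assuming the gist is saved as chunk_stats.py (placeholder name) and run against myfile.h5:

import json
import subprocess

# Collect the per-dataset records printed by --show --json.
result = subprocess.run(
    ['python', 'chunk_stats.py', '--show', '--json', 'myfile.h5'],
    capture_output=True, text=True, check=True)
records = json.loads(result.stdout)

# For example, list the ten datasets with the most stored chunks.
for rec in sorted(records, key=lambda r: r['chunks_stored'], reverse=True)[:10]:
    print(rec['dataset'], rec['chunks_stored'], rec['stored_size'])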

ajelenak commented:

Fix JSON output to be compliant.
