Skip to content

Instantly share code, notes, and snippets.

@rly
Created January 5, 2024 22:10
Show Gist options
  • Save rly/304f853ec6688e5f454c8985af563203 to your computer and use it in GitHub Desktop.
Save rly/304f853ec6688e5f454c8985af563203 to your computer and use it in GitHub Desktop.
Measure the space overhead of creating many HDF5 groups
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Summary of the h5py configuration\n",
"---------------------------------\n",
"\n",
"h5py 3.10.0\n",
"HDF5 1.14.2\n",
"Python 3.11.6 | packaged by conda-forge | (main, Oct 3 2023, 10:37:07) [Clang 15.0.7 ]\n",
"sys.platform darwin\n",
"sys.maxsize 9223372036854775807\n",
"numpy 1.26.0\n",
"cython (built with) 0.29.36\n",
"numpy (built against) 1.23.5\n",
"HDF5 (built against) 1.14.2\n",
"\n",
"link setting: True\n",
"10 groups\n",
"11912 bytes\n",
"1191.2 bytes per group\n",
"\n",
"read time: 0.0002732079883571714 seconds\n",
"read time per group: 2.732079883571714e-05 seconds per group\n",
"link setting: True\n",
"100 groups\n",
"112600 bytes\n",
"1126.0 bytes per group\n",
"\n",
"read time: 0.00022324998280964792 seconds\n",
"read time per group: 2.232499828096479e-06 seconds per group\n",
"link setting: True\n",
"1000 groups\n",
"1135568 bytes\n",
"1135.568 bytes per group\n",
"\n",
"read time: 0.00025691601331345737 seconds\n",
"read time per group: 2.5691601331345734e-07 seconds per group\n",
"link setting: True\n",
"1000000 groups\n",
"1135983064 bytes\n",
"1135.983064 bytes per group\n",
"\n",
"read time: 0.00045754198799841106 seconds\n",
"read time per group: 4.575419879984111e-10 seconds per group\n",
"link setting: False\n",
"10 groups\n",
"8632 bytes\n",
"863.2 bytes per group\n",
"\n",
"read time: 0.0002227500081062317 seconds\n",
"read time per group: 2.2275000810623168e-05 seconds per group\n",
"link setting: False\n",
"100 groups\n",
"79792 bytes\n",
"797.92 bytes per group\n",
"\n",
"read time: 0.00014183297753334045 seconds\n",
"read time per group: 1.4183297753334046e-06 seconds per group\n",
"link setting: False\n",
"1000 groups\n",
"807576 bytes\n",
"807.576 bytes per group\n",
"\n",
"read time: 0.00023925001733005047 seconds\n",
"read time per group: 2.3925001733005045e-07 seconds per group\n",
"link setting: False\n",
"1000000 groups\n",
"807983064 bytes\n",
"807.983064 bytes per group\n",
"\n",
"read time: 0.00045804199180565774 seconds\n",
"read time per group: 4.5804199180565776e-10 seconds per group\n"
]
}
],
"source": [
"import os\n",
"import time\n",
"\n",
"import h5py\n",
"\n",
"print(h5py.version.info)\n",
"\n",
"# Scratch file; it is truncated and rewritten (mode=\"w\") for every configuration below.\n",
"H5_PATH = \"measure_hdf5_group_overhead.h5\"\n",
"\n",
"# Benchmark grid: with/without an extra link inside each group, for several group counts.\n",
"create_link_options = [True, False]\n",
"n_groups_options = [10, 100, 1000, int(1e6)]\n",
"\n",
"for create_link in create_link_options:\n",
"    for n_groups in n_groups_options:\n",
"        print(\"link setting:\", create_link)\n",
"        print(n_groups, \"groups\")\n",
"\n",
"        # write many non-nested groups into the root group\n",
"        with h5py.File(H5_PATH, mode=\"w\") as f:\n",
"            for i in range(n_groups):\n",
"                # reuse the object returned by create_group instead of looking it up again by name\n",
"                group = f.create_group(\"group\" + str(i))\n",
"                if create_link:\n",
"                    # create a link to the root group\n",
"                    group[\"link\"] = f\n",
"\n",
"        # measure the file size (after the file has been closed and flushed)\n",
"        file_size = os.path.getsize(H5_PATH)\n",
"        print(file_size, \"bytes\")\n",
"        print(file_size / n_groups, \"bytes per group\")\n",
"\n",
"        # Time opening the file AND opening every group in it. Opening the file alone\n",
"        # does not touch the groups, so dividing that time by n_groups would not be a\n",
"        # per-group cost.\n",
"        start = time.perf_counter()\n",
"        with h5py.File(H5_PATH, mode=\"r\") as f:\n",
"            for name in f:\n",
"                f[name]  # open the group object; the result is intentionally discarded\n",
"        end = time.perf_counter()\n",
"        read_time = end - start\n",
"        print(\"read time:\", read_time, \"seconds\")\n",
"        print(\"read time per group:\", read_time / n_groups, \"seconds per group\")\n",
"\n",
"        # blank line between configurations\n",
"        print()\n",
"\n",
"# note there may also be I/O costs in terms of time to read/write\n",
"\n",
"# see also HDF5 reference for compact and indexed groups in \"Group implementations in HDF5\"\n",
"# https://docs.hdfgroup.org/hdf5/develop/group___h5_g.html\n",
"# see also https://forum.hdfgroup.org/t/group-overhead-file-size-problem-for-hierarchical-data/9640\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "dev",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment