Created
January 5, 2024 22:10
-
-
Save rly/304f853ec6688e5f454c8985af563203 to your computer and use it in GitHub Desktop.
Measure the space overhead of creating many HDF5 groups
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Summary of the h5py configuration\n", | |
"---------------------------------\n", | |
"\n", | |
"h5py 3.10.0\n", | |
"HDF5 1.14.2\n", | |
"Python 3.11.6 | packaged by conda-forge | (main, Oct 3 2023, 10:37:07) [Clang 15.0.7 ]\n", | |
"sys.platform darwin\n", | |
"sys.maxsize 9223372036854775807\n", | |
"numpy 1.26.0\n", | |
"cython (built with) 0.29.36\n", | |
"numpy (built against) 1.23.5\n", | |
"HDF5 (built against) 1.14.2\n", | |
"\n", | |
"link setting: True\n", | |
"10 groups\n", | |
"11912 bytes\n", | |
"1191.2 bytes per group\n", | |
"\n", | |
"read time: 0.0002732079883571714 seconds\n", | |
"read time per group: 2.732079883571714e-05 seconds per group\n", | |
"link setting: True\n", | |
"100 groups\n", | |
"112600 bytes\n", | |
"1126.0 bytes per group\n", | |
"\n", | |
"read time: 0.00022324998280964792 seconds\n", | |
"read time per group: 2.232499828096479e-06 seconds per group\n", | |
"link setting: True\n", | |
"1000 groups\n", | |
"1135568 bytes\n", | |
"1135.568 bytes per group\n", | |
"\n", | |
"read time: 0.00025691601331345737 seconds\n", | |
"read time per group: 2.5691601331345734e-07 seconds per group\n", | |
"link setting: True\n", | |
"1000000 groups\n", | |
"1135983064 bytes\n", | |
"1135.983064 bytes per group\n", | |
"\n", | |
"read time: 0.00045754198799841106 seconds\n", | |
"read time per group: 4.575419879984111e-10 seconds per group\n", | |
"link setting: False\n", | |
"10 groups\n", | |
"8632 bytes\n", | |
"863.2 bytes per group\n", | |
"\n", | |
"read time: 0.0002227500081062317 seconds\n", | |
"read time per group: 2.2275000810623168e-05 seconds per group\n", | |
"link setting: False\n", | |
"100 groups\n", | |
"79792 bytes\n", | |
"797.92 bytes per group\n", | |
"\n", | |
"read time: 0.00014183297753334045 seconds\n", | |
"read time per group: 1.4183297753334046e-06 seconds per group\n", | |
"link setting: False\n", | |
"1000 groups\n", | |
"807576 bytes\n", | |
"807.576 bytes per group\n", | |
"\n", | |
"read time: 0.00023925001733005047 seconds\n", | |
"read time per group: 2.3925001733005045e-07 seconds per group\n", | |
"link setting: False\n", | |
"1000000 groups\n", | |
"807983064 bytes\n", | |
"807.983064 bytes per group\n", | |
"\n", | |
"read time: 0.00045804199180565774 seconds\n", | |
"read time per group: 4.5804199180565776e-10 seconds per group\n" | |
] | |
} | |
], | |
"source": [ | |
"import os\n", | |
"import time\n", | |
"import h5py\n", | |
"print(h5py.version.info)\n", | |
"\n", | |
"create_link_options = [True, False]\n", | |
"n_groups_options = [10, 100, 1000, int(1e6)]\n", | |
"for create_link in create_link_options:\n", | |
" for n_groups in n_groups_options:\n", | |
" print(\"link setting:\", create_link)\n", | |
" print(n_groups, \"groups\")\n", | |
"\n", | |
" # write many non-nested groups into the root group\n", | |
" with h5py.File(\"measure_hdf5_group_overhead.h5\", mode=\"w\") as f:\n", | |
" for i in range(n_groups):\n", | |
" f.create_group(\"group\" + str(i))\n", | |
" if create_link:\n", | |
" # create a link to the root group\n", | |
" f[\"group\" + str(i)][\"link\"] = f\n", | |
"\n", | |
" # measure the file size\n", | |
" file_size = os.path.getsize(\"measure_hdf5_group_overhead.h5\")\n", | |
" print(file_size, \"bytes\")\n", | |
" print(file_size / n_groups, \"bytes per group\")\n", | |
"\n", | |
" start = time.perf_counter()\n", | |
" with h5py.File(\"measure_hdf5_group_overhead.h5\", mode=\"r\") as f:\n", | |
" pass\n", | |
" end = time.perf_counter()\n", | |
" print(\"read time:\", end - start, \"seconds\")\n", | |
" print(\"read time per group:\", (end - start) / n_groups, \"seconds per group\")\n", | |
"\n", | |
" print()\n", | |
"\n", | |
"# note there may also be I/O costs in terms of time to read/write\n", | |
"\n", | |
"# see also HDF5 reference for compact and indexed groups in \"Group implementations in HDF5\"\n", | |
"# https://docs.hdfgroup.org/hdf5/develop/group___h5_g.html\n", | |
"# see also https://forum.hdfgroup.org/t/group-overhead-file-size-problem-for-hierarchical-data/9640\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "dev", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment