Skip to content

Instantly share code, notes, and snippets.

@maartenbreddels
Created October 25, 2017 12:25
Show Gist options
  • Save maartenbreddels/09e1da79577151e5f7fec660c209f06e to your computer and use it in GitHub Desktop.
Save maartenbreddels/09e1da79577151e5f7fec660c209f06e to your computer and use it in GitHub Desktop.
memory mapping hdf5 continuous data
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For continuous arrays there is no need to use the hdf5 library, once the offset, shape and dtype is know, we can close the hdf5 file, open it ourselves and mmap the data. This gives zero overhead in reading the dat."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import h5py\n",
"path = '/Users/users/breddels/.vaex/data/helmi-dezeeuw-2000-10p.hdf5'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"h5file = h5py.File(path, 'r')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"data <HDF5 group \"/data\" (11 members)> group\n",
"data/E <HDF5 dataset \"E\": shape (330000,), type \"<f8\"> dataset\n",
"data/FeH <HDF5 dataset \"FeH\": shape (330000,), type \"<f8\"> dataset\n",
"data/L <HDF5 dataset \"L\": shape (330000,), type \"<f8\"> dataset\n",
"data/Lz <HDF5 dataset \"Lz\": shape (330000,), type \"<f8\"> dataset\n",
"data/random_index <HDF5 dataset \"random_index\": shape (330000,), type \"<i8\"> dataset\n",
"data/vx <HDF5 dataset \"vx\": shape (330000,), type \"<f8\"> dataset\n",
"data/vy <HDF5 dataset \"vy\": shape (330000,), type \"<f8\"> dataset\n",
"data/vz <HDF5 dataset \"vz\": shape (330000,), type \"<f8\"> dataset\n",
"data/x <HDF5 dataset \"x\": shape (330000,), type \"<f8\"> dataset\n",
"data/y <HDF5 dataset \"y\": shape (330000,), type \"<f8\"> dataset\n",
"data/z <HDF5 dataset \"z\": shape (330000,), type \"<f8\"> dataset\n"
]
}
],
"source": [
"# maps from hdf5 path to metadata\n",
"arrays_metadata = {}\n",
"def f(name, item):\n",
" is_dataset = isinstance(item, h5py.Dataset)\n",
" print(name, item, 'dataset' if is_dataset else 'group')\n",
" if is_dataset:\n",
" offset = item.id.get_offset()\n",
" if offset is not None:\n",
" arrays_metadata[name] = dict(offset=offset, shape=item.shape, dtype=item.dtype)\n",
" else:\n",
" print('could not get offset, probably not a continuous array')\n",
"h5file.visititems(f)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# no need to keep this open\n",
"h5file.close()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import mmap\n",
"import numpy as np\n",
"file = open(path, \"rb\")\n",
"fileno = file.fileno()\n",
"mapping = mmap.mmap(fileno, 0, access=mmap.ACCESS_READ)\n",
"def to_array(metadata):\n",
" shape = metadata['shape']\n",
" dtype = metadata['dtype']\n",
" offset = metadata['offset']\n",
" length = np.prod(shape)\n",
" return np.frombuffer(mapping, dtype=dtype, count=length, offset=offset).reshape(shape)\n",
"# map the metadata to a numpy array\n",
"arrays = {name:to_array(metadata) for name, metadata in arrays_metadata.items()}"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'data/E': array([-121238.171875 , -100819.9140625, -100559.9609375, ...,\n",
" -112580.359375 , -74862.90625 , -95361.765625 ]),\n",
" 'data/FeH': array([-2.30922761, -1.78873549, -0.7618109 , ..., -1.93062276,\n",
" -1.22501982, -2.56896369]),\n",
" 'data/L': array([ 831.07995605, 1435.18395996, 1039.2989502 , ..., 1182.4362793 ,\n",
" 1324.59265137, 351.09555054]),\n",
" 'data/Lz': array([ -336.42651367, -828.7567749 , 920.80249023, ..., 115.58557892,\n",
" 1057.01733398, -309.81439209]),\n",
" 'data/random_index': array([1511648, 2728665, 1202632, ..., 374845, 425745, 289364]),\n",
" 'data/vx': array([ 53.276722 , 252.810791 , 96.276474 , ..., 8.46711349,\n",
" 110.221558 , -2.10541415]),\n",
" 'data/vy': array([ 288.386047 , -69.9498444, 226.440201 , ..., -38.2765236,\n",
" -31.3925591, -27.6108856]),\n",
" 'data/vz': array([ -95.2649078 , -56.3121033 , -34.7527161 , ..., -127.541473 ,\n",
" 86.2726822 , 3.80799961]),\n",
" 'data/x': array([ -0.77747077, 3.77427316, 1.3757627 , ..., -1.14041007,\n",
" -14.2985935 , 10.5450506 ]),\n",
" 'data/y': array([ 2.10626292, 2.23387194, -6.3283844 , ..., -8.4957695 ,\n",
" -5.51750422, -8.86106777]),\n",
" 'data/z': array([ 1.93743467, 3.76209331, 2.63250017, ..., 2.25749826,\n",
" -8.65472317, -4.65835428])}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"arrays"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment