maartenbreddels/h5map.ipynb

## h5map.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For contiguous arrays there is no need to use the HDF5 library: once the offset, shape and dtype are known, we can close the HDF5 file, open it ourselves and mmap the data. This gives zero overhead when reading the data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import h5py\n",
    "path = '/Users/users/breddels/.vaex/data/helmi-dezeeuw-2000-10p.hdf5'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "h5file = h5py.File(path, 'r')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data <HDF5 group \"/data\" (11 members)> group\n",
      "data/E <HDF5 dataset \"E\": shape (330000,), type \"<f8\"> dataset\n",
      "data/FeH <HDF5 dataset \"FeH\": shape (330000,), type \"<f8\"> dataset\n",
      "data/L <HDF5 dataset \"L\": shape (330000,), type \"<f8\"> dataset\n",
      "data/Lz <HDF5 dataset \"Lz\": shape (330000,), type \"<f8\"> dataset\n",
      "data/random_index <HDF5 dataset \"random_index\": shape (330000,), type \"<i8\"> dataset\n",
      "data/vx <HDF5 dataset \"vx\": shape (330000,), type \"<f8\"> dataset\n",
      "data/vy <HDF5 dataset \"vy\": shape (330000,), type \"<f8\"> dataset\n",
      "data/vz <HDF5 dataset \"vz\": shape (330000,), type \"<f8\"> dataset\n",
      "data/x <HDF5 dataset \"x\": shape (330000,), type \"<f8\"> dataset\n",
      "data/y <HDF5 dataset \"y\": shape (330000,), type \"<f8\"> dataset\n",
      "data/z <HDF5 dataset \"z\": shape (330000,), type \"<f8\"> dataset\n"
     ]
    }
   ],
   "source": [
    "# hdf5 dataset path -> dict(offset, shape, dtype), filled in by the visitor below\n",
    "arrays_metadata = dict()\n",
    "def f(name, item):\n",
    "    \"\"\"Visitor for h5file.visititems: print every item and record datasets that expose a file offset.\"\"\"\n",
    "    kind = 'dataset' if isinstance(item, h5py.Dataset) else 'group'\n",
    "    print(name, item, kind)\n",
    "    if kind == 'group':\n",
    "        return\n",
    "    byte_offset = item.id.get_offset()\n",
    "    if byte_offset is None:\n",
    "        # no offset available, so there is nothing to mmap for this dataset\n",
    "        print('could not get offset, probably not a continuous array')\n",
    "        return\n",
    "    arrays_metadata[name] = {'offset': byte_offset, 'shape': item.shape, 'dtype': item.dtype}\n",
    "h5file.visititems(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# no need to keep this open\n",
    "h5file.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import mmap\n",
    "import numpy as np\n",
    "# The file object is only needed while the mapping is created, so close it right after;\n",
    "# the mapping itself is kept open because the arrays built below are views onto it.\n",
    "with open(path, \"rb\") as file:\n",
    "    fileno = file.fileno()\n",
    "    mapping = mmap.mmap(fileno, 0, access=mmap.ACCESS_READ)\n",
    "def to_array(metadata):\n",
    "    \"\"\"Return a numpy array that views the region of `mapping` described by `metadata`.\n",
    "\n",
    "    metadata is a dict with the keys 'offset' (start position passed to np.frombuffer),\n",
    "    'shape' and 'dtype'. The result is reshaped to 'shape'.\n",
    "    \"\"\"\n",
    "    shape = metadata['shape']\n",
    "    dtype = metadata['dtype']\n",
    "    offset = metadata['offset']\n",
    "    # np.prod(()) is the float 1.0, which is not a valid integer count for frombuffer;\n",
    "    # force a Python int and accumulate in int64 so large shapes cannot overflow\n",
    "    length = int(np.prod(shape, dtype=np.int64))\n",
    "    return np.frombuffer(mapping, dtype=dtype, count=length, offset=offset).reshape(shape)\n",
    "# map the metadata to a numpy array\n",
    "arrays = {name: to_array(metadata) for name, metadata in arrays_metadata.items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'data/E': array([-121238.171875 , -100819.9140625, -100559.9609375, ...,\n",
       "        -112580.359375 ,  -74862.90625  ,  -95361.765625 ]),\n",
       " 'data/FeH': array([-2.30922761, -1.78873549, -0.7618109 , ..., -1.93062276,\n",
       "        -1.22501982, -2.56896369]),\n",
       " 'data/L': array([  831.07995605,  1435.18395996,  1039.2989502 , ...,  1182.4362793 ,\n",
       "         1324.59265137,   351.09555054]),\n",
       " 'data/Lz': array([ -336.42651367,  -828.7567749 ,   920.80249023, ...,   115.58557892,\n",
       "         1057.01733398,  -309.81439209]),\n",
       " 'data/random_index': array([1511648, 2728665, 1202632, ...,  374845,  425745,  289364]),\n",
       " 'data/vx': array([  53.276722  ,  252.810791  ,   96.276474  , ...,    8.46711349,\n",
       "         110.221558  ,   -2.10541415]),\n",
       " 'data/vy': array([ 288.386047 ,  -69.9498444,  226.440201 , ...,  -38.2765236,\n",
       "         -31.3925591,  -27.6108856]),\n",
       " 'data/vz': array([ -95.2649078 ,  -56.3121033 ,  -34.7527161 , ..., -127.541473  ,\n",
       "          86.2726822 ,    3.80799961]),\n",
       " 'data/x': array([ -0.77747077,   3.77427316,   1.3757627 , ...,  -1.14041007,\n",
       "        -14.2985935 ,  10.5450506 ]),\n",
       " 'data/y': array([ 2.10626292,  2.23387194, -6.3283844 , ..., -8.4957695 ,\n",
       "        -5.51750422, -8.86106777]),\n",
       " 'data/z': array([ 1.93743467,  3.76209331,  2.63250017, ...,  2.25749826,\n",
       "        -8.65472317, -4.65835428])}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "arrays"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"For contiguous arrays there is no need to use the HDF5 library: once the offset, shape and dtype are known, we can close the HDF5 file, open it ourselves and mmap the data. This gives zero overhead when reading the data."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import h5py\n",
	"path = '/Users/users/breddels/.vaex/data/helmi-dezeeuw-2000-10p.hdf5'"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"h5file = h5py.File(path, 'r')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"data <HDF5 group \"/data\" (11 members)> group\n",
	"data/E <HDF5 dataset \"E\": shape (330000,), type \"<f8\"> dataset\n",
	"data/FeH <HDF5 dataset \"FeH\": shape (330000,), type \"<f8\"> dataset\n",
	"data/L <HDF5 dataset \"L\": shape (330000,), type \"<f8\"> dataset\n",
	"data/Lz <HDF5 dataset \"Lz\": shape (330000,), type \"<f8\"> dataset\n",
	"data/random_index <HDF5 dataset \"random_index\": shape (330000,), type \"<i8\"> dataset\n",
	"data/vx <HDF5 dataset \"vx\": shape (330000,), type \"<f8\"> dataset\n",
	"data/vy <HDF5 dataset \"vy\": shape (330000,), type \"<f8\"> dataset\n",
	"data/vz <HDF5 dataset \"vz\": shape (330000,), type \"<f8\"> dataset\n",
	"data/x <HDF5 dataset \"x\": shape (330000,), type \"<f8\"> dataset\n",
	"data/y <HDF5 dataset \"y\": shape (330000,), type \"<f8\"> dataset\n",
	"data/z <HDF5 dataset \"z\": shape (330000,), type \"<f8\"> dataset\n"
	]
	}
	],
	"source": [
	"# hdf5 dataset path -> dict(offset, shape, dtype), filled in by the visitor below\n",
	"arrays_metadata = {}\n",
	"def f(name, item):\n",
	"    \"\"\"Visitor for h5file.visititems: print every item and record datasets that expose a file offset.\"\"\"\n",
	"    kind = 'dataset' if isinstance(item, h5py.Dataset) else 'group'\n",
	"    print(name, item, kind)\n",
	"    if kind == 'group':\n",
	"        return\n",
	"    byte_offset = item.id.get_offset()\n",
	"    if byte_offset is None:\n",
	"        # no offset available, so there is nothing to mmap for this dataset\n",
	"        print('could not get offset, probably not a continuous array')\n",
	"        return\n",
	"    arrays_metadata[name] = dict(offset=byte_offset, shape=item.shape, dtype=item.dtype)\n",
	"h5file.visititems(f)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# no need to keep this open\n",
	"h5file.close()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import mmap\n",
	"import numpy as np\n",
	"# The file object is only needed while the mapping is created, so close it right after;\n",
	"# the mapping itself is kept open because the arrays built below are views onto it.\n",
	"with open(path, \"rb\") as file:\n",
	"    fileno = file.fileno()\n",
	"    mapping = mmap.mmap(fileno, 0, access=mmap.ACCESS_READ)\n",
	"def to_array(metadata):\n",
	"    \"\"\"Return a numpy array that views the region of `mapping` described by `metadata`.\n",
	"\n",
	"    metadata is a dict with the keys 'offset' (start position passed to np.frombuffer),\n",
	"    'shape' and 'dtype'. The result is reshaped to 'shape'.\n",
	"    \"\"\"\n",
	"    shape = metadata['shape']\n",
	"    dtype = metadata['dtype']\n",
	"    offset = metadata['offset']\n",
	"    # np.prod(()) is the float 1.0, which is not a valid integer count for frombuffer;\n",
	"    # force a Python int and accumulate in int64 so large shapes cannot overflow\n",
	"    length = int(np.prod(shape, dtype=np.int64))\n",
	"    return np.frombuffer(mapping, dtype=dtype, count=length, offset=offset).reshape(shape)\n",
	"# map the metadata to a numpy array\n",
	"arrays = {name: to_array(metadata) for name, metadata in arrays_metadata.items()}"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'data/E': array([-121238.171875 , -100819.9140625, -100559.9609375, ...,\n",
	" -112580.359375 , -74862.90625 , -95361.765625 ]),\n",
	" 'data/FeH': array([-2.30922761, -1.78873549, -0.7618109 , ..., -1.93062276,\n",
	" -1.22501982, -2.56896369]),\n",
	" 'data/L': array([ 831.07995605, 1435.18395996, 1039.2989502 , ..., 1182.4362793 ,\n",
	" 1324.59265137, 351.09555054]),\n",
	" 'data/Lz': array([ -336.42651367, -828.7567749 , 920.80249023, ..., 115.58557892,\n",
	" 1057.01733398, -309.81439209]),\n",
	" 'data/random_index': array([1511648, 2728665, 1202632, ..., 374845, 425745, 289364]),\n",
	" 'data/vx': array([ 53.276722 , 252.810791 , 96.276474 , ..., 8.46711349,\n",
	" 110.221558 , -2.10541415]),\n",
	" 'data/vy': array([ 288.386047 , -69.9498444, 226.440201 , ..., -38.2765236,\n",
	" -31.3925591, -27.6108856]),\n",
	" 'data/vz': array([ -95.2649078 , -56.3121033 , -34.7527161 , ..., -127.541473 ,\n",
	" 86.2726822 , 3.80799961]),\n",
	" 'data/x': array([ -0.77747077, 3.77427316, 1.3757627 , ..., -1.14041007,\n",
	" -14.2985935 , 10.5450506 ]),\n",
	" 'data/y': array([ 2.10626292, 2.23387194, -6.3283844 , ..., -8.4957695 ,\n",
	" -5.51750422, -8.86106777]),\n",
	" 'data/z': array([ 1.93743467, 3.76209331, 2.63250017, ..., 2.25749826,\n",
	" -8.65472317, -4.65835428])}"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"arrays"
	]
	}
	],
	"metadata": {
	"anaconda-cloud": {},
	"kernelspec": {
	"display_name": "Python [default]",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}