kavvkon/test_hdf.ipynb

## test_hdf.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2000-01-01</th>\n",
       "      <td>0.952466</td>\n",
       "      <td>0.630784</td>\n",
       "      <td>0.985323</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-02</th>\n",
       "      <td>0.198525</td>\n",
       "      <td>0.526136</td>\n",
       "      <td>0.086640</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-03</th>\n",
       "      <td>0.189863</td>\n",
       "      <td>0.559567</td>\n",
       "      <td>0.771268</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-04</th>\n",
       "      <td>0.236881</td>\n",
       "      <td>0.203953</td>\n",
       "      <td>0.509006</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-05</th>\n",
       "      <td>0.324636</td>\n",
       "      <td>0.143960</td>\n",
       "      <td>0.103177</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   A         B         C\n",
       "2000-01-01  0.952466  0.630784  0.985323\n",
       "2000-01-02  0.198525  0.526136  0.086640\n",
       "2000-01-03  0.189863  0.559567  0.771268\n",
       "2000-01-04  0.236881  0.203953  0.509006\n",
       "2000-01-05  0.324636  0.143960  0.103177"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "index=pd.DatetimeIndex(start='01/01/2000',end='01/05/2000',freq='d')\n",
    "a = np.random.rand(5,3)\n",
    "\n",
    "df = pd.DataFrame(a, columns=('A','B','C'),index=index)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "http://pandas.pydata.org/pandas-docs/stable/io.html#io-hdf5\n",
    "\n",
    "There are two ways to store data: \n",
    "* fixed format is neither appendable nor queryable, but it is pretty fast\n",
    "* table format is shaped like a dataframe: you can append to it, query it, and read it from an HDF viewer, but it is slower"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.io.pytables.HDFStore'>\n",
      "File path: foo.h5\n",
      "/test1            frame_table  (typ->appendable,nrows->5,ncols->3,indexers->[index],dc->[A,B,C])\n",
      "/test2            frame        (shape->[5,3])                                                   \n",
      "/test3            frame        (shape->[5,3])                                                   \n",
      "/test4            frame_table  (typ->appendable,nrows->5,ncols->3,indexers->[index])            \n",
      "/test5            frame_table  (typ->appendable,nrows->5,ncols->3,indexers->[index],dc->[A,B,C])\n"
     ]
    }
   ],
   "source": [
    "store = pd.HDFStore('foo.h5')\n",
    "store.append('test1', df, data_columns = df.columns)\n",
    "store.put('test2', df, data_columns = df.columns)\n",
    "df.to_hdf(store, 'test3')  # default format='fixed'\n",
    "df.to_hdf(store, 'test4', format='table') # don't use this: without data_columns, all columns are merged together\n",
    "df.to_hdf(store, 'test5', format='table', data_columns=True)\n",
    "\n",
    "print store\n",
    "store.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2000-01-01</th>\n",
       "      <td>0.952466</td>\n",
       "      <td>0.630784</td>\n",
       "      <td>0.985323</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-02</th>\n",
       "      <td>0.198525</td>\n",
       "      <td>0.526136</td>\n",
       "      <td>0.086640</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-03</th>\n",
       "      <td>0.189863</td>\n",
       "      <td>0.559567</td>\n",
       "      <td>0.771268</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-04</th>\n",
       "      <td>0.236881</td>\n",
       "      <td>0.203953</td>\n",
       "      <td>0.509006</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-05</th>\n",
       "      <td>0.324636</td>\n",
       "      <td>0.143960</td>\n",
       "      <td>0.103177</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   A         B         C\n",
       "2000-01-01  0.952466  0.630784  0.985323\n",
       "2000-01-02  0.198525  0.526136  0.086640\n",
       "2000-01-03  0.189863  0.559567  0.771268\n",
       "2000-01-04  0.236881  0.203953  0.509006\n",
       "2000-01-05  0.324636  0.143960  0.103177"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df5 = pd.read_hdf('foo.h5', key='test5')\n",
    "df5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2000-01-01</th>\n",
       "      <td>0.952466</td>\n",
       "      <td>0.630784</td>\n",
       "      <td>0.985323</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-02</th>\n",
       "      <td>0.198525</td>\n",
       "      <td>0.526136</td>\n",
       "      <td>0.086640</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-03</th>\n",
       "      <td>0.189863</td>\n",
       "      <td>0.559567</td>\n",
       "      <td>0.771268</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-04</th>\n",
       "      <td>0.236881</td>\n",
       "      <td>0.203953</td>\n",
       "      <td>0.509006</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-05</th>\n",
       "      <td>0.324636</td>\n",
       "      <td>0.143960</td>\n",
       "      <td>0.103177</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   A         B         C\n",
       "2000-01-01  0.952466  0.630784  0.985323\n",
       "2000-01-02  0.198525  0.526136  0.086640\n",
       "2000-01-03  0.189863  0.559567  0.771268\n",
       "2000-01-04  0.236881  0.203953  0.509006\n",
       "2000-01-05  0.324636  0.143960  0.103177"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df2 = pd.read_hdf('foo.h5', key='test2')\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2000-01-01</th>\n",
       "      <td>0.952466</td>\n",
       "      <td>0.630784</td>\n",
       "      <td>0.985323</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   A         B         C\n",
       "2000-01-01  0.952466  0.630784  0.985323"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df5 = pd.read_hdf('foo.h5', key='test5', where=['A>0.5'])\n",
    "df5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "cannot pass a where specification when reading from a Fixed format store. this store must be selected in its entirety\n"
     ]
    }
   ],
   "source": [
    "# raises an error, as a fixed-format store cannot be queried\n",
    "\n",
    "try:\n",
    "    df2 = pd.read_hdf('foo.h5', key='test2', where=['A>0.5'])\n",
    "except Exception as e:\n",
    "    print e"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `h5py` library is a purer, lower-level library for storing and writing HDF5 data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<HDF5 file \"foo.h5\" (mode r+)>\n",
      "[<HDF5 group \"/test1\" (2 members)>, <HDF5 group \"/test2\" (4 members)>, <HDF5 group \"/test3\" (4 members)>, <HDF5 group \"/test4\" (2 members)>, <HDF5 group \"/test5\" (2 members)>]\n"
     ]
    }
   ],
   "source": [
    "import h5py\n",
    "f = h5py.File(\"foo.h5\")\n",
    "print f\n",
    "print f.values()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "f['test6'] = a\n",
    "f.attrs['test6'] = 'cool array' # stores an attribute on the file object, under the key 'test6'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 0.95246551  0.63078389  0.98532322]\n",
      " [ 0.19852491  0.526136    0.08663984]\n",
      " [ 0.18986281  0.55956696  0.77126847]\n",
      " [ 0.23688076  0.20395321  0.50900627]\n",
      " [ 0.32463634  0.14395971  0.10317721]]\n"
     ]
    }
   ],
   "source": [
    "print f['test6'][...]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2000-01-01</th>\n",
       "      <td>0.952466</td>\n",
       "      <td>0.630784</td>\n",
       "      <td>0.985323</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-02</th>\n",
       "      <td>0.198525</td>\n",
       "      <td>0.526136</td>\n",
       "      <td>0.086640</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-03</th>\n",
       "      <td>0.189863</td>\n",
       "      <td>0.559567</td>\n",
       "      <td>0.771268</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-04</th>\n",
       "      <td>0.236881</td>\n",
       "      <td>0.203953</td>\n",
       "      <td>0.509006</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-05</th>\n",
       "      <td>0.324636</td>\n",
       "      <td>0.143960</td>\n",
       "      <td>0.103177</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   0         1         2\n",
       "2000-01-01  0.952466  0.630784  0.985323\n",
       "2000-01-02  0.198525  0.526136  0.086640\n",
       "2000-01-03  0.189863  0.559567  0.771268\n",
       "2000-01-04  0.236881  0.203953  0.509006\n",
       "2000-01-05  0.324636  0.143960  0.103177"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame(f['test6'][...], index=index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.95246551,  0.63078389,  0.98532322])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "f['test6'][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.95246550876814351"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "f['test6'][0,0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test1\n",
      "test2\n",
      "test3\n",
      "test4\n",
      "test5\n",
      "test6\n"
     ]
    }
   ],
   "source": [
    "for name in f:\n",
    "    print name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(u'axis0', <HDF5 dataset \"axis0\": shape (3,), type \"|S1\">),\n",
       " (u'axis1', <HDF5 dataset \"axis1\": shape (5,), type \"<i8\">),\n",
       " (u'block0_items', <HDF5 dataset \"block0_items\": shape (3,), type \"|S1\">),\n",
       " (u'block0_values', <HDF5 dataset \"block0_values\": shape (5, 3), type \"<f8\">)]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# data stored by pandas (e.g. with to_hdf) cannot easily be parsed with h5py, as it is kept in a custom layout with a lot of metadata\n",
    "f['test2'].items()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "f = h5py.File(\"foo.h5\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "f['test7'] = a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "f['test7'].dims[0].label = 'time'\n",
    "f['test7'].dims[1].label = 'y'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([946684800000000000, 946771200000000000, 946857600000000000,\n",
       "       946944000000000000, 947030400000000000], dtype=int64)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "index.values.astype('<i8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "f['index7'] = index.values.astype('<i8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "f['test7'].dims.create_scale(f['index7'])\n",
    "f['test7'].dims[0].attach_scale(f['index7'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[u'time', u'y']"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[dim.label for dim in f['test7'].dims]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[u'']"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "f['test7'].dims[0].keys()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3D dimension scales\n",
    "http://docs.h5py.org/en/latest/high/dims.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "f['test8'] = np.ones((4, 3, 2), 'f')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "f['test8'].dims[0].label = 'z'\n",
    "f['test8'].dims[2].label = 'x'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "f['x1'] = [1, 2]\n",
    "f['x2'] = [1, 1.1]\n",
    "f['y1'] = [0, 1, 2]\n",
    "f['z1'] = [0, 1, 4, 9]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "f['test8'].dims.create_scale(f['x1'])\n",
    "f['test8'].dims.create_scale(f['x2'], 'x2 name')\n",
    "f['test8'].dims.create_scale(f['y1'], 'y1 name')\n",
    "f['test8'].dims.create_scale(f['z1'], 'z1 name')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "f['test8'].dims[0].attach_scale(f['z1'])\n",
    "f['test8'].dims[1].attach_scale(f['y1'])\n",
    "f['test8'].dims[2].attach_scale(f['x1'])\n",
    "f['test8'].dims[2].attach_scale(f['x2'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[u'z', u'', u'x']"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[dim.label for dim in f['test8'].dims]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[u'', u'x2 name']"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "f['test8'].dims[2].keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<HDF5 dataset \"x2\": shape (2,), type \"<f8\">"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "f['test8'].dims[2]['x2 name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "f['test8'].dims[2]['x2 name'] == f['x2']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import pandas as pd"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>A</th>\n",
	" <th>B</th>\n",
	" <th>C</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>2000-01-01</th>\n",
	" <td>0.952466</td>\n",
	" <td>0.630784</td>\n",
	" <td>0.985323</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-02</th>\n",
	" <td>0.198525</td>\n",
	" <td>0.526136</td>\n",
	" <td>0.086640</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-03</th>\n",
	" <td>0.189863</td>\n",
	" <td>0.559567</td>\n",
	" <td>0.771268</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-04</th>\n",
	" <td>0.236881</td>\n",
	" <td>0.203953</td>\n",
	" <td>0.509006</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-05</th>\n",
	" <td>0.324636</td>\n",
	" <td>0.143960</td>\n",
	" <td>0.103177</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" A B C\n",
	"2000-01-01 0.952466 0.630784 0.985323\n",
	"2000-01-02 0.198525 0.526136 0.086640\n",
	"2000-01-03 0.189863 0.559567 0.771268\n",
	"2000-01-04 0.236881 0.203953 0.509006\n",
	"2000-01-05 0.324636 0.143960 0.103177"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"index=pd.DatetimeIndex(start='01/01/2000',end='01/05/2000',freq='d')\n",
	"a = np.random.rand(5,3)\n",
	"\n",
	"df = pd.DataFrame(a, columns=('A','B','C'),index=index)\n",
	"df"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"http://pandas.pydata.org/pandas-docs/stable/io.html#io-hdf5\n",
	"\n",
	"There are two ways to store data: \n",
	"* fixed format is neither appendable nor queryable, but it is pretty fast\n",
	"* table format is shaped like a dataframe: you can append to it, query it, and read it from an HDF viewer, but it is slower"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"<class 'pandas.io.pytables.HDFStore'>\n",
	"File path: foo.h5\n",
	"/test1 frame_table (typ->appendable,nrows->5,ncols->3,indexers->[index],dc->[A,B,C])\n",
	"/test2 frame (shape->[5,3]) \n",
	"/test3 frame (shape->[5,3]) \n",
	"/test4 frame_table (typ->appendable,nrows->5,ncols->3,indexers->[index]) \n",
	"/test5 frame_table (typ->appendable,nrows->5,ncols->3,indexers->[index],dc->[A,B,C])\n"
	]
	}
	],
	"source": [
	"store = pd.HDFStore('foo.h5')\n",
	"store.append('test1', df, data_columns = df.columns)\n",
	"store.put('test2', df, data_columns = df.columns)\n",
	"df.to_hdf(store, 'test3') # default format='fixed'\n",
	"df.to_hdf(store, 'test4', format='table') # don't use this: without data_columns, all columns are merged together\n",
	"df.to_hdf(store, 'test5', format='table', data_columns=True)\n",
	"\n",
	"print store\n",
	"store.close()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>A</th>\n",
	" <th>B</th>\n",
	" <th>C</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>2000-01-01</th>\n",
	" <td>0.952466</td>\n",
	" <td>0.630784</td>\n",
	" <td>0.985323</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-02</th>\n",
	" <td>0.198525</td>\n",
	" <td>0.526136</td>\n",
	" <td>0.086640</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-03</th>\n",
	" <td>0.189863</td>\n",
	" <td>0.559567</td>\n",
	" <td>0.771268</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-04</th>\n",
	" <td>0.236881</td>\n",
	" <td>0.203953</td>\n",
	" <td>0.509006</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-05</th>\n",
	" <td>0.324636</td>\n",
	" <td>0.143960</td>\n",
	" <td>0.103177</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" A B C\n",
	"2000-01-01 0.952466 0.630784 0.985323\n",
	"2000-01-02 0.198525 0.526136 0.086640\n",
	"2000-01-03 0.189863 0.559567 0.771268\n",
	"2000-01-04 0.236881 0.203953 0.509006\n",
	"2000-01-05 0.324636 0.143960 0.103177"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df5 = pd.read_hdf('foo.h5', key='test5')\n",
	"df5"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>A</th>\n",
	" <th>B</th>\n",
	" <th>C</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>2000-01-01</th>\n",
	" <td>0.952466</td>\n",
	" <td>0.630784</td>\n",
	" <td>0.985323</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-02</th>\n",
	" <td>0.198525</td>\n",
	" <td>0.526136</td>\n",
	" <td>0.086640</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-03</th>\n",
	" <td>0.189863</td>\n",
	" <td>0.559567</td>\n",
	" <td>0.771268</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-04</th>\n",
	" <td>0.236881</td>\n",
	" <td>0.203953</td>\n",
	" <td>0.509006</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-05</th>\n",
	" <td>0.324636</td>\n",
	" <td>0.143960</td>\n",
	" <td>0.103177</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" A B C\n",
	"2000-01-01 0.952466 0.630784 0.985323\n",
	"2000-01-02 0.198525 0.526136 0.086640\n",
	"2000-01-03 0.189863 0.559567 0.771268\n",
	"2000-01-04 0.236881 0.203953 0.509006\n",
	"2000-01-05 0.324636 0.143960 0.103177"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df2 = pd.read_hdf('foo.h5', key='test2')\n",
	"df2"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>A</th>\n",
	" <th>B</th>\n",
	" <th>C</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>2000-01-01</th>\n",
	" <td>0.952466</td>\n",
	" <td>0.630784</td>\n",
	" <td>0.985323</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" A B C\n",
	"2000-01-01 0.952466 0.630784 0.985323"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df5 = pd.read_hdf('foo.h5', key='test5', where=['A>0.5'])\n",
	"df5"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"scrolled": true
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"cannot pass a where specification when reading from a Fixed format store. this store must be selected in its entirety\n"
	]
	}
	],
	"source": [
	"# raises an error, as a fixed-format store cannot be queried\n",
	"\n",
	"try:\n",
	" df2 = pd.read_hdf('foo.h5', key='test2', where=['A>0.5'])\n",
	"except Exception as e:\n",
	" print e"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"The `h5py` library is a purer, lower-level library for storing and writing HDF5 data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"<HDF5 file \"foo.h5\" (mode r+)>\n",
	"[<HDF5 group \"/test1\" (2 members)>, <HDF5 group \"/test2\" (4 members)>, <HDF5 group \"/test3\" (4 members)>, <HDF5 group \"/test4\" (2 members)>, <HDF5 group \"/test5\" (2 members)>]\n"
	]
	}
	],
	"source": [
	"import h5py\n",
	"f = h5py.File(\"foo.h5\")\n",
	"print f\n",
	"print f.values()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [],
	"source": [
	"f['test6'] = a\n",
	"f.attrs['test6'] = 'cool array' # stores an attribute on the file object, under the key 'test6'"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {
	"scrolled": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[[ 0.95246551 0.63078389 0.98532322]\n",
	" [ 0.19852491 0.526136 0.08663984]\n",
	" [ 0.18986281 0.55956696 0.77126847]\n",
	" [ 0.23688076 0.20395321 0.50900627]\n",
	" [ 0.32463634 0.14395971 0.10317721]]\n"
	]
	}
	],
	"source": [
	"print f['test6'][...]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"scrolled": true
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>0</th>\n",
	" <th>1</th>\n",
	" <th>2</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>2000-01-01</th>\n",
	" <td>0.952466</td>\n",
	" <td>0.630784</td>\n",
	" <td>0.985323</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-02</th>\n",
	" <td>0.198525</td>\n",
	" <td>0.526136</td>\n",
	" <td>0.086640</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-03</th>\n",
	" <td>0.189863</td>\n",
	" <td>0.559567</td>\n",
	" <td>0.771268</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-04</th>\n",
	" <td>0.236881</td>\n",
	" <td>0.203953</td>\n",
	" <td>0.509006</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-05</th>\n",
	" <td>0.324636</td>\n",
	" <td>0.143960</td>\n",
	" <td>0.103177</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" 0 1 2\n",
	"2000-01-01 0.952466 0.630784 0.985323\n",
	"2000-01-02 0.198525 0.526136 0.086640\n",
	"2000-01-03 0.189863 0.559567 0.771268\n",
	"2000-01-04 0.236881 0.203953 0.509006\n",
	"2000-01-05 0.324636 0.143960 0.103177"
	]
	},
	"execution_count": 11,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"pd.DataFrame(f['test6'][...], index=index)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([ 0.95246551, 0.63078389, 0.98532322])"
	]
	},
	"execution_count": 12,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"f['test6'][0]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0.95246550876814351"
	]
	},
	"execution_count": 13,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"f['test6'][0,0]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {
	"scrolled": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"test1\n",
	"test2\n",
	"test3\n",
	"test4\n",
	"test5\n",
	"test6\n"
	]
	}
	],
	"source": [
	"for name in f:\n",
	" print name"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {
	"scrolled": true
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[(u'axis0', <HDF5 dataset \"axis0\": shape (3,), type \"|S1\">),\n",
	" (u'axis1', <HDF5 dataset \"axis1\": shape (5,), type \"<i8\">),\n",
	" (u'block0_items', <HDF5 dataset \"block0_items\": shape (3,), type \"|S1\">),\n",
	" (u'block0_values', <HDF5 dataset \"block0_values\": shape (5, 3), type \"<f8\">)]"
	]
	},
	"execution_count": 15,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# data stored by pandas (e.g. with to_hdf) cannot easily be parsed with h5py, as it is kept in a custom layout with a lot of metadata\n",
	"f['test2'].items()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {},
	"outputs": [],
	"source": [
	"f.close()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Reopen explicitly in append mode: h5py >= 3.0 defaults to read-only ('r'),\n",
	"# which would make the dataset writes in the following cells fail.\n",
	"f = h5py.File(\"foo.h5\", \"a\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {},
	"outputs": [],
	"source": [
	"f['test7'] = a"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {},
	"outputs": [],
	"source": [
	"f['test7'].dims[0].label = 'time'\n",
	"f['test7'].dims[1].label = 'y'"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([946684800000000000, 946771200000000000, 946857600000000000,\n",
	" 946944000000000000, 947030400000000000], dtype=int64)"
	]
	},
	"execution_count": 20,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"index.values.astype('<i8')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"f['index7'] = index.values.astype('<i8')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"metadata": {},
	"outputs": [],
	"source": [
	"f['test7'].dims.create_scale(f['index7'])\n",
	"f['test7'].dims[0].attach_scale(f['index7'])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[u'time', u'y']"
	]
	},
	"execution_count": 23,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"[dim.label for dim in f['test7'].dims]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 24,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[u'']"
	]
	},
	"execution_count": 24,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"f['test7'].dims[0].keys()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### 3D dimension scales\n",
	"\n",
	"See the h5py documentation on dimension scales: http://docs.h5py.org/en/latest/high/dims.html"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 25,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"f['test8'] = np.ones((4, 3, 2), 'f')\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 26,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"f['test8'].dims[0].label = 'z'\n",
	"f['test8'].dims[2].label = 'x'\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"f['x1'] = [1, 2]\n",
	"f['x2'] = [1, 1.1]\n",
	"f['y1'] = [0, 1, 2]\n",
	"f['z1'] = [0, 1, 4, 9]\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 28,
	"metadata": {},
	"outputs": [],
	"source": [
	"f['test8'].dims.create_scale(f['x1'])\n",
	"f['test8'].dims.create_scale(f['x2'], 'x2 name')\n",
	"f['test8'].dims.create_scale(f['y1'], 'y1 name')\n",
	"f['test8'].dims.create_scale(f['z1'], 'z1 name')\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 29,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"f['test8'].dims[0].attach_scale(f['z1'])\n",
	"f['test8'].dims[1].attach_scale(f['y1'])\n",
	"f['test8'].dims[2].attach_scale(f['x1'])\n",
	"f['test8'].dims[2].attach_scale(f['x2'])\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 30,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[u'z', u'', u'x']"
	]
	},
	"execution_count": 30,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"[dim.label for dim in f['test8'].dims]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 31,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[u'', u'x2 name']"
	]
	},
	"execution_count": 31,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"f['test8'].dims[2].keys()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 32,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"<HDF5 dataset \"x2\": shape (2,), type \"<f8\">"
	]
	},
	"execution_count": 32,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"f['test8'].dims[2]['x2 name']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 33,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"True"
	]
	},
	"execution_count": 33,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"f['test8'].dims[2]['x2 name'] == f['x2']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 34,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"f.close()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"anaconda-cloud": {},
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.14"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 1
	}