Skip to content

Instantly share code, notes, and snippets.

@kavvkon
Created February 26, 2018 11:13
Show Gist options
  • Save kavvkon/47ad3d1b03b5ac963c25f8f865fd41e0 to your computer and use it in GitHub Desktop.
Save kavvkon/47ad3d1b03b5ac963c25f8f865fd41e0 to your computer and use it in GitHub Desktop.
HDF writing and reading examples
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2000-01-01</th>\n",
" <td>0.952466</td>\n",
" <td>0.630784</td>\n",
" <td>0.985323</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2000-01-02</th>\n",
" <td>0.198525</td>\n",
" <td>0.526136</td>\n",
" <td>0.086640</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2000-01-03</th>\n",
" <td>0.189863</td>\n",
" <td>0.559567</td>\n",
" <td>0.771268</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2000-01-04</th>\n",
" <td>0.236881</td>\n",
" <td>0.203953</td>\n",
" <td>0.509006</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2000-01-05</th>\n",
" <td>0.324636</td>\n",
" <td>0.143960</td>\n",
" <td>0.103177</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C\n",
"2000-01-01 0.952466 0.630784 0.985323\n",
"2000-01-02 0.198525 0.526136 0.086640\n",
"2000-01-03 0.189863 0.559567 0.771268\n",
"2000-01-04 0.236881 0.203953 0.509006\n",
"2000-01-05 0.324636 0.143960 0.103177"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Build a small demo frame: five consecutive daily timestamps x three columns.\n",
"# pd.date_range is used instead of DatetimeIndex(start=..., end=..., freq=...),\n",
"# a constructor form that newer pandas releases no longer accept.\n",
"index = pd.date_range(start='01/01/2000', end='01/05/2000', freq='D')\n",
"a = np.random.rand(5, 3)  # raw values, reused later for the h5py examples\n",
"\n",
"df = pd.DataFrame(a, columns=('A', 'B', 'C'), index=index)\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"http://pandas.pydata.org/pandas-docs/stable/io.html#io-hdf5\n",
"\n",
"There are two ways to store data: \n",
"* fixed format is neither appendable nor queryable, but it is pretty fast\n",
"* table format is shaped like a dataframe: you can append to it, query it, and read it from an HDF viewer, but it is slower"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.io.pytables.HDFStore'>\n",
"File path: foo.h5\n",
"/test1 frame_table (typ->appendable,nrows->5,ncols->3,indexers->[index],dc->[A,B,C])\n",
"/test2 frame (shape->[5,3]) \n",
"/test3 frame (shape->[5,3]) \n",
"/test4 frame_table (typ->appendable,nrows->5,ncols->3,indexers->[index]) \n",
"/test5 frame_table (typ->appendable,nrows->5,ncols->3,indexers->[index],dc->[A,B,C])\n"
]
}
],
"source": [
"# Write the same frame under five keys to compare the storage variants.\n",
"# mode='w' starts from an empty file so that re-running this cell (or the whole\n",
"# notebook) neither appends duplicate rows to 'test1' nor leaves behind datasets\n",
"# from an earlier run. NOTE: this discards any existing foo.h5.\n",
"# The with-block closes the store even if one of the writes raises.\n",
"with pd.HDFStore('foo.h5', mode='w') as store:\n",
"    # append() writes the table format; every column is registered as a data column\n",
"    store.append('test1', df, data_columns=df.columns)\n",
"    # put() defaults to the fixed format\n",
"    store.put('test2', df, data_columns=df.columns)\n",
"    df.to_hdf(store, key='test3')  # default format='fixed'\n",
"    # table format without data_columns: the columns are not registered as data\n",
"    # columns, so prefer data_columns=True (next line) when you need to query by column\n",
"    df.to_hdf(store, key='test4', format='table')\n",
"    df.to_hdf(store, key='test5', format='table', data_columns=True)\n",
"\n",
"    # Newer pandas shortens this repr; print(store.info()) gives the per-key listing there.\n",
"    print(store)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2000-01-01</th>\n",
" <td>0.952466</td>\n",
" <td>0.630784</td>\n",
" <td>0.985323</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2000-01-02</th>\n",
" <td>0.198525</td>\n",
" <td>0.526136</td>\n",
" <td>0.086640</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2000-01-03</th>\n",
" <td>0.189863</td>\n",
" <td>0.559567</td>\n",
" <td>0.771268</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2000-01-04</th>\n",
" <td>0.236881</td>\n",
" <td>0.203953</td>\n",
" <td>0.509006</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2000-01-05</th>\n",
" <td>0.324636</td>\n",
" <td>0.143960</td>\n",
" <td>0.103177</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C\n",
"2000-01-01 0.952466 0.630784 0.985323\n",
"2000-01-02 0.198525 0.526136 0.086640\n",
"2000-01-03 0.189863 0.559567 0.771268\n",
"2000-01-04 0.236881 0.203953 0.509006\n",
"2000-01-05 0.324636 0.143960 0.103177"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df5 = pd.read_hdf('foo.h5', key='test5')\n",
"df5"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2000-01-01</th>\n",
" <td>0.952466</td>\n",
" <td>0.630784</td>\n",
" <td>0.985323</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2000-01-02</th>\n",
" <td>0.198525</td>\n",
" <td>0.526136</td>\n",
" <td>0.086640</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2000-01-03</th>\n",
" <td>0.189863</td>\n",
" <td>0.559567</td>\n",
" <td>0.771268</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2000-01-04</th>\n",
" <td>0.236881</td>\n",
" <td>0.203953</td>\n",
" <td>0.509006</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2000-01-05</th>\n",
" <td>0.324636</td>\n",
" <td>0.143960</td>\n",
" <td>0.103177</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C\n",
"2000-01-01 0.952466 0.630784 0.985323\n",
"2000-01-02 0.198525 0.526136 0.086640\n",
"2000-01-03 0.189863 0.559567 0.771268\n",
"2000-01-04 0.236881 0.203953 0.509006\n",
"2000-01-05 0.324636 0.143960 0.103177"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2 = pd.read_hdf('foo.h5', key='test2')\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2000-01-01</th>\n",
" <td>0.952466</td>\n",
" <td>0.630784</td>\n",
" <td>0.985323</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C\n",
"2000-01-01 0.952466 0.630784 0.985323"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df5 = pd.read_hdf('foo.h5', key='test5', where=['A>0.5'])\n",
"df5"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cannot pass a where specification when reading from a Fixed format store. this store must be selected in its entirety\n"
]
}
],
"source": [
"# A fixed-format store cannot be queried: passing `where` raises an error.\n",
"# The error is caught and printed on purpose, because the message is the demonstration.\n",
"\n",
"try:\n",
"    df2 = pd.read_hdf('foo.h5', key='test2', where=['A>0.5'])\n",
"except TypeError as e:  # pandas raises TypeError for `where` on a fixed-format store\n",
"    print(e)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `h5py` library is a purer, lower-level library for storing and reading HDF5 data"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<HDF5 file \"foo.h5\" (mode r+)>\n",
"[<HDF5 group \"/test1\" (2 members)>, <HDF5 group \"/test2\" (4 members)>, <HDF5 group \"/test3\" (4 members)>, <HDF5 group \"/test4\" (2 members)>, <HDF5 group \"/test5\" (2 members)>]\n"
]
}
],
"source": [
"import h5py\n",
"\n",
"# Open explicitly in append mode ('a': read/write, create the file if missing).\n",
"# Without a mode argument, h5py >= 3.0 opens the file read-only and the\n",
"# dataset writes in the following cells fail.\n",
"f = h5py.File(\"foo.h5\", \"a\")\n",
"print(f)\n",
"# list() materialises the members so they are printed, not a view object\n",
"print(list(f.values()))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"f['test6'] = a\n",
"f.attrs['test6'] = 'cool array' #store attributes"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 0.95246551 0.63078389 0.98532322]\n",
" [ 0.19852491 0.526136 0.08663984]\n",
" [ 0.18986281 0.55956696 0.77126847]\n",
" [ 0.23688076 0.20395321 0.50900627]\n",
" [ 0.32463634 0.14395971 0.10317721]]\n"
]
}
],
"source": [
"# [...] reads the whole dataset into memory as a NumPy array\n",
"print(f['test6'][...])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2000-01-01</th>\n",
" <td>0.952466</td>\n",
" <td>0.630784</td>\n",
" <td>0.985323</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2000-01-02</th>\n",
" <td>0.198525</td>\n",
" <td>0.526136</td>\n",
" <td>0.086640</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2000-01-03</th>\n",
" <td>0.189863</td>\n",
" <td>0.559567</td>\n",
" <td>0.771268</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2000-01-04</th>\n",
" <td>0.236881</td>\n",
" <td>0.203953</td>\n",
" <td>0.509006</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2000-01-05</th>\n",
" <td>0.324636</td>\n",
" <td>0.143960</td>\n",
" <td>0.103177</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0 1 2\n",
"2000-01-01 0.952466 0.630784 0.985323\n",
"2000-01-02 0.198525 0.526136 0.086640\n",
"2000-01-03 0.189863 0.559567 0.771268\n",
"2000-01-04 0.236881 0.203953 0.509006\n",
"2000-01-05 0.324636 0.143960 0.103177"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame(f['test6'][...], index=index)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.95246551, 0.63078389, 0.98532322])"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"f['test6'][0]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.95246550876814351"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"f['test6'][0,0]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"test1\n",
"test2\n",
"test3\n",
"test4\n",
"test5\n",
"test6\n"
]
}
],
"source": [
"# Iterating over the open file yields the names of its top-level members\n",
"for name in f:\n",
"    print(name)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"[(u'axis0', <HDF5 dataset \"axis0\": shape (3,), type \"|S1\">),\n",
" (u'axis1', <HDF5 dataset \"axis1\": shape (5,), type \"<i8\">),\n",
" (u'block0_items', <HDF5 dataset \"block0_items\": shape (3,), type \"|S1\">),\n",
" (u'block0_values', <HDF5 dataset \"block0_values\": shape (5, 3), type \"<f8\">)]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#cannot really parse data stored with pd.to_hdf method as they are stored in a custom format with a lot of metadata\n",
"f['test2'].items()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"f.close()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Re-open for writing; an explicit 'a' is required because h5py >= 3.0 defaults to read-only\n",
"f = h5py.File(\"foo.h5\", \"a\")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"f['test7'] = a"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"f['test7'].dims[0].label = 'time'\n",
"f['test7'].dims[1].label = 'y'"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([946684800000000000, 946771200000000000, 946857600000000000,\n",
" 946944000000000000, 947030400000000000], dtype=int64)"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"index.values.astype('<i8')"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"f['index7'] = index.values.astype('<i8')"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"f['test7'].dims.create_scale(f['index7'])\n",
"f['test7'].dims[0].attach_scale(f['index7'])"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[u'time', u'y']"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[dim.label for dim in f['test7'].dims]"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[u'']"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"f['test7'].dims[0].keys()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3d dimension scales \n",
"http://docs.h5py.org/en/latest/high/dims.html"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"f['test8'] = np.ones((4, 3, 2), 'f')\n"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"f['test8'].dims[0].label = 'z'\n",
"f['test8'].dims[2].label = 'x'\n"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"f['x1'] = [1, 2]\n",
"f['x2'] = [1, 1.1]\n",
"f['y1'] = [0, 1, 2]\n",
"f['z1'] = [0, 1, 4, 9]\n"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"f['test8'].dims.create_scale(f['x1'])\n",
"f['test8'].dims.create_scale(f['x2'], 'x2 name')\n",
"f['test8'].dims.create_scale(f['y1'], 'y1 name')\n",
"f['test8'].dims.create_scale(f['z1'], 'z1 name')\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"f['test8'].dims[0].attach_scale(f['z1'])\n",
"f['test8'].dims[1].attach_scale(f['y1'])\n",
"f['test8'].dims[2].attach_scale(f['x1'])\n",
"f['test8'].dims[2].attach_scale(f['x2'])\n"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[u'z', u'', u'x']"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[dim.label for dim in f['test8'].dims]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[u'', u'x2 name']"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"f['test8'].dims[2].keys()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<HDF5 dataset \"x2\": shape (2,), type \"<f8\">"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"f['test8'].dims[2]['x2 name']"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"f['test8'].dims[2]['x2 name'] == f['x2']"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"f.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.14"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment