michaelaye/store_tests.ipynb Secret

## store_tests.ipynb
{
 "metadata": {
  "name": "store tests"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "import pandas as pd",
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "def getstore_and_print_table(fname):\n    import pprint\n    store = pd.HDFStore(fname)\n    pprint.pprint(store.get_storer('df').group.table)\n    return store",
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "df = pd.DataFrame(randn(1e6,2),columns=list('AB'))",
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "%%timeit\ndf.to_hdf('test.h5','df',data_columns=['A','B'],mode='w',table=True, index=True)",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "1 loops, best of 3: 8.36 s per loop\n"
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "#### Note, how much faster it becomes without creating an index!"
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "%%timeit\ndf.to_hdf('test.h5','df',data_columns=['A','B'],mode='w',table=True, index=False)",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "1 loops, best of 3: 1.28 s per loop\n"
      }
     ],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "store = getstore_and_print_table('test.h5')",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "/df/table (Table(1000000,)) ''\n  description := {\n  \"index\": Int64Col(shape=(), dflt=0, pos=0),\n  \"A\": Float64Col(shape=(), dflt=0.0, pos=1),\n  \"B\": Float64Col(shape=(), dflt=0.0, pos=2)}\n  byteorder := 'little'\n  chunkshape := (5461,)\n"
      }
     ],
     "prompt_number": 6
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "#### The store object still has indexers, I don't really understand how or why. I guess I really should read some pytables intro, sorry Jeff! :-)"
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "store",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 7,
       "text": "<class 'pandas.io.pytables.HDFStore'>\nFile path: test.h5\n/df            frame_table  (typ->appendable,nrows->1000000,ncols->2,indexers->[index],dc->[A,B])"
      }
     ],
     "prompt_number": 7
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "#### The following %timeit result is independent of having data_columns=list('AB') or data_columns=['B'].\nAlso, that the selection works must have something to do with the fact that it has data_columns, even so I created the table with index=False\n"
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "%timeit store.select('df',['B > 0.5', 'B < 1.6'])",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "1 loops, best of 3: 1.02 s per loop\n"
      }
     ],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "%timeit store.select('df',['A<0.5','A>0.0'])",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "1 loops, best of 3: 641 ms per loop\n"
      }
     ],
     "prompt_number": 9
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "#### Here I tested if it makes any difference in creating a optlevel 9 index. I found it negligible. The only important thing is, that this index HAS to be 'full', otherwise ptrepack --sortby will bail."
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "%timeit store.create_table_index('df',columns=['B'],kind='full')",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "1 loops, best of 3: 613 \u00b5s per loop\n"
      }
     ],
     "prompt_number": 10
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "store.get_storer('df').group.table",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 11,
       "text": "/df/table (Table(1000000,)) ''\n  description := {\n  \"index\": Int64Col(shape=(), dflt=0, pos=0),\n  \"A\": Float64Col(shape=(), dflt=0.0, pos=1),\n  \"B\": Float64Col(shape=(), dflt=0.0, pos=2)}\n  byteorder := 'little'\n  chunkshape := (5461,)\n  autoIndex := True\n  colindexes := {\n    \"B\": Index(6, full, shuffle, zlib(1)).is_CSI=True}"
      }
     ],
     "prompt_number": 11
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "No improvement just by creating an index. IIUC, that's because data_columns, that had been miracously created, even so I saved with index=False, had created an index automatically, just not 'full' as required by ptrepack --sortby:"
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "%timeit store.select('df',['B > 0.5', 'B < 1.6'])",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "1 loops, best of 3: 1.05 s per loop\n"
      }
     ],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "%timeit store.select('df',['A<0.5','A>0.0'])",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "1 loops, best of 3: 643 ms per loop\n"
      }
     ],
     "prompt_number": 13
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "store.close()",
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 14
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "!ptdump -v test.h5",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "/ (RootGroup) ''\r\n/df (Group) ''\r\n/df/table (Table(1000000,)) ''\r\n  description := {\r\n  \"index\": Int64Col(shape=(), dflt=0, pos=0),\r\n  \"A\": Float64Col(shape=(), dflt=0.0, pos=1),\r\n  \"B\": Float64Col(shape=(), dflt=0.0, pos=2)}\r\n  byteorder := 'little'\r\n  chunkshape := (5461,)\r\n  autoIndex := True\r\n  colindexes := {\r\n    \"B\": Index(6, full, shuffle, zlib(1)).is_CSI=True}\r\n"
      }
     ],
     "prompt_number": 15
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": "The ptrepack works with a standard Index level 6 as well, the only requirement I found is, that it absolutely has to be a 'full' index, as created above."
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "%timeit !ptrepack --chunkshape=auto --sortby=B -o test.h5 test_sorted_noprop.h5",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "1 loops, best of 3: 4.29 s per loop\n"
      }
     ],
     "prompt_number": 28
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": "See how the lack of --propindex removes the autoindex=True and colindexes part again, as it was before creating the index above in the storer().group.table print-out:"
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "!ptdump -v test_sorted_noprop.h5",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "/ (RootGroup) ''\r\n/df (Group) ''\r\n/df/table (Table(1000000,)) ''\r\n  description := {\r\n  \"index\": Int64Col(shape=(), dflt=0, pos=0),\r\n  \"A\": Float64Col(shape=(), dflt=0.0, pos=1),\r\n  \"B\": Float64Col(shape=(), dflt=0.0, pos=2)}\r\n  byteorder := 'little'\r\n  chunkshape := (5461,)\r\n"
      }
     ],
     "prompt_number": 17
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": "The output of get_storer('df').group.table is exactly the same as the ptdump:"
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "store = getstore_and_print_table('test_sorted_noprop.h5')",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "/df/table (Table(1000000,)) ''\n  description := {\n  \"index\": Int64Col(shape=(), dflt=0, pos=0),\n  \"A\": Float64Col(shape=(), dflt=0.0, pos=1),\n  \"B\": Float64Col(shape=(), dflt=0.0, pos=2)}\n  byteorder := 'little'\n  chunkshape := (5461,)\n"
      }
     ],
     "prompt_number": 18
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "store",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 19,
       "text": "<class 'pandas.io.pytables.HDFStore'>\nFile path: test_sorted_noprop.h5\n/df            frame_table  (typ->appendable,nrows->1000000,ncols->2,indexers->[index],dc->[A,B])"
      }
     ],
     "prompt_number": 19
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "#### But the timing of the selection has improved by factor 5!"
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "%timeit store.select('df',['B > 0.5', 'B < 1.6'])",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "1 loops, best of 3: 188 ms per loop\n"
      }
     ],
     "prompt_number": 20
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": "And even with no --propindex the selection for column 'A' keeps feasible (which I wasn't sure before):"
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "try:\n    %timeit store.select('df',['A<0.5','A>0.0'])\nexcept ValueError as e:\n    print \"ValueError:\",e",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "1 loops, best of 3: 626 ms per loop\n"
      }
     ],
     "prompt_number": 21
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "store.close()",
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 22
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "#### The repacking with --propindex takes factor 7/4.3 longer, with a slightly faster data selection below:"
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "%timeit !ptrepack --chunkshape=auto --sortby=B --propindexes -o test.h5 test_sorted.h5",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "1 loops, best of 3: 7.08 s per loop\n"
      }
     ],
     "prompt_number": 27
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "!ptdump -v test_sorted.h5",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "/ (RootGroup) ''\r\n/df (Group) ''\r\n/df/table (Table(1000000,)) ''\r\n  description := {\r\n  \"index\": Int64Col(shape=(), dflt=0, pos=0),\r\n  \"A\": Float64Col(shape=(), dflt=0.0, pos=1),\r\n  \"B\": Float64Col(shape=(), dflt=0.0, pos=2)}\r\n  byteorder := 'little'\r\n  chunkshape := (5461,)\r\n  autoIndex := True\r\n  colindexes := {\r\n    \"B\": Index(6, full, shuffle, zlib(1)).is_CSI=True}\r\n"
      }
     ],
     "prompt_number": 29
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "store = getstore_and_print_table('test_sorted.h5')",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "/df/table (Table(1000000,)) ''\n  description := {\n  \"index\": Int64Col(shape=(), dflt=0, pos=0),\n  \"A\": Float64Col(shape=(), dflt=0.0, pos=1),\n  \"B\": Float64Col(shape=(), dflt=0.0, pos=2)}\n  byteorder := 'little'\n  chunkshape := (5461,)\n  autoIndex := True\n  colindexes := {\n    \"B\": Index(6, full, shuffle, zlib(1)).is_CSI=True}\n"
      }
     ],
     "prompt_number": 30
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "%timeit store.select('df',['B > 0.5','B < 1.6'])",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "10 loops, best of 3: 172 ms per loop\n"
      }
     ],
     "prompt_number": 31
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "try:\n    %timeit store.select('df',['A<0.5','A>0.0'])\nexcept ValueError as e:\n    print \"ValueError:\",e",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "1 loops, best of 3: 627 ms per loop\n"
      }
     ],
     "prompt_number": 32
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "#### So one has to decide if one wants to spend the almost factor 2 longer ptrepack with the benefit of slightly faster selection."
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "store.close()",
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 33
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "Compression (see next, here done at level 5, I also tested 9) doesn't make timing much worse, but certainly not better also. Interestingly, it only took marginally more than the ptrepack with --propindexes which means that that option is dominating the ptrepacking time.\nThe filesizes for my examples were:\n\n* test.h5 31 M\n* test_sorted_noprop.h5 24 M\n* test_sorted.h5 29 M\n* test_sorted_compressed.h5 22 M\n\nSo my conclusion is, that doing things without index at data collection saves a lot of time, and even quite some space, without even using compression!"
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "%timeit !ptrepack --chunkshape=auto --sortby=B --propindexes --complib=blosc --complevel=5 -o test.h5 test_sorted_compressed.h5",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "1 loops, best of 3: 7.21 s per loop\n"
      }
     ],
     "prompt_number": 34
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "!ptdump -v test_sorted_compressed.h5",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "/ (RootGroup) ''\r\n/df (Group) ''\r\n/df/table (Table(1000000,), shuffle, blosc(5)) ''\r\n  description := {\r\n  \"index\": Int64Col(shape=(), dflt=0, pos=0),\r\n  \"A\": Float64Col(shape=(), dflt=0.0, pos=1),\r\n  \"B\": Float64Col(shape=(), dflt=0.0, pos=2)}\r\n  byteorder := 'little'\r\n  chunkshape := (5461,)\r\n  autoIndex := True\r\n  colindexes := {\r\n    \"B\": Index(6, full, shuffle, zlib(1)).is_CSI=True}\r\n"
      }
     ],
     "prompt_number": 35
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "store = getstore_and_print_table('test_sorted_compressed.h5')",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "/df/table (Table(1000000,), shuffle, blosc(5)) ''\n  description := {\n  \"index\": Int64Col(shape=(), dflt=0, pos=0),\n  \"A\": Float64Col(shape=(), dflt=0.0, pos=1),\n  \"B\": Float64Col(shape=(), dflt=0.0, pos=2)}\n  byteorder := 'little'\n  chunkshape := (5461,)\n  autoIndex := True\n  colindexes := {\n    \"B\": Index(6, full, shuffle, zlib(1)).is_CSI=True}\n"
      }
     ],
     "prompt_number": 36
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "%timeit store.select('df',['B > 0.5','B < 1.6'])",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "1 loops, best of 3: 191 ms per loop\n"
      }
     ],
     "prompt_number": 37
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "store.close()",
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 38
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "",
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}
	{
	"metadata": {
	"name": "store tests"
	},
	"nbformat": 3,
	"nbformat_minor": 0,
	"worksheets": [
	{
	"cells": [
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "import pandas as pd",
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 1
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "def getstore_and_print_table(fname):\n import pprint\n store = pd.HDFStore(fname)\n pprint.pprint(store.get_storer('df').group.table)\n return store",
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 2
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "df = pd.DataFrame(randn(1e6,2),columns=list('AB'))",
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 3
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "%%timeit\ndf.to_hdf('test.h5','df',data_columns=['A','B'],mode='w',table=True, index=True)",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "1 loops, best of 3: 8.36 s per loop\n"
	}
	],
	"prompt_number": 4
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": "#### Note, how much faster it becomes without creating an index!"
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "%%timeit\ndf.to_hdf('test.h5','df',data_columns=['A','B'],mode='w',table=True, index=False)",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "1 loops, best of 3: 1.28 s per loop\n"
	}
	],
	"prompt_number": 5
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "store = getstore_and_print_table('test.h5')",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "/df/table (Table(1000000,)) ''\n description := {\n \"index\": Int64Col(shape=(), dflt=0, pos=0),\n \"A\": Float64Col(shape=(), dflt=0.0, pos=1),\n \"B\": Float64Col(shape=(), dflt=0.0, pos=2)}\n byteorder := 'little'\n chunkshape := (5461,)\n"
	}
	],
	"prompt_number": 6
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": "#### The store object still has indexers, I don't really understand how or why. I guess I really should read some pytables intro, sorry Jeff! :-)"
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "store",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 7,
	"text": "<class 'pandas.io.pytables.HDFStore'>\nFile path: test.h5\n/df frame_table (typ->appendable,nrows->1000000,ncols->2,indexers->[index],dc->[A,B])"
	}
	],
	"prompt_number": 7
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": "#### The following %timeit result is independent of having data_columns=list('AB') or data_columns=['B'].\nAlso, that the selection works must have something to do with the fact that it has data_columns, even so I created the table with index=False\n"
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "%timeit store.select('df',['B > 0.5', 'B < 1.6'])",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "1 loops, best of 3: 1.02 s per loop\n"
	}
	],
	"prompt_number": 8
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "%timeit store.select('df',['A<0.5','A>0.0'])",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "1 loops, best of 3: 641 ms per loop\n"
	}
	],
	"prompt_number": 9
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": "#### Here I tested if it makes any difference in creating a optlevel 9 index. I found it negligible. The only important thing is, that this index HAS to be 'full', otherwise ptrepack --sortby will bail."
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "%timeit store.create_table_index('df',columns=['B'],kind='full')",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "1 loops, best of 3: 613 \u00b5s per loop\n"
	}
	],
	"prompt_number": 10
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "store.get_storer('df').group.table",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 11,
	"text": "/df/table (Table(1000000,)) ''\n description := {\n \"index\": Int64Col(shape=(), dflt=0, pos=0),\n \"A\": Float64Col(shape=(), dflt=0.0, pos=1),\n \"B\": Float64Col(shape=(), dflt=0.0, pos=2)}\n byteorder := 'little'\n chunkshape := (5461,)\n autoIndex := True\n colindexes := {\n \"B\": Index(6, full, shuffle, zlib(1)).is_CSI=True}"
	}
	],
	"prompt_number": 11
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": "No improvement just by creating an index. IIUC, that's because data_columns, that had been miracously created, even so I saved with index=False, had created an index automatically, just not 'full' as required by ptrepack --sortby:"
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "%timeit store.select('df',['B > 0.5', 'B < 1.6'])",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "1 loops, best of 3: 1.05 s per loop\n"
	}
	],
	"prompt_number": 12
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "%timeit store.select('df',['A<0.5','A>0.0'])",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "1 loops, best of 3: 643 ms per loop\n"
	}
	],
	"prompt_number": 13
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "store.close()",
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 14
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "!ptdump -v test.h5",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "/ (RootGroup) ''\r\n/df (Group) ''\r\n/df/table (Table(1000000,)) ''\r\n description := {\r\n \"index\": Int64Col(shape=(), dflt=0, pos=0),\r\n \"A\": Float64Col(shape=(), dflt=0.0, pos=1),\r\n \"B\": Float64Col(shape=(), dflt=0.0, pos=2)}\r\n byteorder := 'little'\r\n chunkshape := (5461,)\r\n autoIndex := True\r\n colindexes := {\r\n \"B\": Index(6, full, shuffle, zlib(1)).is_CSI=True}\r\n"
	}
	],
	"prompt_number": 15
	},
	{
	"cell_type": "heading",
	"level": 4,
	"metadata": {},
	"source": "The ptrepack works with a standard Index level 6 as well, the only requirement I found is, that it absolutely has to be a 'full' index, as created above."
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "%timeit !ptrepack --chunkshape=auto --sortby=B -o test.h5 test_sorted_noprop.h5",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "1 loops, best of 3: 4.29 s per loop\n"
	}
	],
	"prompt_number": 28
	},
	{
	"cell_type": "heading",
	"level": 4,
	"metadata": {},
	"source": "See how the lack of --propindex removes the autoindex=True and colindexes part again, as it was before creating the index above in the storer().group.table print-out:"
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "!ptdump -v test_sorted_noprop.h5",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "/ (RootGroup) ''\r\n/df (Group) ''\r\n/df/table (Table(1000000,)) ''\r\n description := {\r\n \"index\": Int64Col(shape=(), dflt=0, pos=0),\r\n \"A\": Float64Col(shape=(), dflt=0.0, pos=1),\r\n \"B\": Float64Col(shape=(), dflt=0.0, pos=2)}\r\n byteorder := 'little'\r\n chunkshape := (5461,)\r\n"
	}
	],
	"prompt_number": 17
	},
	{
	"cell_type": "heading",
	"level": 4,
	"metadata": {},
	"source": "The output of get_storer('df').group.table is exactly the same as the ptdump:"
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "store = getstore_and_print_table('test_sorted_noprop.h5')",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "/df/table (Table(1000000,)) ''\n description := {\n \"index\": Int64Col(shape=(), dflt=0, pos=0),\n \"A\": Float64Col(shape=(), dflt=0.0, pos=1),\n \"B\": Float64Col(shape=(), dflt=0.0, pos=2)}\n byteorder := 'little'\n chunkshape := (5461,)\n"
	}
	],
	"prompt_number": 18
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "store",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 19,
	"text": "<class 'pandas.io.pytables.HDFStore'>\nFile path: test_sorted_noprop.h5\n/df frame_table (typ->appendable,nrows->1000000,ncols->2,indexers->[index],dc->[A,B])"
	}
	],
	"prompt_number": 19
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": "#### But the timing of the selection has improved by factor 5!"
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "%timeit store.select('df',['B > 0.5', 'B < 1.6'])",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "1 loops, best of 3: 188 ms per loop\n"
	}
	],
	"prompt_number": 20
	},
	{
	"cell_type": "heading",
	"level": 4,
	"metadata": {},
	"source": "And even with no --propindex the selection for column 'A' keeps feasible (which I wasn't sure before):"
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "try:\n %timeit store.select('df',['A<0.5','A>0.0'])\nexcept ValueError as e:\n print \"ValueError:\",e",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "1 loops, best of 3: 626 ms per loop\n"
	}
	],
	"prompt_number": 21
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "store.close()",
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 22
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": "#### The repacking with --propindex takes factor 7/4.3 longer, with a slightly faster data selection below:"
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "%timeit !ptrepack --chunkshape=auto --sortby=B --propindexes -o test.h5 test_sorted.h5",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "1 loops, best of 3: 7.08 s per loop\n"
	}
	],
	"prompt_number": 27
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "!ptdump -v test_sorted.h5",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "/ (RootGroup) ''\r\n/df (Group) ''\r\n/df/table (Table(1000000,)) ''\r\n description := {\r\n \"index\": Int64Col(shape=(), dflt=0, pos=0),\r\n \"A\": Float64Col(shape=(), dflt=0.0, pos=1),\r\n \"B\": Float64Col(shape=(), dflt=0.0, pos=2)}\r\n byteorder := 'little'\r\n chunkshape := (5461,)\r\n autoIndex := True\r\n colindexes := {\r\n \"B\": Index(6, full, shuffle, zlib(1)).is_CSI=True}\r\n"
	}
	],
	"prompt_number": 29
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "store = getstore_and_print_table('test_sorted.h5')",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "/df/table (Table(1000000,)) ''\n description := {\n \"index\": Int64Col(shape=(), dflt=0, pos=0),\n \"A\": Float64Col(shape=(), dflt=0.0, pos=1),\n \"B\": Float64Col(shape=(), dflt=0.0, pos=2)}\n byteorder := 'little'\n chunkshape := (5461,)\n autoIndex := True\n colindexes := {\n \"B\": Index(6, full, shuffle, zlib(1)).is_CSI=True}\n"
	}
	],
	"prompt_number": 30
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "%timeit store.select('df',['B > 0.5','B < 1.6'])",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "10 loops, best of 3: 172 ms per loop\n"
	}
	],
	"prompt_number": 31
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "try:\n %timeit store.select('df',['A<0.5','A>0.0'])\nexcept ValueError as e:\n print \"ValueError:\",e",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "1 loops, best of 3: 627 ms per loop\n"
	}
	],
	"prompt_number": 32
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": "#### So one has to decide if one wants to spend the almost factor 2 longer ptrepack with the benefit of slightly faster selection."
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "store.close()",
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 33
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": "Compression (see next, here done at level 5, I also tested 9) doesn't make timing much worse, but certainly not better also. Interestingly, it only took marginally more than the ptrepack with --propindexes which means that that option is dominating the ptrepacking time.\nThe filesizes for my examples were:\n\n* test.h5 31 M\n* test_sorted_noprop.h5 24 M\n* test_sorted.h5 29 M\n* test_sorted_compressed.h5 22 M\n\nSo my conclusion is, that doing things without index at data collection saves a lot of time, and even quite some space, without even using compression!"
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "%timeit !ptrepack --chunkshape=auto --sortby=B --propindexes --complib=blosc --complevel=5 -o test.h5 test_sorted_compressed.h5",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "1 loops, best of 3: 7.21 s per loop\n"
	}
	],
	"prompt_number": 34
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "!ptdump -v test_sorted_compressed.h5",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "/ (RootGroup) ''\r\n/df (Group) ''\r\n/df/table (Table(1000000,), shuffle, blosc(5)) ''\r\n description := {\r\n \"index\": Int64Col(shape=(), dflt=0, pos=0),\r\n \"A\": Float64Col(shape=(), dflt=0.0, pos=1),\r\n \"B\": Float64Col(shape=(), dflt=0.0, pos=2)}\r\n byteorder := 'little'\r\n chunkshape := (5461,)\r\n autoIndex := True\r\n colindexes := {\r\n \"B\": Index(6, full, shuffle, zlib(1)).is_CSI=True}\r\n"
	}
	],
	"prompt_number": 35
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "store = getstore_and_print_table('test_sorted_compressed.h5')",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "/df/table (Table(1000000,), shuffle, blosc(5)) ''\n description := {\n \"index\": Int64Col(shape=(), dflt=0, pos=0),\n \"A\": Float64Col(shape=(), dflt=0.0, pos=1),\n \"B\": Float64Col(shape=(), dflt=0.0, pos=2)}\n byteorder := 'little'\n chunkshape := (5461,)\n autoIndex := True\n colindexes := {\n \"B\": Index(6, full, shuffle, zlib(1)).is_CSI=True}\n"
	}
	],
	"prompt_number": 36
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "%timeit store.select('df',['B > 0.5','B < 1.6'])",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "1 loops, best of 3: 191 ms per loop\n"
	}
	],
	"prompt_number": 37
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "store.close()",
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 38
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "",
	"language": "python",
	"metadata": {},
	"outputs": []
	}
	],
	"metadata": {}
	}
	]
	}