
@kain88-de
Last active December 17, 2015 20:33
offset fileformat speed benchmark
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Speed testing persistent offsets\n",
"Trying different mechanisms to serialize a dict data structure with a large numpy array included.\n",
"\n",
"THIS notebook was run on a Macbook Pro Mac OS X 10.6.8 with a Core Duo 2.6 GHz and a SSD disk."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data = {'offsets': np.sort(np.random.uniform(high=2e8, size=int(2e6))),\n",
" 'ctime': 123456789,\n",
" 'size': 987654321}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Pickle Speed test"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import cPickle"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Default pickle "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"with open('test-pickle.pkl', 'wb') as f:\n",
" cPickle.dump(data, f)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-rw-r--r-- 1 max max 42M Dec 17 17:53 test-pickle.pkl\r\n"
]
}
],
"source": [
"!ls -lh test-pickle.pkl"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fp_pickle = open('test-pickle.pkl', 'rb')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 loops, best of 3: 1.66 s per loop\n"
]
}
],
"source": [
"%timeit cPickle.load(fp_pickle); fp_pickle.seek(0)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fp_pickle.close()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Highest protocol "
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"with open('test-pickle-fast.pkl', 'wb') as f:\n",
" cPickle.dump(data, f, protocol=cPickle.HIGHEST_PROTOCOL)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-rw-r--r-- 1 max max 16M Dec 17 17:53 test-pickle-fast.pkl\r\n"
]
}
],
"source": [
"!ls -lh test-pickle-fast.pkl"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fp_pickle = open('test-pickle-fast.pkl', 'rb')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100 loops, best of 3: 17.8 ms per loop\n"
]
}
],
"source": [
"%timeit cPickle.load(fp_pickle); fp_pickle.seek(0)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fp_pickle.close()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Using the `HIGHEST_PROTOCOL` is important, the file size is ~1/3 and the loading speed up is"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"107.59493670886074"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"1.7 / 15.8e-3"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# JSON Speed test"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import json"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data_json = data.copy()\n",
"data_json['offsets'] = list(data_json['offsets'])\n",
"\n",
"with open('test-json.json', 'w') as f:\n",
" json.dump(data_json, f)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-rw-r--r-- 1 max max 38M Dec 17 17:53 test-json.json\r\n"
]
}
],
"source": [
"!ls -lh test-json.json"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fp_json = open('test-json.json')"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 loops, best of 3: 590 ms per loop\n"
]
}
],
"source": [
"%timeit json.load(fp_json); fp_json.seek(0)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fp_json.close()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Marshal"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import marshal"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"with open('test-marshal.marsh', 'w') as f:\n",
" marshal.dump(data, f)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-rw-r--r-- 1 max max 16M Dec 17 17:53 test-marshal.marsh\r\n"
]
}
],
"source": [
"!ls -lh test-marshal.marsh"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fp_marsh = open('test-marshal.marsh')\n",
"#fp_marsh.close()\n",
"d = marshal.load(fp_marsh)\n",
"o=d['offsets']\n",
"c=d['ctime']\n",
"s=d['size']"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100 loops, best of 3: 2.75 ms per loop\n"
]
}
],
"source": [
"%%timeit fp = open('test-marshal.marsh')\n",
"fp.seek(0)\n",
"marshal.load(fp)\n",
"o=d['offsets']\n",
"c=d['ctime']\n",
"s=d['size']"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fp_marsh.close()"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"## numpy native\n",
"@mnmelo suggested using `numpy.savez` (or `numpy.savez_compressed`): \n",
"\n",
"We can also save the offsets directly as a numpy array. I'm not sure if this entails endianness problems, but might be faster, since it's a method native to the object.\n",
"Since we also have to write out filesize and modification time we can create a second array with these two values and save the whole thing as a packed set of numpy arrays (using `numpy.savez`; and representing the modification time as a long int)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### np.savez (uncompressed) "
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"np.savez(\"test-savez.npz\", offsets=data['offsets'], \n",
" size=data['size'], ctime=data['ctime'])"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-rw-r--r-- 1 max max 16M Dec 17 17:53 test-savez.npz\r\n"
]
}
],
"source": [
"!ls -lh test-savez.npz"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fp_savez = open(\"test-savez.npz\")"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100 loops, best of 3: 18.3 ms per loop\n"
]
}
],
"source": [
"%%timeit fp_savez = open('test-savez.npz')\n",
"d=np.load(fp_savez)\n",
"o=d['offsets']\n",
"c=d['ctime']\n",
"s=d['size']"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data_loaded = np.load(fp_savez)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(2000000,)\n",
"123456789\n"
]
}
],
"source": [
"print(data_loaded['offsets'].shape)\n",
"print(data_loaded['ctime'])"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['size', 'ctime', 'offsets']"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_loaded.keys()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fp_savez.close()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"File size is as small as all the other good solutions. Loading speed compared to `cPickle.HIGHEST_PROTOCOL`: speed up (worst case estimate)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"6343.851688210885"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"1.7/(8.17 * 32.8e-6)"
]
},
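{
"cell_type": "markdown",
"metadata": {},
"source": [
"Regarding the endianness question raised above: the `.npy` headers inside the `.npz` archive record each array's dtype, including its byte order, so the data should round-trip across platforms. If a fixed on-disk byte order is wanted anyway, a minimal (untested) sketch forces little-endian before saving; the file name `test-savez-le.npz` is only an example:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# sketch: pin the on-disk byte order to little-endian ('<f8' = little-endian float64)\n",
"print(data['offsets'].dtype.byteorder) # '=' means native byte order\n",
"offsets_le = data['offsets'].astype('<f8')\n",
"np.savez('test-savez-le.npz', offsets=offsets_le,\n",
" size=data['size'], ctime=data['ctime'])"
]
},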
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### np.savez_compressed "
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"np.savez_compressed(\"test-savez-compressed.npz\", offsets=data['offsets'], \n",
" size=data['size'], ctime=data['ctime'])"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-rw-r--r-- 1 max max 11M Dec 14 11:39 test-savez-compressed.npz\r\n"
]
}
],
"source": [
"!ls -lh test-savez-compressed.npz"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fp_savez = open(\"test-savez-compressed.npz\")"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The slowest run took 7.88 times longer than the fastest. This could mean that an intermediate result is being cached \n",
"10000 loops, best of 3: 33.5 µs per loop\n"
]
}
],
"source": [
"%timeit np.load(fp_savez); fp_savez.seek(0)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data_loaded = np.load(fp_savez)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(2000000,)\n",
"123456789\n"
]
}
],
"source": [
"print(data_loaded['offsets'].shape)\n",
"print(data_loaded['ctime'])"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fp_savez.close()"
]
},
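{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that the ~33 µs measured above only covers opening the zip archive: `np.load` returns a lazy `NpzFile`, and the compressed data is decompressed only when an array is accessed. A fairer timing (a sketch, not run here) includes reading the arrays:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%%timeit\n",
"# sketch: time the full read, including decompression of the stored arrays\n",
"with open('test-savez-compressed.npz', 'rb') as fp:\n",
" d = np.load(fp)\n",
" o = d['offsets']\n",
" c = d['ctime']\n",
" s = d['size']"
]
},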
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
@orbeckst

I forked the notebook https://gist.github.com/orbeckst/2d2e8c12288b202a6a02 and added benchmarks for

  • cPickle.HIGHEST_PROTOCOL
  • numpy.savez

Bottom line: if you pickle, always use the highest protocol; it reduces the file size to about 1/3 and speeds up loading by a factor of 160 (in this case). The numpy native format is even faster, maybe by another factor of 50.

@kain88-de
Author

Nope, numpy is slower once you actually read the data.
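
For example (a sketch, assuming the files produced in the notebook are still on disk), timing the full read rather than just the `np.load` call:

```python
import timeit
import cPickle
import numpy as np

def load_pickle():
    # cPickle.load materializes the whole dict in one go
    with open('test-pickle-fast.pkl', 'rb') as f:
        return cPickle.load(f)

def load_npz():
    # np.load is lazy; indexing the NpzFile forces the actual read/decompression
    with open('test-savez-compressed.npz', 'rb') as f:
        d = np.load(f)
        return d['offsets'], d['ctime'], d['size']

print('pickle: %.3f s' % (timeit.timeit(load_pickle, number=10) / 10))
print('npz:    %.3f s' % (timeit.timeit(load_npz, number=10) / 10))
```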
