Skip to content

Instantly share code, notes, and snippets.

@orbeckst
Forked from kain88-de/speed benchmark.ipynb
Last active December 13, 2015 19:05
Show Gist options
  • Save orbeckst/2d2e8c12288b202a6a02 to your computer and use it in GitHub Desktop.
Save orbeckst/2d2e8c12288b202a6a02 to your computer and use it in GitHub Desktop.
offset fileformat speed benchmark
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Speed testing persistent offsets\n",
"Trying different mechanisms to serialize a dict data structure with a large numpy array included.\n",
"\n",
"THIS notebook was run on a Macbook Pro Mac OS X 10.6.8 with a Core Duo 2.6 GHz and a SSD disk."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data = {'offsets': np.sort(np.random.uniform(high=2e8, size=int(2e6))),\n",
" 'ctime': 123456789,\n",
" 'size': 987654321}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Pickle Speed test"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import cPickle"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Default pickle "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"with open('test-pickle.pkl', 'wb') as f:\n",
" cPickle.dump(data, f)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-rw-r--r-- 1 oliver staff 41M 13 Dec 11:28 test-pickle.pkl\r\n"
]
}
],
"source": [
"!ls -lh test-pickle.pkl"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fp_pickle = open('test-pickle.pkl', 'rb')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 loops, best of 3: 6.43 s per loop\n"
]
}
],
"source": [
"%timeit cPickle.load(fp_pickle); fp_pickle.seek(0)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fp_pickle.close()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Highest protocol "
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"with open('test-pickle-fast.pkl', 'wb') as f:\n",
" cPickle.dump(data, f, protocol=cPickle.HIGHEST_PROTOCOL)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-rw-r--r-- 1 oliver staff 15M 13 Dec 11:33 test-pickle-fast.pkl\r\n"
]
}
],
"source": [
"!ls -lh test-pickle-fast.pkl"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fp_pickle = open('test-pickle-fast.pkl', 'rb')"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10 loops, best of 3: 39.7 ms per loop\n"
]
}
],
"source": [
"%timeit cPickle.load(fp_pickle); fp_pickle.seek(0)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fp_pickle.close()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Using the `HIGHEST_PROTOCOL` is important, the file size is ~1/3 and the loading speed up is"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"161.9647355163728"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"6.43 / 39.7e-3"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# JSON Speed test"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import json"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data_json = data.copy()\n",
"data_json['offsets'] = list(data_json['offsets'])\n",
"\n",
"with open('test-json.json', 'w') as f:\n",
" json.dump(data_json, f)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-rw-r--r-- 1 oliver staff 38M 13 Dec 11:33 test-json.json\r\n"
]
}
],
"source": [
"!ls -lh test-json.json"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fp_json = open('test-json.json')"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 loops, best of 3: 1.05 s per loop\n"
]
}
],
"source": [
"%timeit json.load(fp_json); fp_json.seek(0)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fp_json.close()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Marshal"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import marshal"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"with open('test-marshal.marsh', 'w') as f:\n",
" marshal.dump(data, f)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-rw-r--r-- 1 oliver staff 15M 13 Dec 11:34 test-marshal.marsh\r\n"
]
}
],
"source": [
"!ls -lh test-marshal.marsh"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fp_marsh = open('test-marshal.marsh')"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100 loops, best of 3: 17.6 ms per loop\n"
]
}
],
"source": [
"%timeit marshal.load(fp_marsh); fp_marsh.seek(0)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fp_marsh.close()"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"## numpy native\n",
"@mnmelo suggested using `numpy.savez` (or `numpy.savez_compressed`): \n",
"\n",
"We can also save the offsets directly as a numpy array. I'm not sure if this entails endianness problems, but might be faster, since it's a method native to the object.\n",
"Since we also have to write out filesize and modification time we can create a second array with these two values and save the whole thing as a packed set of numpy arrays (using `numpy.savez`; and representing the modification time as a long int)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### np.savez (uncompressed) "
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"np.savez(\"test-savez.npz\", offsets=data['offsets'], \n",
" size=data['size'], ctime=data['ctime'])"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-rw-r--r-- 1 oliver staff 15M 13 Dec 11:48 test-savez.npz\r\n"
]
}
],
"source": [
"!ls -lh test-savez.npz"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fp_savez = open(\"test-savez.npz\")"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The slowest run took 6.25 times longer than the fastest. This could mean that an intermediate result is being cached \n",
"10000 loops, best of 3: 126 µs per loop\n"
]
}
],
"source": [
"%timeit np.load(fp_savez); fp_savez.seek(0)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data_loaded = np.load(fp_savez)"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(2000000,)\n",
"123456789\n"
]
}
],
"source": [
"print(data_loaded['offsets'].shape)\n",
"print(data_loaded['ctime'])"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fp_savez.close()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"File size is as small as all the other good solutions. Loading speed compared to `cPickle.HIGHEST_PROTOCOL`: speed up (worst case estimate)"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"50.41269841269841"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"39.7e-3/(6.25 * 126e-6)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### np.savez_compressed "
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"np.savez(\"test-savez-compressed.npz\", offsets=data['offsets'], \n",
" size=data['size'], ctime=data['ctime'])"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-rw-r--r-- 1 oliver staff 15M 13 Dec 11:50 test-savez-compressed.npz\r\n"
]
}
],
"source": [
"!ls -lh test-savez-compressed.npz"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fp_savez = open(\"test-savez-compressed.npz\")"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The slowest run took 6.47 times longer than the fastest. This could mean that an intermediate result is being cached \n",
"10000 loops, best of 3: 137 µs per loop\n"
]
}
],
"source": [
"%timeit np.load(fp_savez); fp_savez.seek(0)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data_loaded = np.load(fp_savez)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(2000000,)\n",
"123456789\n"
]
}
],
"source": [
"print(data_loaded['offsets'].shape)\n",
"print(data_loaded['ctime'])"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fp_savez.close()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment