-
-
Save orbeckst/2d2e8c12288b202a6a02 to your computer and use it in GitHub Desktop.
offset fileformat speed benchmark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Speed testing persistent offsets\n", | |
"Trying different mechanisms to serialize a dict data structure with a large numpy array included.\n", | |
"\n", | |
"THIS notebook was run on a Macbook Pro Mac OS X 10.6.8 with a Core Duo 2.6 GHz and a SSD disk." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"data = {'offsets': np.sort(np.random.uniform(high=2e8, size=int(2e6))),\n", | |
" 'ctime': 123456789,\n", | |
" 'size': 987654321}" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Pickle Speed test" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import cPickle" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Default pickle " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"with open('test-pickle.pkl', 'wb') as f:\n", | |
" cPickle.dump(data, f)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"-rw-r--r-- 1 oliver staff 41M 13 Dec 11:28 test-pickle.pkl\r\n" | |
] | |
} | |
], | |
"source": [ | |
"!ls -lh test-pickle.pkl" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"fp_pickle = open('test-pickle.pkl', 'rb')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 loops, best of 3: 6.43 s per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit cPickle.load(fp_pickle); fp_pickle.seek(0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"fp_pickle.close()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Highest protocol " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"with open('test-pickle-fast.pkl', 'wb') as f:\n", | |
" cPickle.dump(data, f, protocol=cPickle.HIGHEST_PROTOCOL)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"-rw-r--r-- 1 oliver staff 15M 13 Dec 11:33 test-pickle-fast.pkl\r\n" | |
] | |
} | |
], | |
"source": [ | |
"!ls -lh test-pickle-fast.pkl" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"fp_pickle = open('test-pickle-fast.pkl', 'rb')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10 loops, best of 3: 39.7 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit cPickle.load(fp_pickle); fp_pickle.seek(0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"fp_pickle.close()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Using the `HIGHEST_PROTOCOL` is important, the file size is ~1/3 and the loading speed up is" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 65, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"161.9647355163728" | |
] | |
}, | |
"execution_count": 65, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"6.43 / 39.7e-3" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# JSON Speed test" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import json" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"data_json = data.copy()\n", | |
"data_json['offsets'] = list(data_json['offsets'])\n", | |
"\n", | |
"with open('test-json.json', 'w') as f:\n", | |
" json.dump(data_json, f)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"-rw-r--r-- 1 oliver staff 38M 13 Dec 11:33 test-json.json\r\n" | |
] | |
} | |
], | |
"source": [ | |
"!ls -lh test-json.json" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"fp_json = open('test-json.json')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 loops, best of 3: 1.05 s per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit json.load(fp_json); fp_json.seek(0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"fp_json.close()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Marshal" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import marshal" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"with open('test-marshal.marsh', 'w') as f:\n", | |
" marshal.dump(data, f)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"-rw-r--r-- 1 oliver staff 15M 13 Dec 11:34 test-marshal.marsh\r\n" | |
] | |
} | |
], | |
"source": [ | |
"!ls -lh test-marshal.marsh" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"fp_marsh = open('test-marshal.marsh')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"100 loops, best of 3: 17.6 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit marshal.load(fp_marsh); fp_marsh.seek(0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"fp_marsh.close()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"collapsed": true | |
}, | |
"source": [ | |
"## numpy native\n", | |
"@mnmelo suggested using `numpy.savez` (or `numpy.savez_compressed`): \n", | |
"\n", | |
"We can also save the offsets directly as a numpy array. I'm not sure if this entails endianness problems, but might be faster, since it's a method native to the object.\n", | |
"Since we also have to write out filesize and modification time we can create a second array with these two values and save the whole thing as a packed set of numpy arrays (using `numpy.savez`; and representing the modification time as a long int)." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### np.savez (uncompressed) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 50, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"np.savez(\"test-savez.npz\", offsets=data['offsets'], \n", | |
" size=data['size'], ctime=data['ctime'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 51, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"-rw-r--r-- 1 oliver staff 15M 13 Dec 11:48 test-savez.npz\r\n" | |
] | |
} | |
], | |
"source": [ | |
"!ls -lh test-savez.npz" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 52, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"fp_savez = open(\"test-savez.npz\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 53, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The slowest run took 6.25 times longer than the fastest. This could mean that an intermediate result is being cached \n", | |
"10000 loops, best of 3: 126 µs per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit np.load(fp_savez); fp_savez.seek(0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 54, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"data_loaded = np.load(fp_savez)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 55, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(2000000,)\n", | |
"123456789\n" | |
] | |
} | |
], | |
"source": [ | |
"print(data_loaded['offsets'].shape)\n", | |
"print(data_loaded['ctime'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 48, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"fp_savez.close()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"File size is as small as all the other good solutions. Loading speed compared to `cPickle.HIGHEST_PROTOCOL`: speed up (worst case estimate)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 66, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"50.41269841269841" | |
] | |
}, | |
"execution_count": 66, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"39.7e-3/(6.25 * 126e-6)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### np.savez_compressed " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 56, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"np.savez(\"test-savez-compressed.npz\", offsets=data['offsets'], \n", | |
" size=data['size'], ctime=data['ctime'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 57, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"-rw-r--r-- 1 oliver staff 15M 13 Dec 11:50 test-savez-compressed.npz\r\n" | |
] | |
} | |
], | |
"source": [ | |
"!ls -lh test-savez-compressed.npz" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 58, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"fp_savez = open(\"test-savez-compressed.npz\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 59, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The slowest run took 6.47 times longer than the fastest. This could mean that an intermediate result is being cached \n", | |
"10000 loops, best of 3: 137 µs per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit np.load(fp_savez); fp_savez.seek(0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 60, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"data_loaded = np.load(fp_savez)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 61, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(2000000,)\n", | |
"123456789\n" | |
] | |
} | |
], | |
"source": [ | |
"print(data_loaded['offsets'].shape)\n", | |
"print(data_loaded['ctime'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 62, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"fp_savez.close()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.11" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment