Skip to content

Instantly share code, notes, and snippets.

@rossant
Last active August 29, 2015 13:57
Show Gist options
  • Save rossant/9467085 to your computer and use it in GitHub Desktop.
Save rossant/9467085 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:b57688628ab9dced9a7a1cd4ca63a4ecbbb88f993ce65bc2e0a4f5cf9fe4e278"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Slicing huge arrays in h5py"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's import h5py, NumPy, and os (the latter is used to check whether the test files already exist)."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import h5py\n",
"import numpy as np\n",
"import os"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Chunked array"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Shape of the test datasets created below: n rows, k columns.\n",
"n = 1000000\n",
"k = 500"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"if not os.path.exists('test_ind'):\n",
"    # Build the file under a temporary name and rename it only once it is\n",
"    # completely filled: an interrupted run then cannot leave a partial\n",
"    # 'test_ind' behind that this guard would mistake for a finished file.\n",
"    with h5py.File(\"test_ind.tmp\", \"w\") as f:\n",
"        # Chunked layout: each chunk holds 100 full rows.\n",
"        a = f.create_dataset('/test', shape=(n, k),\n",
"                             chunks=(100, k))\n",
"        # We fill the array progressively: each iteration writes one slab\n",
"        # of n//10 rows of fresh random values.\n",
"        step = n // 10\n",
"        for i in range(10):\n",
"            print i,\n",
"            a[i*step:(i+1)*step,...] = np.random.rand(step, k)\n",
"    os.rename(\"test_ind.tmp\", \"test_ind\")"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"0 "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"1 "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"2 "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"3 "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"4 "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"5 "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"6 "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"7 "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"8 "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"9\n"
]
}
],
"prompt_number": 12
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
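"Now, let's do some benchmarks: we read 1000 randomly chosen rows (sorted by index) one at a time, using either an integer index or a one-row slice."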
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Fixed seed so that every run (and both layouts benchmarked in this\n",
"# notebook) reads the same 1000 rows; indices are sorted in increasing order.\n",
"rng = np.random.RandomState(0)\n",
"ind = np.sort(rng.randint(low=0, high=n, size=1000))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"with h5py.File(\"test_ind\", \"r\") as h5file:\n",
"    a = h5file['/test']\n",
"\n",
"    # Integer index per row: a single run first (-r1 -n1), then %timeit's\n",
"    # default repeated runs.\n",
"    %timeit -r1 -n1 [a[i,...] for i in ind]\n",
"    %timeit [a[i,...] for i in ind]\n",
"    # One-row slice per row: the same two measurements.\n",
"    %timeit -r1 -n1 [a[i:i+1,...] for i in ind]\n",
"    %timeit [a[i:i+1,...] for i in ind]\n",
"    # Disabled: read the whole dataset, then index the in-memory copy.\n",
"    #%timeit -r1 -n1 a[:,...]\n",
"    #%timeit -r1 -n1 a[:,...][ind,...]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"1 loops, best of 1: 198 ms per loop\n",
"10 loops, best of 3: 180 ms per loop"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"1 loops, best of 1: 177 ms per loop"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"10 loops, best of 3: 177 ms per loop"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
}
],
"prompt_number": 16
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Contiguous array"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's repeat the same benchmark with a contiguous (non-chunked) dataset."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"if not os.path.exists('test_ind_contiguous'):\n",
"    # Build the file under a temporary name and rename it only once it is\n",
"    # completely filled: an interrupted run then cannot leave a partial\n",
"    # 'test_ind_contiguous' behind that this guard would mistake for a\n",
"    # finished file.\n",
"    with h5py.File(\"test_ind_contiguous.tmp\", \"w\") as f:\n",
"        # No chunks argument: the dataset uses h5py's default (contiguous) layout.\n",
"        a = f.create_dataset('/test', shape=(n, k))\n",
"        # We fill the array progressively: each iteration writes one slab\n",
"        # of n//10 rows of fresh random values.\n",
"        step = n // 10\n",
"        for i in range(10):\n",
"            print i,\n",
"            a[i*step:(i+1)*step,...] = np.random.rand(step, k)\n",
"    os.rename(\"test_ind_contiguous.tmp\", \"test_ind_contiguous\")"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"with h5py.File(\"test_ind_contiguous\", \"r\") as h5file:\n",
"    a = h5file['/test']\n",
"\n",
"    # Integer index per row: a single run first (-r1 -n1), then %timeit's\n",
"    # default repeated runs.\n",
"    %timeit -r1 -n1 [a[i,...] for i in ind]\n",
"    %timeit [a[i,...] for i in ind]\n",
"    # One-row slice per row: the same two measurements.\n",
"    %timeit -r1 -n1 [a[i:i+1,...] for i in ind]\n",
"    %timeit [a[i:i+1,...] for i in ind]\n",
"    # Disabled: read the whole dataset, then index the in-memory copy.\n",
"    #%timeit -r1 -n1 a[:,...]\n",
"    #%timeit -r1 -n1 a[:,...][ind,...]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"1 loops, best of 1: 159 ms per loop\n",
"10 loops, best of 3: 149 ms per loop"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"1 loops, best of 1: 147 ms per loop"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"10 loops, best of 3: 141 ms per loop"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
}
],
"prompt_number": 9
}
],
"metadata": {}
}
]
}
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment