@monkeybutter
Created October 23, 2015 04:05
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import glob\n",
"from random import randint\n",
"\n",
"def get_files(path):\n",
" path = \"{}*.h5\".format(path)\n",
" return glob.glob(path)\n",
"\n",
"#start & end both included\n",
"def get_rdm_list(number, start, end):\n",
" out = []\n",
" ready = False\n",
" \n",
" while not ready and len(out) < number:\n",
" n = randint(start, end)\n",
" if n not in out:\n",
" out.append(n)\n",
" \n",
" return out"
]
},
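{
"cell_type": "markdown",
"metadata": {},
"source": [
"Aside: `get_rdm_list` is sampling without replacement, which the stdlib already provides. The cell below is a minimal equivalence sketch (illustration only; the benchmarks keep using `get_rdm_list`)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from random import sample\n",
"\n",
"# Equivalent one-liner: sample `number` unique values from the inclusive range.\n",
"def get_rdm_list_alt(number, start, end):\n",
"    return sample(range(start, end + 1), number)\n",
"\n",
"# Sanity check: 8 unique indices, all within bounds\n",
"idxs = get_rdm_list_alt(8, 0, 79)\n",
"assert len(set(idxs)) == 8 and all(0 <= i <= 79 for i in idxs)"
]
},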
{
"cell_type": "code",
"execution_count": 144,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 17.1 s, sys: 200 ms, total: 17.3 s\n",
"Wall time: 17.7 s\n"
]
}
],
"source": [
"import h5py\n",
"from osgeo import gdal\n",
"\n",
"#image is composed of 8 tile\n",
"def get_image(path):\n",
" paths = get_files(path)\n",
" f_idxs = get_rdm_list(8, 0, len(paths)-1)\n",
" \n",
" for f_idx in f_idxs:\n",
" with h5py.File(paths[f_idx], 'r') as h5f:\n",
" cube = h5f[\"NBAR\"][:, :, randint(0, 5)]\n",
"\n",
"\n",
"#image is composed of 4 tiles\n",
"def get_gdal_image(path):\n",
" paths = get_files(path)\n",
" f_idxs = get_rdm_list(8, 0, len(paths)-1)\n",
" \n",
" for f_idx in f_idxs:\n",
" image = gdal.Open(paths[f_idx])\n",
" for i in range(6):#band = randint(1, 6)\n",
" band_pack = image.GetRasterBand(i+1).ReadAsArray()\n",
" \n",
"\n",
"%time get_image('/g/data1/ep1_2/z00/prl900/hdf5_test/1A_LZF_C1/')"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Compose a one band image 8 tiles\n",
"\n",
"#0_geotiff 35GB\n",
"res_0_geotiff = [8.75, 6.91, 8.29, 6.06, 7.63, 7.01, 13.4, 10.7, 6.14, 7.95]\n",
"\n",
"#1A_hdf5 (4000, 4000, 6) 57GB\n",
"res_1A_hdf5 = [3.59, 3.61, 4.33, 3.61, 3.38, 3.65, 4.03, 3.68, 2.69, 3.23]\n",
"\n",
"#1B_hdf5 (6, 4000, 4000) 57GB\n",
"res_1B_hdf5 = [1.59, 1.32, 1.62, 1.35, 1.5, 1.57, 1.64, 1.66, 1.44, 1.3]\n",
"\n",
"#1A_LZF (4000, 4000, 6) 18GB\n",
"res_1A_LZF = [21.8, 26.9, 27.6, 27.6, 19.7, 22.7, 22.7, 24.6, 27.9, 22.2]\n",
"\n",
"#1A_GZIP (4000, 4000, 6) 13GB\n",
"res_1A_GZIP = [25.1, 26.8, 24.4, 26.4, 26.6, 26.1, 27.7, 25.2, 26.9, 21.7]\n",
"\n",
"#1B_LZF (6, 4000, 4000) 16GB\n",
"res_1B_LZF = [8.53, 12.2, 6.87, 6.29, 7.49, 11.6, 8.9, 9.95, 8.24, 3.86]\n",
"\n",
"#1B_GZIP (6, 4000, 4000) 11GB\n",
"res_1B_GZIP = [8.14, 10.4, 4.23, 9.46, 7.93, 4.16, 9.16, 6.36, 5.56, 5.42]\n",
"\n",
"#1A_LZF_C1 (4000, 4000, 6) chunked (4000, 4000, 1)\n",
"res_1A_LZF_C1 = [17.8, 18.0, 17.3, 17.1, 17.5, 17.4, 17.7, 17.4, 17.3, 17.7]\n",
"\n",
"#1A_GZIP_C1 (4000, 4000, 6) chunked (4000, 4000, 1)\n",
"res_1A_GZIP_C1 = [18.0, 17.8, 17.6, 17.3, 17.7, 17.4, 18.3, 17.5, 17.7, 17.9]\n",
"\n",
"#1B_LZF_C1 (6, 4000, 4000) chunked (1, 4000, 4000)\n",
"res_1B_LZF_C1 = [1.16, 0.89, 0.75, 0.72, 0.74, 0.79, 0.64, 0.94, 0.60, 0.76]\n",
"\n",
"#1B_GZIP_C1 (6, 4000, 4000) chunked (1, 4000, 4000)\n",
"res_1B_GZIP_C1 = [1.65, 2.01, 1.76, 0.82, 0.68, 1.12, 0.86, 1.17, 0.73, 1.25]\n",
"\n",
"\n",
"# Read 6 bands image 8 tiles\n",
"\n",
"#0_geotiff\n",
"res_0_geotiff = [43.5, 48.3, 48.5, 41.5, 33.8, 34.6, 36.7, 30.3, 36.2, 37.6]\n",
"\n",
"#1A_hdf5 (4000, 4000, 6)\n",
"res_1A_hdf5 = [6.51, 8.58, 7.78, 8.9, 7.99, 7.46, 3.93, 7.73, 9.46, 7.59]\n",
"\n",
"#1B_hdf5 (6, 4000, 4000)\n",
"res_1B_hdf5 = [8.66, 8.24, 5.98, 6.25, 7.32, 6.21, 7.31, 8.4, 7.19, 7.59]"
]
},
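{
"cell_type": "markdown",
"metadata": {},
"source": [
"The lists above are raw `%time` wall times. Below is a minimal summary sketch for the 6-band read (it assumes the cell above has been run, so the `res_*_6b` names exist)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Mean and best wall time per layout for the 6-band image read\n",
"results_6b = {\n",
"    '0_geotiff': res_0_geotiff_6b,\n",
"    '1A_hdf5': res_1A_hdf5_6b,\n",
"    '1B_hdf5': res_1B_hdf5_6b,\n",
"}\n",
"for name, times in sorted(results_6b.items()):\n",
"    print '{:10s} mean {:5.2f}s  min {:5.2f}s'.format(name, sum(times) / len(times), min(times))"
]
},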
{
"cell_type": "code",
"execution_count": 230,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"78\n",
"CPU times: user 2.42 s, sys: 83 ms, total: 2.5 s\n",
"Wall time: 2.68 s\n"
]
}
],
"source": [
"#timeseries is composed of 80 tiles\n",
"def get_timeseries(path):\n",
" paths = get_files(path)\n",
" f_idxs = get_rdm_list(len(paths)/4, 0, len(paths)-1)\n",
" band = randint(0, 5)\n",
" x = randint(0, 3999)\n",
" y = randint(0, 3999)\n",
" \n",
" for f_idx in f_idxs:\n",
" with h5py.File(paths[f_idx], 'r') as h5f:\n",
" value = h5f[\"NBAR\"][x, y, band]\n",
" \n",
"def get_gdal_timeseries(path):\n",
"\n",
" paths = get_files(path)\n",
" print len(paths)/4\n",
" f_idxs = get_rdm_list(len(paths)/4, 0, len(paths)-1)\n",
" band = randint(0, 5)\n",
" x = randint(0, 3999)\n",
" y = randint(0, 3999)\n",
" \n",
" for f_idx in f_idxs:\n",
" image = gdal.Open(paths[f_idx])\n",
" n = image.GetRasterBand(band+1).ReadAsArray(x, y, 1, 1)\n",
" \n",
" \n",
"%time get_gdal_timeseries('/g/data1/ep1_2/z00/prl900/hdf5_test/1A_GZIP_C1/')"
]
},
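{
"cell_type": "markdown",
"metadata": {},
"source": [
"Why chunk layout dominates these point reads: HDF5 decompresses whole chunks, so a single-pixel read from a compressed dataset still inflates the chunk containing that pixel. The cell below is a small inspection sketch (it assumes at least one tile exists in the folder)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Inspect the chunk shape and estimate bytes decompressed per point read\n",
"with h5py.File(get_files('/g/data1/ep1_2/z00/prl900/hdf5_test/1A_GZIP_C1/')[0], 'r') as h5f:\n",
"    dset = h5f[\"NBAR\"]\n",
"    print 'shape :', dset.shape\n",
"    print 'chunks:', dset.chunks\n",
"    if dset.chunks:\n",
"        n_elems = 1\n",
"        for c in dset.chunks:\n",
"            n_elems *= c\n",
"        print 'bytes decompressed per point read:', n_elems * dset.dtype.itemsize"
]
},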
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"\n",
"\n",
"## Create a timeseries of 80 images\n",
"\n",
"#0_geotiff\n",
"res_0_geotiff = [3.35, 2.07, 1.47, 1.82, 0.92, 1.0, 1.8, 1.12, 1.26, 0.82]\n",
"\n",
"#1A_hdf5 (4000, 4000, 6)\n",
"res_1A_hdf5 = [0.23, 0.22, 0.17, 0.28, 0.2, 0.38, 0.25, 0.25, 0.22, 0.26]\n",
"\n",
"#1B_hdf5 (6, 4000, 4000)\n",
"res_1B_hdf5 = [0.24, 0.2, 0.23, 0.22, 0.28, 0.13, 0.37, 0.2, 0.15, 0.16]\n",
"\n",
"#1A_LZF (4000, 4000, 6) 18GB\n",
"res_1A_LZF = [6.88, 5.75, 4.22, 4.56, 4.38, 3.53, 2.88, 3.11, 3.14, 3.04]\n",
"\n",
"#1A_GZIP (4000, 4000, 6) 13GB\n",
"res_1A_GZIP = [7.12, 6.1, 5.15, 4.47, 3.6, 4.17, 3.38, 3.19, 3.37, 3.09]\n",
"\n",
"#1B_LZF (6, 4000, 4000) 16GB\n",
"res_1B_LZF = [54.1, 52.2, 50.9, 41.6, 31.6, 34.7, 34.6, 38.3, 33.4, 32.1]\n",
"\n",
"#1B_GZIP (6, 4000, 4000) 11GB\n",
"res_1B_GZIP = [63.3, 71, 61, 53.4, 53.8, 50.1, 48.9, 49.2, 48.8, 50.5]\n",
"\n",
"#1A_LZF_C1 (4000, 4000, 6) chunked (4000, 4000, 1)\n",
"res_1A_LZF_C1 = [6.35, 5.23, 5.02, 3.66, 3.45, 3.68, 3.08, 3.2, 2.9, 2.86]\n",
"\n",
"#1A_GZIP_C1 (4000, 4000, 6) chunked (4000, 4000, 1)\n",
"res_1A_GZIP_C1 = [4.37, 4.12, 3.89, 3.45, 3.44, 3.12, 2.82, 2.79, 3.05, 2.68]\n",
"\n",
"#1B_LZF_C1 (6, 4000, 4000) chunked (1, 4000, 4000)\n",
"res_1B_LZF_C1 = [14.3, 15.5, 11.4, 10.2, 8.9, 10.6, 12.1, 9.56, 11.0, 6.33]\n",
"\n",
"#1B_GZIP_C1 (6, 4000, 4000) chunked (1, 4000, 4000)\n",
"res_1B_GZIP_C1 = [15.5, 13.9, 12.0, 16.0, 17.0, 11.7, 11.7, 15.2, 11.5, 10.8]"
]
},
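{
"cell_type": "markdown",
"metadata": {},
"source": [
"The same summary sketch for the timeseries runs (assumes the cell above has been run, so the `res_*_ts` names exist). Note how the ranking inverts relative to the whole-image reads: the band-first `1B_*` compressed layouts, fastest for image composition, are slowest here."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Mean and best wall time per layout for the ~80-point timeseries read\n",
"results_ts = {\n",
"    '0_geotiff': res_0_geotiff_ts,\n",
"    '1A_hdf5': res_1A_hdf5_ts,\n",
"    '1B_hdf5': res_1B_hdf5_ts,\n",
"    '1A_GZIP_C1': res_1A_GZIP_C1_ts,\n",
"    '1B_GZIP_C1': res_1B_GZIP_C1_ts,\n",
"}\n",
"for name, times in sorted(results_ts.items()):\n",
"    print '{:12s} mean {:6.2f}s  min {:6.2f}s'.format(name, sum(times) / len(times), min(times))"
]
},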
{
"cell_type": "code",
"execution_count": 106,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import os.path\n",
"\n",
"def get_files(path):\n",
" path = \"{}*.h5\".format(path)\n",
" return glob.glob(path)\n",
"\n",
"def get_filename(path):\n",
" return path.split('/')[-1]\n",
"\n",
"def repack(in_folder, out_folder):\n",
" exp_path = \"/g/data1/ep1_2/z00/prl900/hdf5_test/\"\n",
" in_paths = get_files(exp_path + in_folder + '/')\n",
" \n",
" for in_path in in_paths:\n",
" if not os.path.isfile(exp_path + out_folder + '/' + get_filename(in_path)):\n",
" with h5py.File(in_path, 'r') as h5i:\n",
" cube = h5i[\"NBAR\"].value\n",
" with h5py.File(exp_path + out_folder + '/' + get_filename(in_path), 'w') as h5o:\n",
" h5o.create_dataset(\"NBAR\", data=cube, compression='gzip', chunks=(4000, 4000, 1))\n",
" \n",
"repack('1A_hdf5', '1A_GZIP_C1')"
]
},
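{
"cell_type": "markdown",
"metadata": {},
"source": [
"`repack` above only produces the band-last `1A_GZIP_C1` layout. A variant for the band-first `1B_*_C1` layouts would transpose the cube before writing; `repack_1b` below is a hypothetical sketch of that step, not the original conversion script."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"# Hypothetical sketch: transpose to band-first (6, 4000, 4000) and chunk per band.\n",
"def repack_1b(in_folder, out_folder):\n",
"    exp_path = \"/g/data1/ep1_2/z00/prl900/hdf5_test/\"\n",
"    for in_path in get_files(exp_path + in_folder + '/'):\n",
"        out_path = exp_path + out_folder + '/' + get_filename(in_path)\n",
"        if not os.path.isfile(out_path):\n",
"            with h5py.File(in_path, 'r') as h5i:\n",
"                cube = np.transpose(h5i[\"NBAR\"][...], (2, 0, 1))\n",
"            with h5py.File(out_path, 'w') as h5o:\n",
"                h5o.create_dataset(\"NBAR\", data=cube, compression='gzip', chunks=(1, 4000, 4000))\n",
"\n",
"# repack_1b('1A_hdf5', '1B_GZIP_C1')"
]
},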
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 0
}