Skip to content

Instantly share code, notes, and snippets.

@scottyhq
Created February 20, 2019 20:15
Show Gist options
  • Save scottyhq/790bf19c7811b5c6243ce37aae252ca1 to your computer and use it in GitHub Desktop.
Save scottyhq/790bf19c7811b5c6243ce37aae252ca1 to your computer and use it in GitHub Desktop.
experimenting with remote HDF5 files in xarray
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load HDF5 data w/ xarray\n",
"\n",
"A lot of NASA data is stored in HDF5 format. One example is data from the Sentinel-1 radar mission, which should resemble the data from the upcoming NISAR mission.\n",
"\n",
"https://aria.jpl.nasa.gov/node/97\n",
"\n",
"This notebook illustrates loading the data into xarray for analysis."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import xarray as xr\n",
"import h5netcdf\n",
"import rasterio\n",
"import h5py\n",
"import gcsfs\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.11.1+64.g612d390f.dirty\n",
"0.6.2\n",
"1.0.18\n",
"2.9.0\n",
"0.2.0\n"
]
}
],
"source": [
"# Report the version of each library used below, one per line\n",
"print(*(lib.__version__ for lib in (xr, h5netcdf, rasterio, h5py, gcsfs)), sep='\\n')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Data were copied from ASF to a Google Storage bucket.\n",
"# Name the bucket directory and the object path once, so the listing and the\n",
"# open() call cannot drift apart through a mistyped copy of the long literal.\n",
"fs = gcsfs.GCSFileSystem()\n",
"gsDir = 'pangeo-data/grfn-v2/137/'\n",
"images = fs.ls(gsDir)\n",
"gsPath = gsDir + 'S1-GUNW-A-R-137-tops-20181129_20181123-020010-43220N_41518N-PP-e2c7-v2_0_0.nc'\n",
"# File-like handle on the remote object; passed to h5py and xarray in later cells\n",
"fileObj = fs.open(gsPath)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Option 1) Copy file locally and read w/ xarray\n",
"gsPath = 'pangeo-data/grfn-v2/137/S1-GUNW-A-R-137-tops-20181129_20181123-020010-43220N_41518N-PP-e2c7-v2_0_0.nc'\n",
"localPath = os.path.basename(gsPath)\n",
"#fs.get(gsPath, localPath)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Data arrays are stored in the \"/science/grids/data\" group\n",
"#da = xr.open_dataset(localPath, group='/science/grids/data', engine='h5netcdf')\n",
"#da "
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# h5py >= 2.9.0 seems to accept Python file-like objects, so the gcsfs handle is passed directly:\n",
"# https://github.com/h5py/h5py/pull/1105\n",
"h5 = h5py.File(fileObj, mode='r')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<KeysViewHDF5 ['product_type', 'Conventions', 'title', 'version', 'author', 'institution', 'source', 'references', 'ogr_geometry_field', 'ogr_layer_name', 'ogr_layer_type']>\n",
"ItemsViewHDF5(<HDF5 group \"/science/grids/data\" (7 members)>)\n",
"(682, 1386)\n"
]
}
],
"source": [
"# Works but slow from home wifi (likely due to the number of network requests required)\n",
"# http://matthewrocklin.com/blog/work/2018/02/06/hdf-in-the-cloud\n",
"print(h5.attrs.keys())\n",
"# Keep the raw h5py group under its own name; `ds` is used for the xarray Dataset opened later\n",
"grp = h5['science/grids/data']\n",
"print(grp.items())\n",
"# HDF5 chunk shape of the coherence array\n",
"print(grp['coherence'].chunks)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<xarray.Dataset>\n",
"Dimensions: (latitude: 2045, longitude: 4158)\n",
"Coordinates:\n",
" * longitude (longitude) float64 -123.1 -123.1 ... -119.6 -119.6\n",
" * latitude (latitude) float64 43.22 43.22 43.22 ... 41.52 41.52\n",
"Data variables:\n",
" crs int32 ...\n",
" unwrappedPhase (latitude, longitude) float32 ...\n",
" coherence (latitude, longitude) float32 ...\n",
" connectedComponents (latitude, longitude) float32 ...\n",
" amplitude (latitude, longitude) float32 ..."
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Can xarray open the same file-like object directly? Yes, with modifications to xarray and h5netcdf\n",
"ds = xr.open_dataset(\n",
"    fileObj,\n",
"    engine='h5netcdf',\n",
"    group='/science/grids/data',\n",
")\n",
"ds"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<xarray.Dataset>\n",
"Dimensions: (latitude: 2045, longitude: 4158)\n",
"Coordinates:\n",
" * longitude (longitude) float64 -123.1 -123.1 ... -119.6 -119.6\n",
" * latitude (latitude) float64 43.22 43.22 43.22 ... 41.52 41.52\n",
"Data variables:\n",
" crs int32 ...\n",
" unwrappedPhase (latitude, longitude) float32 dask.array<shape=(2045, 4158), chunksize=(682, 1386)>\n",
" coherence (latitude, longitude) float32 dask.array<shape=(2045, 4158), chunksize=(682, 1386)>\n",
" connectedComponents (latitude, longitude) float32 dask.array<shape=(2045, 4158), chunksize=(682, 1386)>\n",
" amplitude (latitude, longitude) float32 dask.array<shape=(2045, 4158), chunksize=(682, 1386)>"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Re-open as dask arrays; the chunk sizes match the HDF5 chunk shape (682, 1386) printed above\n",
"ds = xr.open_dataset(\n",
"    fileObj,\n",
"    group='/science/grids/data',\n",
"    engine='h5netcdf',\n",
"    chunks={'latitude': 682, 'longitude': 1386},\n",
")\n",
"ds"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.image.AxesImage at 0x111a44390>"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Trailing ';' keeps the AxesImage repr out of the cell output\n",
"ds['coherence'].plot.imshow();"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment