Skip to content

Instantly share code, notes, and snippets.

@jreadey
Created July 10, 2018 17:03
Show Gist options
  • Save jreadey/b13d4109595fdc3cd93eb557e77124c8 to your computer and use it in GitHub Desktop.
Save jreadey/b13d4109595fdc3cd93eb557e77124c8 to your computer and use it in GitHub Desktop.
NCEP3 Loader
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Run the hsinfo command-line tool; presumably a connectivity check against\n",
"# the HSDS server before any data is loaded (TODO confirm).\n",
"!hsinfo"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Read the listing of the source bucket into a list of S3 URIs.\n",
"# (generated by: s3cmd ls s3://hdfgroup/data/ncep3_daily/)\n",
"s3_files = []\n",
"with open(\"/home/jovyan/NCEP3/s3_contents.txt\") as fp:\n",
"    for line in fp:\n",
"        fields = line.split()\n",
"        # The URI is taken from the fourth whitespace-separated field.\n",
"        # Skip blank lines and short rows (e.g. DIR entries) instead of\n",
"        # failing with an IndexError.\n",
"        if len(fields) < 4:\n",
"            continue\n",
"        s3_files.append(fields[3])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Number of source files found in the listing; each one is later copied\n",
"# into its own index along the first axis of the aggregated datasets.\n",
"num_files = len(s3_files)\n",
"print(f\"{num_files} S3 files will be aggregated\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sort the URIs so files are loaded in lexicographic order (presumably\n",
"# chronological, given the dated file names -- confirm), then display the\n",
"# count and the first entry. The slice keeps this safe for an empty list.\n",
"s3_files.sort()\n",
"len(s3_files), s3_files[:1]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create the initial domain by downloading `GSSTF_NCEP.3.1987.07.01.he5` and running the following:\n",
"\n",
"`$ hsload --nodata GSSTF_NCEP.3.1987.07.01.he5 /shared/NASA/NCEP3/ncep3.he5`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import h5pyd\n",
"\n",
"# NOTE(review): `f` was never defined in this notebook, so this cell failed\n",
"# on a fresh kernel. Presumably it was the domain created by the hsload\n",
"# command, opened with h5pyd -- confirm.\n",
"# Mode r+ is used because the domain must already exist and is modified below.\n",
"f = h5pyd.File(\"/shared/NASA/NCEP3/ncep3.he5\", \"r+\")\n",
"data_group = f[\"/HDFEOS/GRIDS/NCEP/Data Fields\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Show the names of the links currently in the data-fields group.\n",
"[name for name in data_group]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Remove every link in the group so the aggregated datasets can be created\n",
"# under the same names. Snapshot the names first: deleting from the group\n",
"# while iterating over it can skip entries or fail part-way through.\n",
"for link in list(data_group):\n",
"    del data_group[link]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Full paths of the datasets to aggregate, all under the same group.\n",
"data_fields = '/HDFEOS/GRIDS/NCEP/Data Fields'\n",
"dsets = [f'{data_fields}/{name}'\n",
"         for name in ('Psea_level', 'Qsat', 'SST', 'Tair_2m')]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Size the aggregated datasets from the listing rather than a hard-coded\n",
"# count (previously 7850), which goes stale whenever the bucket contents\n",
"# change and then no longer matches the number of files actually loaded.\n",
"num_files = len(s3_files)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create one float32 dataset per path, with one 720x1440 slab per source\n",
"# file. Each chunk holds exactly one slab.\n",
"grid_shape = (720, 1440)\n",
"for ds_path in dsets:\n",
"    ds = f.create_dataset(\n",
"        ds_path,\n",
"        shape=(num_files, *grid_shape),\n",
"        chunks=(1, *grid_shape),\n",
"        dtype='f4',\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Recursively list the target domain to check the newly created datasets.\n",
"!hsls -r /shared/NASA/NCEP3/ncep3.he5"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import subprocess\n",
"\n",
"import h5py\n",
"\n",
"# Copy every source file into slab [file_counter, :, :] of each aggregated\n",
"# dataset. `f` is the open target domain and `dsets` the dataset paths.\n",
"for file_counter, s3path in enumerate(s3_files):\n",
"    print(f\"Processing {s3path}\")\n",
"    # the downloaded filename will be the text after the last slash\n",
"    fname = s3path.rsplit('/', 1)[-1]\n",
"    try:\n",
"        # Download the file from S3 so it can be opened locally. An argument\n",
"        # list avoids shell interpolation of the URI, check=True stops the\n",
"        # run on a failed download, and --force overwrites a stale local copy.\n",
"        subprocess.run([\"s3cmd\", \"get\", \"--force\", s3path, fname], check=True)\n",
"        with h5py.File(fname, 'r') as he5f:\n",
"            for ds_path in dsets:\n",
"                print(f\"setting {ds_path}[{file_counter},:,:]\")\n",
"                f[ds_path][file_counter, :, :] = he5f[ds_path][...]\n",
"    finally:\n",
"        # Always remove the local copy, even when the download or copy failed.\n",
"        if os.path.exists(fname):\n",
"            os.remove(fname)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment