Skip to content

Instantly share code, notes, and snippets.

@jreadey
Created December 13, 2023 02:56
Show Gist options
  • Save jreadey/11790ac7329c4bb32be3cee3340d01fc to your computer and use it in GitHub Desktop.
Save jreadey/11790ac7329c4bb32be3cee3340d01fc to your computer and use it in GitHub Desktop.
access NREL NSRDB data
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import h5py\n",
"import h5pyd\n",
"import numpy as np\n",
"import random\n",
"import s3fs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! hsls -H -v --bucket nrel-pds-hsds /nrel/nsrdb/conus/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! hsls --bucket nrel-pds-hsds /nrel/nsrdb/conus/nsrdb_conus_2020.h5"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! hsls --bucket nrel-pds-hsds --dataset wind_speed -v /nrel/nsrdb/conus/nsrdb_conus_2020.h5"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"use_hsds = True\n",
"if not use_hsds:\n",
"    # direct path: wrap the S3 object in a file-like handle and pass it to h5py\n",
"    s3 = s3fs.S3FileSystem()\n",
"    f = h5py.File(s3.open(\"s3://nrel-pds-nsrdb/conus/nsrdb_conus_pv_2020.h5\", \"rb\"))\n",
"else:\n",
"    # HSDS path: open the domain through h5pyd, reading from the named bucket\n",
"    f = h5pyd.File(\"/nrel/nsrdb/conus/nsrdb_conus_2020.h5\", bucket=\"nrel-pds-hsds\")\n",
"\n",
"# show the id of the opened file object\n",
"f.id.id"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"list(f)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dset = f[\"wind_speed\"]\n",
"dset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# print every attribute attached to the dataset as \"name : value\"\n",
"for k in dset.attrs:\n",
"    attr_value = dset.attrs[k]\n",
"    print(f\"{k} : {attr_value}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# logical size of the dataset in bytes (~58 GB): element count x item size.\n",
"# Accumulate in int64 so the product cannot wrap on platforms whose default\n",
"# NumPy integer is 32-bit.\n",
"np.prod(dset.shape, dtype=np.int64) * dset.dtype.itemsize"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dset.chunks"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# calculate the number of chunks in the dataset\n",
"# round up along each axis so a partial chunk at the edge is still counted\n",
"chunks_along_0 = (dset.shape[0] + dset.chunks[0] - 1) // dset.chunks[0]\n",
"chunks_along_1 = (dset.shape[1] + dset.chunks[1] - 1) // dset.chunks[1]\n",
"chunks_along_0 * chunks_along_1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# read a random 2MB selection\n",
"# draw the corner so that a 1000 x 1000 window starting there stays inside the dataset\n",
"x, y = (random.randint(0, extent - 1000) for extent in dset.shape[:2])\n",
"(x, y)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# read into array - hits 2-4 chunks\n",
"%time arr = dset[x:x+1000, y:y+1000]\n",
"arr"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"arr.min(), arr.max(), arr.mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# read one row at random\n",
"# randrange excludes its upper bound, so the index is always a valid row;\n",
"# randint(0, dset.shape[0]) could return dset.shape[0] itself, which is out of range\n",
"index = random.randrange(dset.shape[0])\n",
"index"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# nrows is the number of elements taken along axis 1 of the dataset;\n",
"# set nrows = dset.shape[1] to read the full extent instead\n",
"nrows = 500_000 # speed things up a bit\n",
"# number of chunks we'll need to read for the following selection...\n",
"# clamp to the dataset extent and round up so a partially covered chunk is counted\n",
"(min(nrows, dset.shape[1]) + dset.chunks[1] - 1) // dset.chunks[1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%time arr = dset[index,:nrows]\n",
"arr"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"arr.min(), arr.max(), arr.mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dset.id.dcpl_json"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# the layout entry of the dataset's dcpl_json carries the id of its chunk table\n",
"layout = dset.id.dcpl_json[\"layout\"]\n",
"chunk_table_id = layout[\"chunk_table\"]\n",
"# open the chunk table through the file handle using that id\n",
"chunk_table = f[f\"datasets/{chunk_table_id}\"]\n",
"chunk_table"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# one chunk of chunks\n",
"chunk_table.chunks"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ~3 MB to store chunk offsets + size for this dataset\n",
"# (number of table entries x bytes per entry)\n",
"entry_nbytes = chunk_table.dtype.itemsize\n",
"np.prod(chunk_table.shape) * entry_nbytes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"chunk_table[:4, 0] # 4 chunktable entries"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "py39",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment