Created
December 13, 2023 02:56
-
-
Save jreadey/11790ac7329c4bb32be3cee3340d01fc to your computer and use it in GitHub Desktop.
access NREL NSRDB data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import h5py\n", | |
"import h5pyd\n", | |
"import numpy as np\n", | |
"import random\n", | |
"import s3fs" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"! hsls -H -v --bucket nrel-pds-hsds /nrel/nsrdb/conus/" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"! hsls --bucket nrel-pds-hsds /nrel/nsrdb/conus/nsrdb_conus_2020.h5" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"! hsls --bucket nrel-pds-hsds --dataset wind_speed -v /nrel/nsrdb/conus/nsrdb_conus_2020.h5" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"use_hsds = True\n", | |
"if use_hsds:\n", | |
" f = h5pyd.File(\"/nrel/nsrdb/conus/nsrdb_conus_2020.h5\", bucket=\"nrel-pds-hsds\")\n", | |
"else:\n", | |
" s3 = s3fs.S3FileSystem()\n", | |
" f = h5py.File(s3.open(\"s3://nrel-pds-nsrdb/conus/nsrdb_conus_pv_2020.h5\", \"rb\"))\n", | |
" \n", | |
"f.id.id" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"list(f)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"dset = f[\"wind_speed\"]\n", | |
"dset" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"for k in dset.attrs:\n", | |
" print(f\"{k} : {dset.attrs[k]}\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# logical size of the dataset in bytes (element count x itemsize), ~58 GB\n", | |
"# force a 64-bit accumulator so the element count cannot wrap on platforms\n", | |
"# where numpy's default integer is 32-bit\n", | |
"np.prod(dset.shape, dtype=np.int64) * dset.dtype.itemsize" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"dset.chunks" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# calculate the number of chunks in the dataset\n", | |
"(dset.shape[0] // dset.chunks[0]) * (dset.shape[1] // dset.chunks[1])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# read a random 2MB selection\n", | |
"x = random.randint(0, dset.shape[0] - 1000)\n", | |
"y = random.randint(0, dset.shape[1] - 1000)\n", | |
"(x,y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# read into array - hits 2-4 chunks\n", | |
"%time arr = dset[x:x+1000, y:y+1000]\n", | |
"arr" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"arr.min(), arr.max(), arr.mean()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# read one row at random\n", | |
"index = random.randint(0, dset.shape[0])\n", | |
"index" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# nrows = dset.shape[1]\n", | |
"nrows = 500_000 # speed things up a bit\n", | |
"# number of chunks we'll need to read for the following selection...\n", | |
"nrows // dset.chunks[1]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%time arr = dset[index,:nrows]\n", | |
"arr" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"arr.min(), arr.max(), arr.mean()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"dset.id.dcpl_json" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"chunk_table_id = dset.id.dcpl_json[\"layout\"][\"chunk_table\"]\n", | |
"chunk_table = f[f\"datasets/{chunk_table_id}\"]\n", | |
"chunk_table" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# one chunk of chunks\n", | |
"chunk_table.chunks" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# ~3 MB to store chunk offsets + size for this dataset\n", | |
"np.prod(chunk_table.shape) * chunk_table.dtype.itemsize" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"chunk_table[:4, 0] # 4 chunktable entries" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "py39", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.13" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment