jreadey/nrel_ex.ipynb

## nrel_ex.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import h5py\n",
    "import h5pyd\n",
    "import numpy as np\n",
    "import random\n",
    "import s3fs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! hsls -H -v --bucket nrel-pds-hsds /nrel/nsrdb/conus/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! hsls --bucket nrel-pds-hsds /nrel/nsrdb/conus/nsrdb_conus_2020.h5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! hsls --bucket nrel-pds-hsds --dataset wind_speed -v /nrel/nsrdb/conus/nsrdb_conus_2020.h5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open the 2020 NSRDB file either through HSDS (h5pyd) or directly from S3 (h5py + s3fs).\n",
    "use_hsds = True\n",
    "if use_hsds:\n",
    "    f = h5pyd.File(\"/nrel/nsrdb/conus/nsrdb_conus_2020.h5\", bucket=\"nrel-pds-hsds\")\n",
    "else:\n",
    "    # anon=True sends unsigned requests so no AWS credentials are required\n",
    "    # (assumes the bucket allows public reads - drop it to use your own credentials)\n",
    "    s3 = s3fs.S3FileSystem(anon=True)\n",
    "    f = h5py.File(s3.open(\"s3://nrel-pds-nsrdb/conus/nsrdb_conus_pv_2020.h5\", \"rb\"))\n",
    "    \n",
    "f.id.id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dset = f[\"wind_speed\"]\n",
    "dset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for k in dset.attrs:\n",
    "    print(f\"{k} : {dset.attrs[k]}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.prod(dset.shape) * dset.dtype.itemsize  # ~ 58 GB's"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dset.chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# calculate the number of chunks in the dataset\n",
    "# ceiling division (-(-a // b)) counts the partial chunks at the edge of each\n",
    "# dimension; plain floor division would silently drop them\n",
    "(-(-dset.shape[0] // dset.chunks[0])) * (-(-dset.shape[1] // dset.chunks[1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read a random 2MB selection\n",
    "x = random.randint(0, dset.shape[0] - 1000)\n",
    "y = random.randint(0, dset.shape[1] - 1000)\n",
    "(x,y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read into array - hits 2-4 chunks\n",
    "%time arr = dset[x:x+1000, y:y+1000]\n",
    "arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr.min(), arr.max(), arr.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read one row at random\n",
    "# randrange excludes the upper bound, so index is always a valid row;\n",
    "# randint(0, n) may return n itself, which is one past the last row\n",
    "index = random.randrange(dset.shape[0])\n",
    "index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# nrows = dset.shape[1]\n",
    "# number of elements to read along the second dimension, capped at its extent\n",
    "nrows = min(500_000, dset.shape[1])  # speed things up a bit\n",
    "# number of chunks we'll need to read for the following selection...\n",
    "# (ceiling division: a partially covered last chunk still has to be read)\n",
    "-(-nrows // dset.chunks[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%time arr = dset[index,:nrows]\n",
    "arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr.min(), arr.max(), arr.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dset.id.dcpl_json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "chunk_table_id  = dset.id.dcpl_json[\"layout\"][\"chunk_table\"]\n",
    "chunk_table = f[f\"datasets/{chunk_table_id}\"]\n",
    "chunk_table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# one chunk of chunks\n",
    "chunk_table.chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ~3 MB to store chunk offsets + size for this dataset\n",
    "np.prod(chunk_table.shape) * chunk_table.dtype.itemsize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "chunk_table[:4, 0]  # 4 chunktable entries"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "py39",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import h5py\n",
	"import h5pyd\n",
	"import numpy as np\n",
	"import random\n",
	"import s3fs"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"! hsls -H -v --bucket nrel-pds-hsds /nrel/nsrdb/conus/"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"! hsls --bucket nrel-pds-hsds /nrel/nsrdb/conus/nsrdb_conus_2020.h5"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"! hsls --bucket nrel-pds-hsds --dataset wind_speed -v /nrel/nsrdb/conus/nsrdb_conus_2020.h5"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Open the 2020 NSRDB file either through HSDS (h5pyd) or directly from S3 (h5py + s3fs).\n",
	"use_hsds = True\n",
	"if use_hsds:\n",
	" f = h5pyd.File(\"/nrel/nsrdb/conus/nsrdb_conus_2020.h5\", bucket=\"nrel-pds-hsds\")\n",
	"else:\n",
	" # anon=True sends unsigned requests so no AWS credentials are required\n",
	" # (assumes the bucket allows public reads - drop it to use your own credentials)\n",
	" s3 = s3fs.S3FileSystem(anon=True)\n",
	" f = h5py.File(s3.open(\"s3://nrel-pds-nsrdb/conus/nsrdb_conus_pv_2020.h5\", \"rb\"))\n",
	" \n",
	"f.id.id"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"list(f)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"dset = f[\"wind_speed\"]\n",
	"dset"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"for k in dset.attrs:\n",
	" print(f\"{k} : {dset.attrs[k]}\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"np.prod(dset.shape) * dset.dtype.itemsize # ~ 58 GB's"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"dset.chunks"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# calculate the number of chunks in the dataset\n",
	"# ceiling division (-(-a // b)) counts the partial chunks at the edge of each\n",
	"# dimension; plain floor division would silently drop them\n",
	"(-(-dset.shape[0] // dset.chunks[0])) * (-(-dset.shape[1] // dset.chunks[1]))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# read a random 2MB selection\n",
	"x = random.randint(0, dset.shape[0] - 1000)\n",
	"y = random.randint(0, dset.shape[1] - 1000)\n",
	"(x,y)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# read into array - hits 2-4 chunks\n",
	"%time arr = dset[x:x+1000, y:y+1000]\n",
	"arr"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"arr.min(), arr.max(), arr.mean()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# read one row at random\n",
	"# randrange excludes the upper bound, so index is always a valid row;\n",
	"# randint(0, n) may return n itself, which is one past the last row\n",
	"index = random.randrange(dset.shape[0])\n",
	"index"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# nrows = dset.shape[1]\n",
	"# number of elements to read along the second dimension, capped at its extent\n",
	"nrows = min(500_000, dset.shape[1]) # speed things up a bit\n",
	"# number of chunks we'll need to read for the following selection...\n",
	"# (ceiling division: a partially covered last chunk still has to be read)\n",
	"-(-nrows // dset.chunks[1])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"%time arr = dset[index,:nrows]\n",
	"arr"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"arr.min(), arr.max(), arr.mean()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"dset.id.dcpl_json"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"chunk_table_id = dset.id.dcpl_json[\"layout\"][\"chunk_table\"]\n",
	"chunk_table = f[f\"datasets/{chunk_table_id}\"]\n",
	"chunk_table"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# one chunk of chunks\n",
	"chunk_table.chunks"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# ~3 MB to store chunk offsets + size for this dataset\n",
	"np.prod(chunk_table.shape) * chunk_table.dtype.itemsize"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"chunk_table[:4, 0] # 4 chunktable entries"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "py39",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.10.13"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}