Skip to content

Instantly share code, notes, and snippets.

@jbusecke
Created September 27, 2023 20:58
Show Gist options
  • Save jbusecke/2cd85c532682cf3dd7e0b693bdb66cad to your computer and use it in GitHub Desktop.
Save jbusecke/2cd85c532682cf3dd7e0b693bdb66cad to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "bb89df8d-3a0b-478c-a7fc-f846b673d819",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# !pip install pip install git+https://github.com/jbusecke/pangeo-forge-esgf.git@beam-refactor"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "929af61e-0d0b-4a18-a1c7-7953d1c35faf",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import gcsfs\n",
"import zarr\n",
"from typing import List\n",
"cache_bucket = \"leap-scratch/data-library/cache\"\n",
"target_bucket = \"leap-persistent-ro/data-library/cmip6-testing\"\n",
"\n",
"\n",
"def get_job_details(jobid: str):\n",
" pass\n",
"\n",
"def get_cached_files(iid: str) -> List[str]:\n",
" fs = gcsfs.GCSFileSystem()\n",
" glob_wildcard = '*'.join(['']+iid.lower().split('.')[2:]+[''])\n",
" # this takes quite long...\n",
" all_files = fs.glob(f\"{cache_bucket}/{glob_wildcard}\")\n",
" return [f\"gs://{f}\" for f in all_files]\n",
"\n",
"def get_written_store(jobid: str, iid: str) -> str:\n",
" subfolder = '-'.join(jobid.split('-')[:-1])\n",
" return f\"gs://{target_bucket}/{subfolder}/{iid}.zarr\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "48b89aa4-c348-4904-a67f-04a396ba0e18",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"iids = ['CMIP6.ScenarioMIP.CSIRO-ARCCSS.ACCESS-CM2.ssp585.r1i1p1f1.day.pr.gn.v20210317',\n",
"'CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp585.r4i1p1f1.day.sfcWind.gn.v20210318',\n",
"'CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp585.r6i1p1f1.day.sfcWind.gn.v20210318',\n",
"'CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp585.r4i1p1f1.day.pr.gn.v20210318',\n",
"'CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp585.r10i1p1f1.day.pr.gn.v20210318',\n",
"'CMIP6.CMIP.MRI.MRI-ESM2-0.historical.r2i1p1f1.day.pr.gn.v20190603',\n",
"'CMIP6.CMIP.MRI.MRI-ESM2-0.historical.r3i1p1f1.day.pr.gn.v20190603',\n",
"'CMIP6.CMIP.MRI.MRI-ESM2-0.historical.r4i1p1f1.day.pr.gn.v20190603',\n",
"'CMIP6.CMIP.MRI.MRI-ESM2-0.historical.r5i1p1f1.day.pr.gn.v20190603',\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e892d48f-a0b1-42b1-9e8f-268fd4fe8bfe",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"pangeo_forge_esgf.recipe_inputs - INFO - Checking responsiveness of search_nodes=['http://esgf-node.llnl.gov/esg-search/search', 'http://esgf-data.dkrz.de/esg-search/search', 'http://esgf-node.ipsl.upmc.fr/esg-search/search', 'http://esgf-index1.ceda.ac.uk/esg-search/search', 'http://esg-dn1.nsc.liu.se/esg-search/search', 'http://esgf.nci.org.au/esg-search/search']\n",
"pangeo_forge_esgf.recipe_inputs - INFO - responsive_search_nodes=['http://esgf-node.llnl.gov/esg-search/search', 'http://esgf-data.dkrz.de/esg-search/search', 'http://esgf-node.ipsl.upmc.fr/esg-search/search', 'http://esg-dn1.nsc.liu.se/esg-search/search', 'http://esgf.nci.org.au/esg-search/search']\n",
"pangeo_forge_esgf.recipe_inputs - INFO - Requesting urls\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 45/45 [00:10<00:00, 4.25it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"pangeo_forge_esgf.recipe_inputs - INFO - Processing responses\n",
"pangeo_forge_esgf.recipe_inputs - INFO - Processing responses: Expected files per iid\n",
"pangeo_forge_esgf.recipe_inputs - INFO - Processing responses: Check for missing iids\n",
"pangeo_forge_esgf.recipe_inputs - WARNING - Not able to find results for the following 4 iids: ['CMIP6.CMIP.MRI.MRI-ESM2-0.historical.r5i1p1f1.day.pr.gn.v20190603', 'CMIP6.CMIP.MRI.MRI-ESM2-0.historical.r3i1p1f1.day.pr.gn.v20190603', 'CMIP6.CMIP.MRI.MRI-ESM2-0.historical.r2i1p1f1.day.pr.gn.v20190603', 'CMIP6.CMIP.MRI.MRI-ESM2-0.historical.r4i1p1f1.day.pr.gn.v20190603']\n",
"pangeo_forge_esgf.recipe_inputs - INFO - Processing responses: Flatten results\n",
"pangeo_forge_esgf.recipe_inputs - INFO - Processing responses: Group results\n",
"pangeo_forge_esgf.recipe_inputs - INFO - Choosing one url per file\n",
"pangeo_forge_esgf.recipe_inputs - WARNING - This method seems to be unreliable for getting many urls. \n",
"If you are getting less datasets than you expect, try 'first' instead.\n",
"pangeo_forge_esgf.recipe_inputs - INFO - Find first responsive url for each file\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|██████████| 8/8 [00:00<00:00, 12.98it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"pangeo_forge_esgf.recipe_inputs - WARNING - Was not able to construct url list for (4/9) iids\n",
"pangeo_forge_esgf.recipe_inputs - INFO - Was not able to construct url list for the following iids:\n",
"pangeo_forge_esgf.recipe_inputs - INFO - {'CMIP6.CMIP.MRI.MRI-ESM2-0.historical.r5i1p1f1.day.pr.gn.v20190603', 'CMIP6.CMIP.MRI.MRI-ESM2-0.historical.r3i1p1f1.day.pr.gn.v20190603', 'CMIP6.CMIP.MRI.MRI-ESM2-0.historical.r2i1p1f1.day.pr.gn.v20190603', 'CMIP6.CMIP.MRI.MRI-ESM2-0.historical.r4i1p1f1.day.pr.gn.v20190603'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# query pangeo-forge-esgf for the urls\n",
"# TODO: I need to write the order of urls into the attrs\n",
"import logging\n",
"import pangeo_forge_esgf\n",
"from pangeo_forge_esgf import get_urls_from_esgf, setup_logging\n",
"\n",
"# setup_logging('DEBUG')\n",
"setup_logging('INFO')\n",
"\n",
"url_dict = await get_urls_from_esgf(iids, limit_per_host=50, max_concurrency=50, choose_url='first_responsive')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "8f290281-ba3a-45cf-b436-112170ec3439",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['CMIP6.ScenarioMIP.CSIRO-ARCCSS.ACCESS-CM2.ssp585.r1i1p1f1.day.pr.gn.v20210317',\n",
" 'CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp585.r4i1p1f1.day.sfcWind.gn.v20210318',\n",
" 'CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp585.r6i1p1f1.day.sfcWind.gn.v20210318',\n",
" 'CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp585.r4i1p1f1.day.pr.gn.v20210318',\n",
" 'CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp585.r10i1p1f1.day.pr.gn.v20210318']"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"iids_with_urls = list(url_dict.keys())\n",
"iids_with_urls"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "b23750b9-1266-4e9d-ae49-e6a10ab75146",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# from ../cmip6-leap-feedstock/catalog_utils import convert_cmip6_df_to_iid_df\n",
"# FIXME: Import this properly\n",
"import pandas as pd\n",
"def _maybe_join(iterable):\n",
" assert len(iterable) == 2 \n",
" dcpp_init_year = iterable[0]\n",
" member_id = iterable[1]\n",
" if not pd.isnull(dcpp_init_year):\n",
" return f\"{dcpp_init_year}-{member_id}\"\n",
" else:\n",
" return member_id\n",
"\n",
"def convert_cmip6_df_to_iid_df(df: pd.DataFrame) -> pd.DataFrame:\n",
" # now remove the ones already in the pangeo catalog\n",
"\n",
" df['variant_label'] = df[['dcpp_init_year', 'member_id']].agg(_maybe_join, axis=1)\n",
" df['version'] = 'v'+df['version'].astype(str)\n",
" df['instance_id'] = df[['activity_id', 'institution_id', 'source_id', 'experiment_id',\n",
" 'variant_label', 'table_id', 'variable_id', 'grid_label', 'version']].astype(str).agg('.'.join, axis=1).tolist()\n",
" df['instance_id'] = 'CMIP6.'+df['instance_id']\n",
" df['store'] = df['zstore']\n",
" # add current time as bigquery timestamp\n",
" df['timestamp'] = pd.Timestamp.now(tz='UTC')\n",
" df = df[['instance_id','store', 'timestamp']]\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "6c7f4e29-cbed-4ebd-9e49-9e2e3b76d8e1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['CMIP6.HighResMIP.CMCC.CMCC-CM2-HR4.highresSST-present.r1i1p1f1.Amon.ps.gn.v20170706',\n",
" 'CMIP6.HighResMIP.CMCC.CMCC-CM2-HR4.highresSST-present.r1i1p1f1.Amon.rsds.gn.v20170706',\n",
" 'CMIP6.HighResMIP.CMCC.CMCC-CM2-HR4.highresSST-present.r1i1p1f1.Amon.rlus.gn.v20170706',\n",
" 'CMIP6.HighResMIP.CMCC.CMCC-CM2-HR4.highresSST-present.r1i1p1f1.Amon.rlds.gn.v20170706',\n",
" 'CMIP6.HighResMIP.CMCC.CMCC-CM2-HR4.highresSST-present.r1i1p1f1.Amon.psl.gn.v20170706',\n",
" 'CMIP6.HighResMIP.CMCC.CMCC-CM2-HR4.highresSST-present.r1i1p1f1.Amon.hurs.gn.v20170706',\n",
" 'CMIP6.HighResMIP.CMCC.CMCC-CM2-HR4.highresSST-present.r1i1p1f1.Amon.huss.gn.v20170706',\n",
" 'CMIP6.HighResMIP.CMCC.CMCC-CM2-HR4.highresSST-present.r1i1p1f1.Amon.hus.gn.v20170706',\n",
" 'CMIP6.HighResMIP.CMCC.CMCC-CM2-HR4.highresSST-present.r1i1p1f1.Amon.hfss.gn.v20170706',\n",
" 'CMIP6.HighResMIP.CMCC.CMCC-CM2-HR4.highresSST-present.r1i1p1f1.Amon.rsus.gn.v20170706']"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import intake\n",
"# col = intake.open_esm_datastore(\n",
"# \"https://storage.googleapis.com/leap-persistent-ro/data-library/catalogs/cmip6-test/leap-pangeo-cmip6-noqc-test.json\"\n",
"# )\n",
"from xmip.utils import google_cmip_col\n",
"col = google_cmip_col()\n",
"\n",
"df = convert_cmip6_df_to_iid_df(col.df)\n",
"iids_in_catalog = df['instance_id'].tolist()\n",
"iids_in_catalog[:10]"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "e25a3deb-7f75-4708-b03c-732a05f2a76a",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"set()"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"set(iids_with_urls)-set(iids_in_catalog)"
]
},
{
"cell_type": "markdown",
"id": "f1dddfb3-fa97-4407-8dc2-4f62341c3b6e",
"metadata": {
"tags": []
},
"source": [
"So these are all in the old catalog!"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "51ee2a2f-a3a3-4a36-baf4-4fd0ff94e632",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment