Skip to content

Instantly share code, notes, and snippets.

@jhamman
Created May 22, 2024 22:44
Show Gist options
  • Save jhamman/8381dd971d928bf220405057107562b1 to your computer and use it in GitHub Desktop.
Save jhamman/8381dd971d928bf220405057107562b1 to your computer and use it in GitHub Desktop.
Pangeo Showcase: Zarr-Python 3 demo
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "e4df2774-cbd3-4467-a9bd-c110cb3231a9",
"metadata": {},
"source": [
"# Pangeo Showcase: Zarr-Python 3 demo\n",
"\n",
"<img src=\"https://raw.githubusercontent.com/zarr-developers/zarr-logo/main/zarr-pink-stacked.svg\" alt=\"drawing\" width=\"250\"/>\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "da1ee91b-34dd-491c-a4fe-cd0932115ffc",
"metadata": {},
"outputs": [],
"source": [
"import zarr\n",
"\n",
"import asyncio\n",
"import numpy as np\n",
"import tempfile\n",
"from directory_tree import display_tree\n",
"from pprint import pprint"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "2de2eabb-12c0-49da-bd0f-759e4fba7f54",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'2.16.2.dev94+gfec7adf7.d20240521'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# note: some of what you see here is only available in a feature branch\n",
"# branch: jhamman:feature/top-level-api\n",
"# pr: zarr-python#1884\n",
"zarr.__version__"
]
},
{
"cell_type": "markdown",
"id": "8adf0a8e-a13b-4a63-8b40-0fc99437b763",
"metadata": {},
"source": [
"## The basics\n",
"\n",
"Most things will feel the same.\n",
"\n",
"- as much as possible, we're keeping the top level API the same\n",
"- expect some deprecations to parts of the v2 API\n",
"- some folks will notice changes to the Store API."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "683bbe24-f19e-487b-9c1f-3d9bd1103bd4",
"metadata": {},
"outputs": [],
"source": [
"# create a store\n",
"store = zarr.store.LocalStore(root=tempfile.mkdtemp())\n",
"\n",
"# create a root group\n",
"root = zarr.group(\n",
" store=store, # new in 3.0: transition to keyword only constructors (wip)\n",
" zarr_format=3, # new in 3.0: support for v2 and v3 specs\n",
" attributes={\"name\": \"demo root group\"} # new in 3.0: pass attributes at group creation time\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "88fd37b2-bf94-4373-a3fe-dea7975aee34",
"metadata": {},
"outputs": [],
"source": [
"# create an array\n",
"arr = root.create_array(\n",
" name=\"foo\",\n",
" shape=(10, 10),\n",
" chunks=(5, 5),\n",
" dtype='i4',\n",
" dimension_names=('x', 'y'), # new in 3.0: support array dimension names (v3 only)\n",
" attributes={'units': 'foo'} # new in 3.0: pass attributes at array creation time\n",
")\n",
"\n",
"# update an attribute\n",
"arr.attrs['name'] = 'foo-array'\n",
"\n",
"# and write data to it\n",
"arr[:] = np.random.randint(0, 10, size=arr.shape)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "166fdbce-8048-4e87-8d31-211552ba43bc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tmpgnnpfjwh/\n",
"├── foo/\n",
"│ ├── c/\n",
"│ │ ├── 0/\n",
"│ │ │ ├── 0\n",
"│ │ │ └── 1\n",
"│ │ └── 1/\n",
"│ │ ├── 0\n",
"│ │ └── 1\n",
"│ └── zarr.json\n",
"└── zarr.json\n"
]
}
],
"source": [
"# v3 spec store layout / metadata keys / chunk key encoding\n",
"display_tree(store.root)"
]
},
{
"cell_type": "markdown",
"id": "62ba08f3-31a5-44c7-a950-8c3ff40ac6b9",
"metadata": {},
"source": [
"## Some new things"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "28a5fa27-e8b2-48b2-a627-a6de70e27684",
"metadata": {},
"outputs": [],
"source": [
"# create a sharded array\n",
"from zarr.codecs import ShardingCodec, TransposeCodec, BytesCodec, BloscCodec\n",
"\n",
"shard_arr = root.create_array(\n",
" name=\"bar\",\n",
" shape=(100, 100),\n",
" chunks=(50, 50),\n",
" dtype='i4',\n",
" dimension_names=('x', 'y'), # new in 3.0: support array dimension names (v3 only)\n",
" attributes={'units': 'bar'}, # new in 3.0: pass attributes at array creation time\n",
" codecs=[ # new in 3.0: codec pipelines instead of filter/compressor (v3 only)\n",
" ShardingCodec( # new in 3.0: sharding codec\n",
" chunk_shape=(5, 5),\n",
" codecs=[\n",
" TransposeCodec(order=(0, 1)),\n",
" BytesCodec(),\n",
" BloscCodec(cname=\"lz4\"),\n",
" ],\n",
" index_location=\"start\",\n",
" )\n",
" ],\n",
")\n",
"\n",
"# and write data to it\n",
"shard_arr[:] = np.random.randint(0, 10, size=shard_arr.shape)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "5cd7b611-ac46-45a4-ba3f-503bb1d5c289",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tmpgnnpfjwh/\n",
"├── bar/\n",
"│ ├── c/\n",
"│ │ ├── 0/\n",
"│ │ │ ├── 0\n",
"│ │ │ └── 1\n",
"│ │ └── 1/\n",
"│ │ ├── 0\n",
"│ │ └── 1\n",
"│ └── zarr.json\n",
"├── foo/\n",
"│ ├── c/\n",
"│ │ ├── 0/\n",
"│ │ │ ├── 0\n",
"│ │ │ └── 1\n",
"│ │ └── 1/\n",
"│ │ ├── 0\n",
"│ │ └── 1\n",
"│ └── zarr.json\n",
"└── zarr.json\n"
]
}
],
"source": [
"# bar/c only has 4 objects despite there being 400 chunks!\n",
"display_tree(store.root)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "f1016065-3669-462e-a672-1d6965644555",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"b'{\"shape\": [10, 10], \"fill_value\": 0, \"chunk_grid\": {\"name\": \"regular\", \"configuration\": {\"chunk_shape\": [5, 5]}}, \"attributes\": {\"units\": \"foo\", \"name\": \"foo-array\"}, \"zarr_format\": 3, \"data_type\": \"int32\", \"chunk_key_encoding\": {\"name\": \"default\", \"configuration\": {\"separator\": \"/\"}}, \"codecs\": [{\"name\": \"bytes\", \"configuration\": {\"endian\": \"little\"}}], \"dimension_names\": [\"x\", \"y\"], \"node_type\": \"array\"}'"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# the store interface is entirely async... `await` everything\n",
"# get a metadata object out of the store\n",
"(await store.get(\"foo/zarr.json\")).to_bytes()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "4c7446e6-fd15-4542-a3bf-c4f47701efeb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GroupMetadata(attributes={'name': 'demo root group'},\n",
" zarr_format=3,\n",
" node_type='group')\n",
"ArrayV3Metadata(shape=(10, 10),\n",
" fill_value=0,\n",
" chunk_grid=RegularChunkGrid(chunk_shape=(5, 5)),\n",
" attributes={'units': 'foo'},\n",
" zarr_format=3,\n",
" data_type=dtype('int32'),\n",
" chunk_key_encoding=DefaultChunkKeyEncoding(name='default',\n",
" separator='/'),\n",
" codecs=BatchedCodecPipeline(array_array_codecs=(),\n",
" array_bytes_codec=BytesCodec(endian=<Endian.little: 'little'>),\n",
" bytes_bytes_codecs=(),\n",
" batch_size=1),\n",
" dimension_names=('x', 'y'),\n",
" node_type='array')\n"
]
}
],
"source": [
"# new metadata DataClasses / property\n",
"pprint(root.metadata)\n",
"pprint(arr.metadata)\n",
"# pprint(shard_arr.metadata)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "2c3ef4c3-7eca-45a9-bad0-89ba1c945b5a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\u001b[0;31mSignature:\u001b[0m\n",
"\u001b[0mzarr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mArray\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mstore\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'StoreLike'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mshape\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'ChunkCoords'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'npt.DTypeLike'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mzarr_format\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'ZarrFormat'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mfill_value\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Any | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mattributes\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'dict[str, JSON] | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mchunk_shape\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'ChunkCoords | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mchunk_key_encoding\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m\"ChunkKeyEncoding | tuple[Literal['default'], Literal['.', '/']] | tuple[Literal['v2'], Literal['.', '/']] | None\"\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mcodecs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Iterable[Codec | dict[str, JSON]] | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mdimension_names\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Iterable[str] | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'ChunkCoords | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mdimension_separator\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m\"Literal['.', '/'] | None\"\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m\"Literal['C', 'F'] | None\"\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mfilters\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'list[dict[str, JSON]] | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mcompressor\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'dict[str, JSON] | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mexists_ok\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'bool'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;34m'Array'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mDocstring:\u001b[0m <no docstring>\n",
"\u001b[0;31mFile:\u001b[0m ~/Library/CloudStorage/Dropbox/src/zarr-python-2/src/zarr/array.py\n",
"\u001b[0;31mType:\u001b[0m method"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# 100% type hint coverage\n",
"zarr.Array.create?"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "9693d955-29bd-4840-bb73-c367e194e5cc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'array': {'order': 'C'},\n",
" 'async': {'concurrency': None, 'timeout': None},\n",
" 'codec_pipeline': {'batch_size': 4}}\n"
]
}
],
"source": [
"# new global config (uses donfig)\n",
"with zarr.config.set({'codec_pipeline.batch_size': 4}):\n",
" zarr.config.pprint()"
]
},
{
"cell_type": "markdown",
"id": "ec27ba7c-988e-438a-944e-e8a989d820dc",
"metadata": {},
"source": [
"## New Async API\n",
"\n",
"Zarr-Python 3 will include an AsyncIO interface. Why? We want to take advantage of concurrency everywhere we can."
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "5603868c-37e6-4ec5-b788-dbb195efdb30",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AsyncGroup file:///var/folders/6_/834w1yzj49d6t6dz8lpx86fm0000gn/T/tmp3k55yj2w>"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# create a store\n",
"store = zarr.store.LocalStore(root=tempfile.mkdtemp())\n",
"\n",
"# create a root group\n",
"root = await zarr.api.asynchronous.group(\n",
" store=store, # new in 3.0: transition to keyword only constructors (wip)\n",
" zarr_format=3, # new in 3.0: support for v2 and v3 specs\n",
" attributes={\"name\": \"demo root group\"} # new in 3.0: pass attributes at group creation time\n",
")\n",
"root"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "ff255661-f03c-483c-8efe-1c536f5a7ba8",
"metadata": {},
"outputs": [],
"source": [
"# create 5 arrays concurrently\n",
"awaitables = []\n",
"for name in [\"foo\", \"bar\", \"spam\", \"baz\", \"qux\"]:\n",
" awaitables.append(\n",
" root.create_array(\n",
" path=name,\n",
" shape=(10, 10),\n",
" chunks=(5, 5),\n",
" dtype='i4',\n",
" attributes={'title': f'{name} demo'}\n",
" )\n",
" )\n",
"arrays = await asyncio.gather(*awaitables)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "8facc0fa-a980-4698-a614-fb18c16de7e9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['spam', 'qux', 'foo', 'baz', 'bar']"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"keys = [k async for k in root.array_keys()]\n",
"keys"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "815b19ab-a6e3-4b80-86ee-7e6e4434b092",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<AsyncArray file:///var/folders/6_/834w1yzj49d6t6dz8lpx86fm0000gn/T/tmp3k55yj2w/spam shape=(10, 10) dtype=int32>,\n",
" <AsyncArray file:///var/folders/6_/834w1yzj49d6t6dz8lpx86fm0000gn/T/tmp3k55yj2w/qux shape=(10, 10) dtype=int32>,\n",
" <AsyncArray file:///var/folders/6_/834w1yzj49d6t6dz8lpx86fm0000gn/T/tmp3k55yj2w/foo shape=(10, 10) dtype=int32>,\n",
" <AsyncArray file:///var/folders/6_/834w1yzj49d6t6dz8lpx86fm0000gn/T/tmp3k55yj2w/baz shape=(10, 10) dtype=int32>,\n",
" <AsyncArray file:///var/folders/6_/834w1yzj49d6t6dz8lpx86fm0000gn/T/tmp3k55yj2w/bar shape=(10, 10) dtype=int32>]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# now load all these arrays concurrently\n",
"await asyncio.gather(*[root.getitem(k) for k in keys])"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "32c8c0ac-5be9-4dfd-b5ec-963857579bc2",
"metadata": {},
"outputs": [],
"source": [
"# we can read/write data using the asyncio interface too\n",
"arr = arrays[0]\n",
"await arr.setitem(slice(None), np.random.randint(0, 10, size=arr.shape))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "ae4a2ace-2ad3-415a-b00f-75b2b05edd2e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[4, 0, 6],\n",
" [3, 2, 2],\n",
" [8, 1, 0],\n",
" [2, 4, 4],\n",
" [0, 5, 5]], dtype=int32)"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# rather than the __getitem__ syntax, we use the getitem method\n",
"await arr.getitem((slice(5), slice(3)))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "276e8ccf-4e7b-4acb-86de-546639ea0871",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tmp3k55yj2w/\n",
"├── bar/\n",
"│ └── zarr.json\n",
"├── baz/\n",
"│ └── zarr.json\n",
"├── foo/\n",
"│ ├── c/\n",
"│ │ ├── 0/\n",
"│ │ │ ├── 0\n",
"│ │ │ └── 1\n",
"│ │ └── 1/\n",
"│ │ ├── 0\n",
"│ │ └── 1\n",
"│ └── zarr.json\n",
"├── qux/\n",
"│ └── zarr.json\n",
"├── spam/\n",
"│ └── zarr.json\n",
"└── zarr.json\n"
]
}
],
"source": [
"display_tree(store.root)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "84a13a04-3694-4685-971b-a51f0c7cfb68",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment