Skip to content

Instantly share code, notes, and snippets.

@d70-t
Last active May 17, 2022 15:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save d70-t/52bc0ecfa0d8bffec3c0da620b03891f to your computer and use it in GitHub Desktop.
Save d70-t/52bc0ecfa0d8bffec3c0da620b03891f to your computer and use it in GitHub Desktop.
renumber shards
import re
from typing import Dict, Optional, Sequence
import zarr.storage
class RenumberShardsStore(zarr.storage.Store):
    """Store wrapper that renames flat chunk keys into shard/sub-chunk keys.

    For every array prefix listed in ``shards``, a chunk key of the form
    ``<prefix>/<c0><sep><c1>...`` is stored in ``base`` under
    ``<prefix>/<shard index>/<sub-chunk index>``, where per dimension
    ``shard = c // shard_shape`` and ``sub-chunk = c % shard_shape``.
    Keys that are not chunk keys of a listed prefix are passed through
    unchanged.

    Example with ``shards={"a": (3, 2)}`` and the default separator ``"."``:
    the key ``a/5.1`` is stored in ``base`` as ``a/1.0/2.1``.

    Parameters
    ----------
    base:
        The wrapped store that receives the renumbered keys.
    shards:
        Mapping from array prefix to the number of chunks per shard along
        each dimension.
    dimension_separator:
        Separator between chunk indices inside a key; falls back to ``"."``
        when ``None`` or empty.
    """

    def __init__(
        self,
        base: zarr.storage.BaseStore,
        shards: Dict[str, Sequence[int]],
        dimension_separator: Optional[str] = None,
    ):
        self.base = base
        self.shards = shards
        self.dimension_separator = dimension_separator or "."
        # One entry per sharded array:
        #   (pattern for flat chunk keys, pattern for shard/sub-chunk keys,
        #    array prefix, shard shape)
        self.shard_matchers = [
            (
                re.compile(re.escape(k + "/") + self._chunk_re(len(v))),
                re.compile(
                    re.escape(k + "/")
                    + self._chunk_re(len(v))
                    + re.escape("/")
                    + self._chunk_re(len(v))
                ),
                k,
                v,
            )
            for k, v in self.shards.items()
        ]

    def _chunk_re(self, n):
        """Return a regex fragment matching ``n`` separator-joined integers.

        The separator is escaped so that it is matched literally; the
        default ``"."`` would otherwise act as a regex wildcard.
        """
        return re.escape(self.dimension_separator).join(["([0-9]+)"] * n)

    def _renumber_key(self, key):
        """Translate a flat chunk key into its shard/sub-chunk key in ``base``.

        Keys that are not exactly a chunk key of a configured array are
        returned unchanged.
        """
        for chunk_re, _, prefix, shard_shape in self.shard_matchers:
            # fullmatch: a key that merely starts with a chunk-like prefix
            # must not be rewritten (and must not lose its tail).
            if m := chunk_re.fullmatch(key):
                chunk_index = [int(s) for s in m.groups()]
                shard_index, subchunk_index = zip(
                    *((c // s, c % s) for c, s in zip(chunk_index, shard_shape))
                )
                sep = self.dimension_separator
                return (
                    prefix
                    + "/"
                    + sep.join(map(str, shard_index))
                    + "/"
                    + sep.join(map(str, subchunk_index))
                )
        return key

    def _inverse_renumber_key(self, key):
        """Translate a shard/sub-chunk key from ``base`` back to a flat chunk key.

        Keys that are not exactly a shard/sub-chunk key of a configured array
        are returned unchanged.
        """
        for _, shard_re, prefix, shard_shape in self.shard_matchers:
            if m := shard_re.fullmatch(key):
                indices = [int(s) for s in m.groups()]
                ndim = len(shard_shape)
                # The first half of the groups is the shard index, the
                # second half the sub-chunk index within that shard.
                shard_index, subchunk_index = indices[:ndim], indices[ndim:]
                chunk_index = [
                    s * ss + c
                    for ss, s, c in zip(shard_shape, shard_index, subchunk_index)
                ]
                return prefix + "/" + self.dimension_separator.join(map(str, chunk_index))
        return key

    def __delitem__(self, key):
        """Delete the entry stored in ``base`` under the renumbered key."""
        del self.base[self._renumber_key(key)]

    def __getitem__(self, key):
        """Return the value stored in ``base`` under the renumbered key."""
        return self.base[self._renumber_key(key)]

    def __setitem__(self, key, value):
        """Store ``value`` in ``base`` under the renumbered key."""
        self.base[self._renumber_key(key)] = value

    def __iter__(self):
        """Yield the keys of ``base`` translated back to flat chunk keys."""
        for k in self.base:
            yield self._inverse_renumber_key(k)

    def __len__(self):
        """Return the number of keys in ``base``."""
        return len(self.base)
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "sophisticated-muscle",
"metadata": {},
"outputs": [],
"source": [
"import zarr.storage\n",
"import renumberingstore"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "corrected-archive",
"metadata": {},
"outputs": [],
"source": [
"base_store = zarr.storage.MemoryStore()\n",
"renumber_store = renumberingstore.RenumberShardsStore(base_store, {\"a\": (3,2)})"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "adjacent-montreal",
"metadata": {},
"outputs": [],
"source": [
"import xarray as xr\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "bound-freeware",
"metadata": {},
"outputs": [],
"source": [
"ds = xr.Dataset({\n",
" \"a\": ((\"x\", \"y\"), np.random.random((30, 8))),\n",
"}).chunk({\"x\": 5, \"y\": 4})"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "outdoor-twelve",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<xarray.backends.zarr.ZarrStore at 0x12c355d60>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ds.to_zarr(renumber_store)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "novel-electronics",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['.zgroup',\n",
" '.zattrs',\n",
" 'a/.zarray',\n",
" 'a/.zattrs',\n",
" 'a/0.0',\n",
" 'a/1.0',\n",
" 'a/0.1',\n",
" 'a/2.0',\n",
" 'a/2.1',\n",
" 'a/1.1',\n",
" 'a/3.0',\n",
" 'a/3.1',\n",
" 'a/4.1',\n",
" 'a/5.0',\n",
" 'a/4.0',\n",
" 'a/5.1',\n",
" '.zmetadata']"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(renumber_store)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "choice-visit",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['.zgroup',\n",
" '.zattrs',\n",
" 'a/.zarray',\n",
" 'a/.zattrs',\n",
" 'a/0.0/0.0',\n",
" 'a/0.0/1.0',\n",
" 'a/0.0/0.1',\n",
" 'a/0.0/2.0',\n",
" 'a/0.0/2.1',\n",
" 'a/0.0/1.1',\n",
" 'a/1.0/0.0',\n",
" 'a/1.0/0.1',\n",
" 'a/1.0/1.1',\n",
" 'a/1.0/2.0',\n",
" 'a/1.0/1.0',\n",
" 'a/1.0/2.1',\n",
" '.zmetadata']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(base_store)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "pleasant-diagram",
"metadata": {},
"outputs": [],
"source": [
"import shardedstore\n",
"base_store = zarr.storage.MemoryStore()\n",
"shard1 = zarr.storage.MemoryStore()\n",
"shard2 = zarr.storage.MemoryStore()\n",
"sharded_store = shardedstore.ShardedStore(base_store,\n",
" {'a/0.0': shard1, 'a/1.0': shard2})\n",
"renumber_sharded_store = renumberingstore.RenumberShardsStore(sharded_store, {\"a\": (3,2)})"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "occupational-finish",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<xarray.backends.zarr.ZarrStore at 0x12c38a580>"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ds.to_zarr(renumber_sharded_store)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "acting-brake",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['.zgroup', '.zattrs', 'a/.zarray', 'a/.zattrs', '.zmetadata']"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(base_store)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "recorded-lodge",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['0.1', '1.0', '1.1', '0.0', '2.0', '2.1']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(shard1)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "rational-metro",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['0.0', '0.1', '1.1', '2.0', '1.0', '2.1']"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(shard2)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment