Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save NikosAlexandris/2ea12e74cc72f7eb1a326e8b9b903aef to your computer and use it in GitHub Desktop.
Save NikosAlexandris/2ea12e74cc72f7eb1a326e8b9b903aef to your computer and use it in GitHub Desktop.
Checking chunk sizes [DRAFT]
@app.command(
    'check-chunks',
    no_args_is_help=True,
    help='Check for chunk size consistency along series of files in a format supported by Xarray',
)
def check_chunk_consistency(
    source_directory: Annotated[Path, typer_argument_source_directory],
    pattern: Annotated[str, typer_option_filename_pattern] = "*.json",
    verbose: Annotated[int, typer.Option(..., "--verbose", "-v", count=True, help="Increase verbosity level.")] = 0,
):
    """Check that every matching file uses the same per-variable chunk sizes.

    The files matching `pattern` inside `source_directory` are opened one by
    one with Xarray's ``netcdf4`` engine. The ``chunksizes`` encoding of the
    first file (in sorted path order) is the reference; every chunked
    variable of each later file is compared against it.

    Parameters
    ----------
    source_directory:
        Directory that is searched for input files.
    pattern:
        Glob pattern used to select files inside `source_directory`.
    verbose:
        Verbosity counter; accepted for the command line interface but not
        consulted inside this function.

    Raises
    ------
    ValueError
        If a chunked variable in a later file has chunk sizes that differ
        from the reference, or is not chunked in the reference at all.
    """
    # NOTE(review): the default pattern selects '*.json' while the files are
    # opened with the netcdf4 engine -- confirm the intended default.
    source_directory = Path(source_directory)
    # Sort so that the reference (first) file is deterministic; the order in
    # which glob() yields paths is not guaranteed.
    files = sorted(str(path) for path in source_directory.glob(pattern))
    if not files:
        # Do not claim consistency when there was nothing to check.
        logger.warning(f"No files matching '{pattern}' found in '{source_directory}'")
        return

    reference_chunk_sizes = None  # chunk sizes of the first file
    for file in files:
        with xr.open_dataset(file, engine="netcdf4") as dataset:
            # Collect only variables that actually carry chunk metadata.
            current_chunk_sizes = {
                variable: dataset[variable].encoding["chunksizes"]
                for variable in dataset.variables
                if dataset[variable].encoding.get("chunksizes")
            }

        if reference_chunk_sizes is None:
            # Pin the reference to the first file, even if it has no chunked
            # variables, so it cannot silently drift to a later file.
            reference_chunk_sizes = current_chunk_sizes
            logger.debug(f'File : {file}, Chunks : {reference_chunk_sizes}')
            continue

        # For subsequent files, check if chunk sizes match the initial ones
        for variable, chunks in current_chunk_sizes.items():
            # .get() keeps a variable that is missing from the reference on
            # the ValueError path instead of failing with a KeyError.
            expected = reference_chunk_sizes.get(variable)
            if expected != chunks:
                raise ValueError(
                    f"Chunk size mismatch in file '{file}' for variable '{variable}'. Expected {expected} but got {chunks}"
                )
            logger.debug(f'Variable : {variable}, Chunks : {chunks}')

    print("All files have consistent chunk sizes!")
import json
import logging
def get_chunk_sizes_from_json(file_path, variable):
    """Read the chunk shape of one variable from a JSON reference file.

    The file is expected to hold a top-level ``"refs"`` mapping whose
    ``"<variable>/.zarray"`` entry is a JSON-encoded string containing a
    ``"chunks"`` field.

    Parameters
    ----------
    file_path:
        Path of the JSON reference file to read.
    variable:
        Name of the variable whose ``.zarray`` entry is looked up.

    Returns
    -------
    dict
        ``{variable: chunks}`` on success. An empty dict if the file cannot
        be read or parsed, has no ``"refs"`` mapping, has no ``.zarray``
        entry for `variable`, or that entry has no ``"chunks"`` field. The
        reason is logged in each case.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        json_string = data['refs'].get(f'{variable}/.zarray')
        if not json_string:
            logger.warning(f"'{variable}/.zarray' not found in file {file_path}. Skipping...")
            return {}
        chunks = json.loads(json_string).get("chunks")
    except (OSError, ValueError, KeyError, TypeError, AttributeError) as e:
        # Covers unreadable files, invalid JSON (JSONDecodeError is a
        # ValueError) and unexpected document shapes; anything else is a
        # programming error and should surface.
        logger.error(f"Error processing file {file_path}: {e}")
        return {}

    if chunks is None:
        # Returning {variable: None} would be truthy and would be mistaken
        # for valid chunk sizes by callers.
        logger.warning(f"No 'chunks' entry in '{variable}/.zarray' of file {file_path}. Skipping...")
        return {}

    chunk_sizes = {variable: chunks}
    logger.info(f'File : {file_path}, Variable: {variable}, Chunk sizes: {chunk_sizes}')
    return chunk_sizes
def compare_chunk_sizes_json(file, variable, initial_chunk_sizes):
    """Compare the chunk sizes of `variable` in `file` against a reference.

    Parameters
    ----------
    file:
        Path of the JSON reference file to check.
    variable:
        Name of the variable whose chunk sizes are compared.
    initial_chunk_sizes:
        Reference mapping of variable name to chunk sizes.

    Returns
    -------
    bool
        ``False`` if any chunk size read from `file` differs from the
        reference (each mismatch is logged as an error), ``True`` otherwise.
        A file from which no chunk sizes could be read yields no entries to
        compare and therefore also returns ``True``.
    """
    logger.info(f'Comparing file {file}')
    current_chunk_sizes = get_chunk_sizes_from_json(file, variable)

    mismatches = {
        name: size
        for name, size in current_chunk_sizes.items()
        if initial_chunk_sizes.get(name) != size
    }
    for name, size in mismatches.items():
        # .get() mirrors the lookup used to detect the mismatch, so a
        # variable missing from the reference is reported, not a KeyError.
        expected_size = initial_chunk_sizes.get(name)
        logger.error(f"Chunk size mismatch in file {file} for variable {name}. Expected {expected_size} but got {size}")

    return not mismatches
@app.command(
    'check-chunks-json',
    no_args_is_help=True,
    help='Check for chunk size consistency along series of kerchunk reference files',
)
def validate_chunk_sizes(
    source_directory: Annotated[Path, typer_argument_source_directory],
    variable: Annotated[str, typer.Argument(..., help='Variable name to select from')],
    pattern: Annotated[str, typer_option_filename_pattern] = "*.json",
):
    """Check that `variable` has the same chunk sizes in all reference files.

    The files matching `pattern` inside `source_directory` are sorted by
    path; the first one provides the reference chunk sizes and every other
    file is compared against it. All files are compared, even after a
    mismatch, so that each inconsistency ends up in the log.

    Parameters
    ----------
    source_directory:
        Directory that is searched for JSON reference files.
    variable:
        Name of the variable whose chunk sizes are checked.
    pattern:
        Glob pattern used to select files inside `source_directory`.
    """
    source_directory = Path(source_directory)
    # Sort so that the reference (first) file is deterministic; the order in
    # which glob() yields paths is not guaranteed.
    files = sorted(str(path) for path in source_directory.glob(pattern))
    if not files:
        # Without this guard, indexing the first file raises IndexError.
        logger.error(f"No files matching '{pattern}' found in {source_directory}. Exiting...")
        return

    # Use as a comparison reference the chunk sizes from the first file
    reference_file, *remaining_files = files
    initial_chunk_sizes = get_chunk_sizes_from_json(reference_file, variable)
    if not initial_chunk_sizes:
        logger.error(f"Cannot read chunk sizes from initial file {reference_file}. Exiting...")
        return

    # Build the full list first: all() on a generator would stop at the
    # first mismatch and skip logging the remaining files.
    results = [
        compare_chunk_sizes_json(file, variable, initial_chunk_sizes)
        for file in remaining_files
    ]

    if all(results):
        logger.info("All files have consistent chunk sizes!")
        print("All files have consistent chunk sizes!")
    else:
        logger.warning("Some files have inconsistent chunk sizes. Check the logs for details.")
        print("Some files have inconsistent chunk sizes. Check the logs for details.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment