@TomNicholas
Last active April 9, 2025 19:30
Benchmark reading netCDF4 file via HTTP GET requests from S3
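The core mechanism in the first script below is the HTTP Range header, which asks the server for a byte range instead of the whole object. A minimal standalone sketch of a single ranged GET (the URL here is only a placeholder; any S3 object accepts the Range header) looks like this:

import requests

# Placeholder URL for illustration only.
url = "https://example-bucket.s3.amazonaws.com/example.nc"

# Request only the first kilobyte of the object.
response = requests.get(url, headers={"Range": "bytes=0-1023"})
print(response.status_code)   # 206 Partial Content when the range is honoured
print(len(response.content))  # at most 1024 bytes
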
# /// script
# requires-python = ">=3.13"
# dependencies = [
# "requests",
# "h5py",
# "xarray",
# "h5netcdf",
# ]
# ///
import time
import h5py
import requests
import xarray as xr

class HttpRangeReader:
    """A file-like object that uses HTTP range requests for reads and logs activity"""

    def __init__(self, url):
        self.url = url
        self.position = 0
        self.request_count = 0
        self.bytes_transferred = 0
        self.read_operations = []
        self.start_time = time.time()

        # Get file size with HEAD request
        response = requests.head(url)
        self.size = int(response.headers.get('Content-Length', 0))
        print(f"File size: {self.size/1024/1024:.2f} MB")

    def seek(self, offset, whence=0):
        if whence == 0:
            self.position = offset
        elif whence == 1:
            self.position += offset
        elif whence == 2:
            self.position = self.size + offset
        return self.position

    def tell(self):
        return self.position

    def read(self, size=-1):
        start_position = self.position
        if size == -1:
            # Read to end of file
            end_position = self.size - 1
            size = self.size - start_position
        else:
            end_position = start_position + size - 1
            if end_position >= self.size:
                end_position = self.size - 1
                size = end_position - start_position + 1
        if size <= 0:
            return b''

        # Log this read operation
        self.read_operations.append({
            'start': start_position,
            'end': end_position,
            'size': size,
            'time': time.time() - self.start_time
        })

        # Execute the HTTP range request
        headers = {'Range': f'bytes={start_position}-{end_position}'}
        response = requests.get(self.url, headers=headers)
        self.request_count += 1
        self.bytes_transferred += len(response.content)

        # Update position
        self.position += size
        return response.content

    def readline(self, size=-1):
        # Simple implementation that reads one byte at a time and, like a real
        # file object, keeps the trailing newline
        # (inefficient, but works for demonstration)
        line = b''
        while size < 0 or len(line) < size:
            c = self.read(1)
            if not c:
                break
            line += c
            if c == b'\n':
                break
        return line
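
# Optional smoke test (not part of the original benchmark): checks that a single
# HttpRangeReader.read() turns into exactly one ranged GET. Assumes the server
# honours range requests; pass any URL, e.g. url_gfs_1gb defined below.
def smoke_test_reader(url):
    reader = HttpRangeReader(url)
    first_kb = reader.read(1024)          # issues one GET with 'Range: bytes=0-1023'
    assert len(first_kb) == 1024
    print(f"Requests made so far: {reader.request_count}")  # expected: 1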

def run_netcdf_benchmark(url):
    print(f"Starting benchmark on {url}")
    start_time = time.time()

    # Create HTTP range reader
    reader = HttpRangeReader(url)

    # Open NetCDF file using our custom reader
    with h5py.File(reader, mode='r', driver='fileobj') as f:
        # Just opening the dataset causes the library to read headers and metadata
        ds = xr.open_dataset(f, engine='h5netcdf', decode_times=False)
        print(ds)

    end_time = time.time()
    total_time = end_time - start_time

    # Analyze results
    results = {
        'total_time': total_time,
        'request_count': reader.request_count,
        'bytes_transferred': reader.bytes_transferred,
        'effective_bandwidth_kbps': (reader.bytes_transferred / 1024) / total_time,
        'reads': reader.read_operations,
    }
    print("Benchmark results:")
    print(f"Total time: {results['total_time']:.2f} seconds")
    print(f"Requests made: {results['request_count']}")
    print(f"Data transferred: {results['bytes_transferred'] / 1024:.2f} KB")
    print(f"Effective bandwidth: {results['effective_bandwidth_kbps']:.2f} KB/s")

    # Print read pattern details
    print("\nRead pattern analysis:")
    sizes = [op['size'] for op in results['reads']]
    if sizes:
        print(f"Min read size: {min(sizes)} bytes")
        print(f"Max read size: {max(sizes)} bytes")
        print(f"Avg read size: {sum(sizes)/len(sizes):.2f} bytes")

    return results
# URL to public S3 NetCDF file
url_gfs_1gb = 'https://noaa-oar-arl-nacc-pds.s3.amazonaws.com/inputs/20210323/gfs.t12z.sfcf003.nc'
results = run_netcdf_benchmark(url_gfs_1gb)
# Benchmark results:
# Total time: 39.54 seconds
# Requests made: 502
# Data transferred: 267.19 KB
# Effective bandwidth: 6.76 KB/s
# Read pattern analysis:
# Min read size: 8 bytes
# Max read size: 24576 bytes
# Avg read size: 545.02 bytes
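
# Optional follow-up (not in the original script): tally how many reads were
# issued at each size, using the read_operations log returned above.
from collections import Counter

size_counts = Counter(op['size'] for op in results['reads'])
for read_size, count in sorted(size_counts.items()):
    print(f"{count:4d} reads of {read_size} bytes")
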
# /// script
# requires-python = ">=3.13"
# dependencies = [
# "zarr",
# "xarray",
# "icechunk",
# ]
# ///
import time
import xarray as xr
import icechunk as ic
start_time = time.time()
storage = ic.s3_storage(
    bucket="earthmover-sample-data",
    prefix="icechunk/gfs/inputs/20210323/gfs.t12z.sfcf003",
    region='us-east-1',
    from_env=True,
)
repo = ic.Repository.open(storage)
session = repo.readonly_session("main")
ds = xr.open_zarr(session.store, consolidated=False, decode_times=False)
print(ds)
end_time = time.time()
total_time = end_time - start_time
print(f"Total time: {total_time:.2f} seconds")
# /// script
# requires-python = ">=3.13"
# dependencies = [
# "zarr",
# "xarray",
# "s3fs",
# "rich",
# ]
# ///
import time
import logging
import re
import xarray as xr
import zarr
import s3fs
consolidated = True
logging.basicConfig(filename=f'zarr_requests_consolidated_{consolidated}.log', level=logging.INFO)
s3_path = 'earthmover-sample-data/zarr/v3/gfs/inputs/20210323/gfs.t12z.sfcf003.zarr'
start_time = time.time()
fs = s3fs.S3FileSystem(anon=False)
fsspec_store = zarr.storage.FsspecStore(fs, path=s3_path)
logging_store = zarr.storage.LoggingStore(fsspec_store)
ds = xr.open_zarr(logging_store, consolidated=consolidated, decode_times=False)
print(ds)
end_time = time.time()
total_time = end_time - start_time
print(f"Total time: {total_time:.2f} seconds")
# Then parse the log file
get_count = 0
with open(f'zarr_requests_consolidated_{consolidated}.log', 'r') as f:
    for line in f:
        if re.search(r'Calling FsspecStore.get', line):
            get_count += 1
print(f"Total GET requests: {get_count}")
# /// script
# requires-python = ">=3.13"
# dependencies = [
# "pandas",
# "matplotlib",
# ]
# ///
import textwrap
import pandas as pd
import matplotlib.pyplot as plt
EARTHMOVER_VIOLET = '#A653FF'
EARTHMOVER_LIME = '#B7E400'
# Increase font sizes globally
plt.rcParams.update({'font.size': 14}) # Sets the base font size
def wrap_labels(labels, width=10):
    return [textwrap.fill(label, width) for label in labels]
df = pd.read_csv('results.csv')
wrapped_labels = wrap_labels(df['Format'], width=15)
# Create figure and primary axis
fig, ax1 = plt.subplots(figsize=(8, 6))
# Set width of bars
bar_width = 0.35
x = range(len(df['Format']))
# Create bars for HTTP requests on the primary axis
bars1 = ax1.bar([i - bar_width/2 for i in x], df['HTTP requests'], width=bar_width,
                color=EARTHMOVER_VIOLET, label='HTTP requests')
ax1.set_xlabel('Format', fontsize=16)
ax1.set_ylabel('HTTP requests', fontsize=16)
ax1.tick_params(axis='y')
# Create secondary axis for time
ax2 = ax1.twinx()
bars2 = ax2.bar([i + bar_width/2 for i in x], df['time (s)'], width=bar_width,
                color=EARTHMOVER_LIME, label='time (s)')
ax2.set_ylabel('Time (s)', fontsize=16)
ax2.tick_params(axis='y')
# Set x-axis labels
plt.xticks(x, wrapped_labels, rotation=0, ha='center')
# Add legend
ax1.legend([bars1, bars2], ['HTTP requests', 'time (s)'],
           loc='upper right', ncol=2)
# Add title
plt.title('Comparing opening different data formats on S3')
plt.tight_layout()
# Show the plot
plt.show()
Format,HTTP requests,time (s)
NetCDF (using h5netcdf as if local),502,43.28
Zarr,160,1.10
Zarr (consolidated metadata),7,0.5
Icechunk,16,0.69