@TomNicholas
Last active April 9, 2025 19:30
Benchmark reading netCDF4 file via HTTP GET requests from S3
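The core mechanism in the first script below is the HTTP Range header, which asks the server for a byte range instead of the whole object. A minimal standalone sketch of a single ranged GET (the URL here is only a placeholder; any S3 object accepts the Range header) looks like this:

import requests

# Placeholder URL for illustration only.
url = "https://example-bucket.s3.amazonaws.com/example.nc"

# Request only the first kilobyte of the object.
response = requests.get(url, headers={"Range": "bytes=0-1023"})
print(response.status_code)   # 206 Partial Content when the range is honoured
print(len(response.content))  # at most 1024 bytes
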
# /// script
# requires-python = ">=3.13"
# dependencies = [
# "requests",
# "h5py",
# "xarray",
# "h5netcdf",
# ]
# ///
import time
import h5py
import requests
import xarray as xr

class HttpRangeReader:
    """A file-like object that uses HTTP range requests for reads and logs activity"""

    def __init__(self, url):
        self.url = url
        self.position = 0
        self.request_count = 0
        self.bytes_transferred = 0
        self.read_operations = []
        self.start_time = time.time()

        # Get file size with HEAD request
        response = requests.head(url)
        self.size = int(response.headers.get('Content-Length', 0))
        print(f"File size: {self.size/1024/1024:.2f} MB")

    def seek(self, offset, whence=0):
        if whence == 0:
            self.position = offset
        elif whence == 1:
            self.position += offset
        elif whence == 2:
            self.position = self.size + offset
        return self.position

    def tell(self):
        return self.position

    def read(self, size=-1):
        start_position = self.position
        if size == -1:
            # Read to end of file
            end_position = self.size - 1
            size = self.size - start_position
        else:
            end_position = start_position + size - 1
            if end_position >= self.size:
                end_position = self.size - 1
                size = end_position - start_position + 1
        if size <= 0:
            return b''

        # Log this read operation
        self.read_operations.append({
            'start': start_position,
            'end': end_position,
            'size': size,
            'time': time.time() - self.start_time
        })

        # Execute the HTTP range request
        headers = {'Range': f'bytes={start_position}-{end_position}'}
        response = requests.get(self.url, headers=headers)
        self.request_count += 1
        self.bytes_transferred += len(response.content)

        # Update position
        self.position += size
        return response.content

    def readline(self, size=-1):
        # Simple implementation that reads one byte at a time and, like a real
        # file object, keeps the trailing newline
        # (inefficient, but works for demonstration)
        line = b''
        while size < 0 or len(line) < size:
            c = self.read(1)
            if not c:
                break
            line += c
            if c == b'\n':
                break
        return line
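
# Optional smoke test (not part of the original benchmark): checks that a single
# HttpRangeReader.read() turns into exactly one ranged GET. Assumes the server
# honours range requests; pass any URL, e.g. url_gfs_1gb defined below.
def smoke_test_reader(url):
    reader = HttpRangeReader(url)
    first_kb = reader.read(1024)          # issues one GET with 'Range: bytes=0-1023'
    assert len(first_kb) == 1024
    print(f"Requests made so far: {reader.request_count}")  # expected: 1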

def run_netcdf_benchmark(url):
    print(f"Starting benchmark on {url}")
    start_time = time.time()

    # Create HTTP range reader
    reader = HttpRangeReader(url)

    # Open NetCDF file using our custom reader
    with h5py.File(reader, mode='r', driver='fileobj') as f:
        # Just opening the dataset causes the library to read headers and metadata
        ds = xr.open_dataset(f, engine='h5netcdf', decode_times=False)
        print(ds)

    end_time = time.time()
    total_time = end_time - start_time

    # Analyze results
    results = {
        'total_time': total_time,
        'request_count': reader.request_count,
        'bytes_transferred': reader.bytes_transferred,
        'effective_bandwidth_kbps': (reader.bytes_transferred / 1024) / total_time,
        'reads': reader.read_operations,
    }
    print("Benchmark results:")
    print(f"Total time: {results['total_time']:.2f} seconds")
    print(f"Requests made: {results['request_count']}")
    print(f"Data transferred: {results['bytes_transferred'] / 1024:.2f} KB")
    print(f"Effective bandwidth: {results['effective_bandwidth_kbps']:.2f} KB/s")

    # Print read pattern details
    print("\nRead pattern analysis:")
    sizes = [op['size'] for op in results['reads']]
    if sizes:
        print(f"Min read size: {min(sizes)} bytes")
        print(f"Max read size: {max(sizes)} bytes")
        print(f"Avg read size: {sum(sizes)/len(sizes):.2f} bytes")

    return results
# URL to public S3 NetCDF file
url_gfs_1gb = 'https://noaa-oar-arl-nacc-pds.s3.amazonaws.com/inputs/20210323/gfs.t12z.sfcf003.nc'
results = run_netcdf_benchmark(url_gfs_1gb)
# Benchmark results:
# Total time: 39.54 seconds
# Requests made: 502
# Data transferred: 267.19 KB
# Effective bandwidth: 6.76 KB/s
# Read pattern analysis:
# Min read size: 8 bytes
# Max read size: 24576 bytes
# Avg read size: 545.02 bytes
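
# Optional follow-up (not in the original script): tally how many reads were
# issued at each size, using the read_operations log returned above.
from collections import Counter

size_counts = Counter(op['size'] for op in results['reads'])
for read_size, count in sorted(size_counts.items()):
    print(f"{count:4d} reads of {read_size} bytes")
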
# /// script
# requires-python = ">=3.13"
# dependencies = [
# "zarr",
# "xarray",
# "icechunk",
# ]
# ///
import time
import xarray as xr
import icechunk as ic
start_time = time.time()
storage = ic.s3_storage(
    bucket="earthmover-sample-data",
    prefix="icechunk/gfs/inputs/20210323/gfs.t12z.sfcf003",
    region='us-east-1',
    from_env=True,
)
repo = ic.Repository.open(storage)
session = repo.readonly_session("main")
ds = xr.open_zarr(session.store, consolidated=False, decode_times=False)
print(ds)
end_time = time.time()
total_time = end_time - start_time
print(f"Total time: {total_time:.2f} seconds")
# /// script
# requires-python = ">=3.13"
# dependencies = [
# "zarr",
# "xarray",
# "s3fs",
# "rich",
# ]
# ///
import time
import logging
import re
import xarray as xr
import zarr
import s3fs
consolidated = True
logging.basicConfig(filename=f'zarr_requests_consolidated_{consolidated}.log', level=logging.INFO)
s3_path = 'earthmover-sample-data/zarr/v3/gfs/inputs/20210323/gfs.t12z.sfcf003.zarr'
start_time = time.time()
fs = s3fs.S3FileSystem(anon=False)
fsspec_store = zarr.storage.FsspecStore(fs, path=s3_path)
logging_store = zarr.storage.LoggingStore(fsspec_store)
ds = xr.open_zarr(logging_store, consolidated=consolidated, decode_times=False)
print(ds)
end_time = time.time()
total_time = end_time - start_time
print(f"Total time: {total_time:.2f} seconds")
# Then parse the log file
get_count = 0
with open(f'zarr_requests_consolidated_{consolidated}.log', 'r') as f:
    for line in f:
        if re.search(r'Calling FsspecStore.get', line):
            get_count += 1
print(f"Total GET requests: {get_count}")
# /// script
# requires-python = ">=3.13"
# dependencies = [
# "pandas",
# "matplotlib",
# ]
# ///
import textwrap
import pandas as pd
import matplotlib.pyplot as plt
EARTHMOVER_VIOLET = '#A653FF'
EARTHMOVER_LIME = '#B7E400'
# Increase font sizes globally
plt.rcParams.update({'font.size': 14}) # Sets the base font size
def wrap_labels(labels, width=10):
    return [textwrap.fill(label, width) for label in labels]
df = pd.read_csv('results.csv')
wrapped_labels = wrap_labels(df['Format'], width=15)
# Create figure and primary axis
fig, ax1 = plt.subplots(figsize=(8, 6))
# Set width of bars
bar_width = 0.35
x = range(len(df['Format']))
# Create bars for HTTP requests on the primary axis
bars1 = ax1.bar([i - bar_width/2 for i in x], df['HTTP requests'], width=bar_width,
                color=EARTHMOVER_VIOLET, label='HTTP requests')
ax1.set_xlabel('Format', fontsize=16)
ax1.set_ylabel('HTTP requests', fontsize=16)
ax1.tick_params(axis='y')
# Create secondary axis for time
ax2 = ax1.twinx()
bars2 = ax2.bar([i + bar_width/2 for i in x], df['time (s)'], width=bar_width,
                color=EARTHMOVER_LIME, label='time (s)')
ax2.set_ylabel('Time (s)', fontsize=16)
ax2.tick_params(axis='y')
# Set x-axis labels
plt.xticks(x, wrapped_labels, rotation=0, ha='center')
# Add legend
ax1.legend([bars1, bars2], ['HTTP requests', 'time (s)'],
           loc='upper right', ncol=2)
# Add title
plt.title('Comparing opening different data formats on S3')
plt.tight_layout()
# Show the plot
plt.show()
Format,HTTP requests,time (s)
NetCDF (using h5netcdf as if local),502,43.28
Zarr,160,1.10
Zarr (consolidated metadata),7,0.5
Icechunk,16,0.69