Last active
April 9, 2025 19:30
-
-
Save TomNicholas/1973624099b207a0550f66ed20cabc37 to your computer and use it in GitHub Desktop.
Benchmark reading netCDF4 file via HTTP GET requests from S3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script | |
# requires-python = ">=3.13" | |
# dependencies = [ | |
# "requests", | |
# "h5py", | |
# "xarray", | |
# "h5netcdf", | |
# ] | |
# /// | |
import time | |
import h5py | |
import requests | |
import xarray as xr | |
class HttpRangeReader:
    """A file-like object that serves reads via HTTP range requests and logs activity.

    Implements the minimal file protocol (seek/tell/read/readline) needed by
    h5py's 'fileobj' driver, so a remote file can be opened without
    downloading it in full. Every read is issued as a separate HTTP GET with
    a Range header and recorded in ``read_operations`` for later analysis.
    """

    def __init__(self, url):
        self.url = url
        self.position = 0
        self.request_count = 0
        self.bytes_transferred = 0
        self.read_operations = []
        self.start_time = time.time()
        # Get file size up front with a HEAD request; the range arithmetic
        # in read()/seek() depends on knowing it.
        response = requests.head(url)
        self.size = int(response.headers.get('Content-Length', 0))
        print(f"File size: {self.size/1024/1024:.2f} MB")

    def seek(self, offset, whence=0):
        """Move the read cursor; mirrors io.IOBase.seek semantics."""
        if whence == 0:      # absolute position
            self.position = offset
        elif whence == 1:    # relative to current position
            self.position += offset
        elif whence == 2:    # relative to end of file
            self.position = self.size + offset
        else:
            # Previously an invalid whence was silently ignored; fail loudly
            # like io.IOBase does.
            raise ValueError(f"invalid whence: {whence}")
        return self.position

    def tell(self):
        """Return the current cursor position."""
        return self.position

    def read(self, size=-1):
        """Read up to *size* bytes from the current position via one GET.

        size=-1 reads to end of file. Reads that run past EOF are clamped;
        reads at/after EOF return b''.
        """
        start_position = self.position
        if size == -1:
            # Read to end of file
            end_position = self.size - 1
            size = self.size - start_position
        else:
            end_position = start_position + size - 1
            if end_position >= self.size:
                # Clamp reads that extend past EOF
                end_position = self.size - 1
                size = end_position - start_position + 1
        if size <= 0:
            return b''
        # Log this read operation for the benchmark report
        self.read_operations.append({
            'start': start_position,
            'end': end_position,
            'size': size,
            'time': time.time() - self.start_time
        })
        # Execute the HTTP range request
        headers = {'Range': f'bytes={start_position}-{end_position}'}
        response = requests.get(self.url, headers=headers)
        self.request_count += 1
        data = response.content
        self.bytes_transferred += len(data)
        # BUGFIX: advance by what was actually received, not the requested
        # size — a server that ignores or truncates the Range request would
        # otherwise desynchronize the cursor from the returned bytes.
        self.position += len(data)
        return data

    def readline(self, size=-1):
        # Simple implementation that reads one byte at a time — i.e. one
        # HTTP request per byte (inefficient, but works for demonstration)
        line = b''
        while size < 0 or len(line) < size:
            c = self.read(1)
            if not c or c == b'\n':
                break
            line += c
        return line
def run_netcdf_benchmark(url):
    """Open a remote NetCDF file over HTTP range requests and report stats.

    Only the dataset metadata is read (no variable data is loaded). Prints
    and returns timing, request-count, and read-pattern statistics gathered
    by the instrumented HttpRangeReader.
    """
    print(f"Starting benchmark on {url}")
    t0 = time.time()

    # All byte-level reads go through this instrumented reader
    reader = HttpRangeReader(url)

    # Opening the file forces the libraries to fetch headers and metadata
    with h5py.File(reader, mode='r', driver='fileobj') as f:
        ds = xr.open_dataset(f, engine='h5netcdf', decode_times=False)
        print(ds)

    elapsed = time.time() - t0

    # Collect everything the reader recorded into one result dict
    results = {
        'total_time': elapsed,
        'request_count': reader.request_count,
        'bytes_transferred': reader.bytes_transferred,
        'effective_bandwidth_kbps': (reader.bytes_transferred / 1024) / elapsed,
        'reads': reader.read_operations,
    }

    print(f"Benchmark results:")
    print(f"Total time: {results['total_time']:.2f} seconds")
    print(f"Requests made: {results['request_count']}")
    print(f"Data transferred: {results['bytes_transferred'] / 1024:.2f} KB")
    print(f"Effective bandwidth: {results['effective_bandwidth_kbps']:.2f} KB/s")

    # Summarize the individual read sizes to show the access pattern
    print("\nRead pattern analysis:")
    sizes = [op['size'] for op in results['reads']]
    if sizes:
        print(f"Min read size: {min(sizes)} bytes")
        print(f"Max read size: {max(sizes)} bytes")
        print(f"Avg read size: {sum(sizes)/len(sizes):.2f} bytes")

    return results
# URL to public S3 NetCDF file
url_gfs_1gb = 'https://noaa-oar-arl-nacc-pds.s3.amazonaws.com/inputs/20210323/gfs.t12z.sfcf003.nc'

# Running this module executes the benchmark immediately (no main guard).
results = run_netcdf_benchmark(url_gfs_1gb)

# Sample output from one run:
# Benchmark results:
# Total time: 39.54 seconds
# Requests made: 502
# Data transferred: 267.19 KB
# Effective bandwidth: 6.76 KB/s
# Read pattern analysis:
# Min read size: 8 bytes
# Max read size: 24576 bytes
# Avg read size: 545.02 bytes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script | |
# requires-python = ">=3.13" | |
# dependencies = [ | |
# "zarr", | |
# "xarray", | |
# "icechunk", | |
# ] | |
# /// | |
import time | |
import xarray as xr | |
import icechunk as ic | |
# Time how long it takes to open (metadata only) the same dataset stored
# as an Icechunk repository on S3.
start_time = time.time()

# Storage config pointing at a public sample bucket; from_env=True takes
# credentials/configuration from the environment.
storage = ic.s3_storage(
    bucket="earthmover-sample-data",
    prefix="icechunk/gfs/inputs/20210323/gfs.t12z.sfcf003",
    region='us-east-1',
    from_env=True,
)

repo = ic.Repository.open(storage)
# Read-only session pinned to the "main" branch
session = repo.readonly_session("main")

# consolidated=False: Zarr consolidated metadata is not used here
ds = xr.open_zarr(session.store, consolidated=False, decode_times=False)
print(ds)

end_time = time.time()
total_time = end_time - start_time
print(f"Total time: {total_time:.2f} seconds")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script | |
# requires-python = ">=3.13" | |
# dependencies = [ | |
# "zarr", | |
# "xarray", | |
# "s3fs", | |
# "rich", | |
# ] | |
# /// | |
import time | |
import logging | |
import re | |
import xarray as xr | |
import zarr | |
import s3fs | |
# Toggle to compare consolidated vs. non-consolidated Zarr metadata reads
consolidated = True

# Route zarr's store-level logging to a file so GET calls can be counted later
logging.basicConfig(filename=f'zarr_requests_consolidated_{consolidated}.log', level=logging.INFO)

s3_path = 'earthmover-sample-data/zarr/v3/gfs/inputs/20210323/gfs.t12z.sfcf003.zarr'

start_time = time.time()

fs = s3fs.S3FileSystem(anon=False)
fsspec_store = zarr.storage.FsspecStore(fs, path=s3_path)
# LoggingStore wraps the real store and logs every store operation it forwards
logging_store = zarr.storage.LoggingStore(fsspec_store)

ds = xr.open_zarr(logging_store, consolidated=consolidated, decode_times=False)
print(ds)

end_time = time.time()
total_time = end_time - start_time
print(f"Total time: {total_time:.2f} seconds")
# Then parse the log file to count store GET operations.
# BUGFIX: the original used re.search(r'Calling FsspecStore.get', ...) with
# an unescaped '.', which would also match e.g. 'FsspecStoreXget'; an exact
# substring test is both correct and simpler.
log_path = f'zarr_requests_consolidated_{consolidated}.log'
with open(log_path, 'r') as f:
    get_count = sum(1 for line in f if 'Calling FsspecStore.get' in line)
print(f"Total GET requests: {get_count}")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script | |
# requires-python = ">=3.13" | |
# dependencies = [ | |
# "pandas", | |
# "matplotlib", | |
# ] | |
# /// | |
import textwrap | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
# Brand colors used for the two bar series in the chart below
EARTHMOVER_VIOLET = '#A653FF'
EARTHMOVER_LIME = '#B7E400'

# Increase font sizes globally
plt.rcParams.update({'font.size': 14})  # Sets the base font size
def wrap_labels(labels, width=10):
    """Wrap each label onto multiple lines of at most *width* characters."""
    wrapped = []
    for label in labels:
        wrapped.append(textwrap.fill(label, width))
    return wrapped
# Expects columns: Format, HTTP requests, time (s)
df = pd.read_csv('results.csv')

wrapped_labels = wrap_labels(df['Format'], width=15)

# Create figure and primary axis
fig, ax1 = plt.subplots(figsize=(8, 6))

# Set width of bars
bar_width = 0.35
x = range(len(df['Format']))

# Create bars for HTTP requests on the primary axis, shifted left so the
# two series sit side by side per format
bars1 = ax1.bar([i - bar_width/2 for i in x], df['HTTP requests'], width=bar_width,
                color=EARTHMOVER_VIOLET, label='HTTP requests')
ax1.set_xlabel('Format', fontsize=16)
ax1.set_ylabel('HTTP requests', fontsize=16)
ax1.tick_params(axis='y')

# Create secondary axis for time (separate scale from request counts)
ax2 = ax1.twinx()
bars2 = ax2.bar([i + bar_width/2 for i in x], df['time (s)'], width=bar_width,
                color=EARTHMOVER_LIME, label='time (s)')
ax2.set_ylabel('Time (s)', fontsize=16)
ax2.tick_params(axis='y')

# Set x-axis labels (wrapped so long format names fit under the bars)
plt.xticks(x, wrapped_labels, rotation=0, ha='center')

# Single legend covering both axes' bar series
ax1.legend([bars1, bars2], ['HTTP requests', 'time (s)'],
           loc='upper right', ncol=2)

# Add title
plt.title('Comparing opening different data formats on S3')
plt.tight_layout()

# Show the plot (blocks until the window is closed in interactive backends)
plt.show()
We can make this file beautiful and searchable if this error is corrected: It looks like row 2 should actually have 4 columns, instead of 3 in line 1.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Format,HTTP requests,time (s)
NetCDF (using h5netcdf as if local),502,43.28 | |
Zarr,160,1.10 | |
Zarr (consolidated metadata),7,0.5 | |
Icechunk,16,0.69 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment