@lbesnard
Created June 18, 2024 07:45
mfdataset issue
import s3fs
import xarray as xr
s3 = s3fs.S3FileSystem(anon=True)
# s3.glob() expands the pattern below into a list of object keys (filenames)
s3path = 's3://imos-data/IMOS/SRS/SST/ghrsst/L3S-1d/dn/2023/*'
remote_files = s3.glob(s3path)
# create a fileset of open file-like objects that xarray can read from
fileset = [s3.open(file) for file in remote_files]
# locate batch of files with issue
subset = fileset[170:180]
# The issue is with file 0 when it is combined with another file
subset[0:2]
# opening those 2 files together consumes a lot more memory than other files
ds = xr.open_mfdataset(
    subset[0:2],
    engine='h5netcdf',
    concat_characters=True,
    mask_and_scale=True,
    decode_cf=True,
    decode_times=True,
    use_cftime=True,
    parallel=True,
    decode_coords=True,
)
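
# Hedged suggestion (not in the original gist): passing an explicit `chunks`
# mapping to open_mfdataset keeps the arrays in smaller dask blocks, which can
# reduce peak memory when the two files are combined. The dimension names and
# chunk sizes below are assumptions about this L3S product and may need
# adjusting to the actual variable dimensions.
ds_chunked = xr.open_mfdataset(
    subset[0:2],
    engine='h5netcdf',
    parallel=True,
    chunks={'time': 1, 'lat': 1000, 'lon': 1000},
)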
output_file = '/tmp/dataset.nc'
# Save the dataset to a NetCDF file; this consumes all available memory and the process dies
ds.to_netcdf(output_file, mode='w')
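
# Hedged alternative (not in the original gist): to_netcdf accepts
# compute=False and returns a dask delayed object; computing it under the
# single-threaded scheduler trades speed for a smaller memory footprint.
# The output path here is illustrative only.
import dask

delayed_write = ds.to_netcdf('/tmp/dataset_delayed.nc', mode='w', compute=False)
with dask.config.set(scheduler='synchronous'):
    delayed_write.compute()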
# On the Coiled cluster, because of this file, my dask graph turns all red and the job fails.
# I can't find what is wrong with this file, or how to handle it / check the files before passing them to open_mfdataset.
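
# Hedged sketch of a pre-check (not in the original gist): open each file on
# its own and record its dimensions, data variables and uncompressed size, so
# that an outlier (e.g. unexpected dimensions or a much larger in-memory
# footprint) can be spotted before the file goes into open_mfdataset.
for i, f in enumerate(subset):
    try:
        with xr.open_dataset(f, engine='h5netcdf') as single:
            print(i, dict(single.sizes), sorted(single.data_vars),
                  f"{single.nbytes / 1e9:.2f} GB uncompressed")
    except Exception as exc:
        print(i, 'failed to open:', exc)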