Skip to content

Instantly share code, notes, and snippets.

@barronh
Last active December 31, 2024 21:33
Show Gist options
  • Save barronh/7acedda3ab913ca9f3efbc57400ed34e to your computer and use it in GitHub Desktop.
Save barronh/7acedda3ab913ca9f3efbc57400ed34e to your computer and use it in GitHub Desktop.
Climate Forecasting CSV
__version__ = '0.1.0'
__doc__ = """
Contents
========
df2ds : function
convert pandas.DataFrame with properties to xarray.Dataset
ds2df : function
convert xarray.Dataset to pandas.DataFrame with properties
read_csv : function
read CSV with metadata as attrs and optionally convert to Dataset
read_nc : function
read NetCDF and optionally convert to a Dataframe with properties
write_output : function
write Dataset or Dataframe to file as CSV or NetCDF
exmample_round_trip : function
Returns text generated by converting example_csv to netCDF and back
test_round_trip : function
Test all functionality (df2ds, ds2df, read_csv, read_nc, write_output)
example_csv : str
Text with an example CSV with properties.
Example
=======
The example assumes the existance of an input file example.csv. From that, it
reads the data in, converts it to a netCDF file, saves it out, reads in the
netCDF file and saves it as a CSV. The CSV content matches the original file.
open('example.csv', 'w').write(example_csv)
d1 = read_csv('example.csv', as_dataset=False)
write_output(d1, 'example.nc')
d2 = read_nc('example.nc', as_dataset=False)
write_output(d2, 'new.csv')
d3 = read_csv('new.csv', as_dataset=False)
print(open('example.csv', 'r').read() == open('new.csv', 'r').read())
# Output: True
This basic round-trip is also coded up in test_round_trip.
"""
# Example CSV content with a commented ('# '-prefixed) YAML metadata header.
# This exact text is what test_round_trip compares the round-trip output
# against, so its bytes must match what write_output produces.
# NOTE(review): the YAML under 'variables:' appears to have lost its nested
# indentation in transcription (e.g. 'time:' and 'long_name:' sit at the same
# level), which would not parse as a nested mapping -- confirm against the
# original file before relying on this literal.
example_csv = (
"""# title: Example YAML metadata
# institution: US EPA
# source: example v0.0
# history: '2024-12-31T14:01:00-0500: example metadata updated; """
+ """2024-12-31T12:33:00-0500:
# example created file;'
# references: 'Climate and Forecast Conventions version 1.12 DOI: 10.5281/"""
+ """zenodo.14275599'
# comment: This text could be included as a comments as the first lines of"""
+ """ a CSV that
# contains two variables called fine_particulate_matter and ozone as a """
+ """function of
# time, latitude, and longitude
# dimensions:
# - time
# - lat
# - lon
# variables:
# time:
# long_name: time
# units: seconds since 2020-01-01T00:00:00+0000
# lat:
# long_name: latitude
# units: degrees_north
# lon:
# long_name: longitude
# units: degrees_east
# fine_particulate_matter:
# long_name: fine_particulate_matter
# units: micrograms/m**3
# missing_value: -9999.0
# description: Particulate matter concentration with a diameter less"""
+ """ than 2.5 micrometers
# in units of micrograms per cubic meter.
# ozone:
# long_name: ozone
# units: ppbv
# missing_value: -9999.0
# description: Ozone mixing ratio in units of parts per billion.
# """
+ """
time,lat,lon,fine_particulate_matter,ozone
0.0,40.0,-98.0,15.0,70.0
0.0,40.0,-97.0,15.0,-9999.0
0.0,41.0,-98.0,15.0,70.0
0.0,41.0,-97.0,15.0,70.0
3600.0,40.0,-98.0,20.0,75.0
3600.0,40.0,-97.0,20.0,75.0
3600.0,41.0,-98.0,-9999.0,75.0
3600.0,41.0,-97.0,20.0,75.0
"""
)
def df2ds(df, meta=None):
    """
    Convert a pandas.DataFrame with metadata properties to an xarray.Dataset.

    Arguments
    ---------
    df : pandas.DataFrame
        Frame whose (multi-)index defines the output dimensions.
    meta : mappable
        Expected to have title, institution, source, history and comments.
        Additional properties:
        - dimensions (list of dimension variable names)
        - variables (dictionary of variable properties: long_name, units, etc)
        Defaults to df.attrs when None.

    Returns
    -------
    ds : xarray.Dataset
        Dataset with dimenions (dimensions or ['index']) with global and per
        variable properties from meta.
    """
    if meta is None:
        meta = df.attrs
    ds = df.to_xarray()
    # Assign per-variable attributes. Use .get so metadata without a
    # 'variables' section (e.g. a plain df.attrs) does not raise KeyError,
    # and skip metadata entries that name variables absent from the Dataset.
    for varkey, varattrs in meta.get('variables', {}).items():
        if varkey in ds:
            ds[varkey].attrs.update(varattrs)
    # Everything except the structural keys becomes a global attribute.
    attrs = {
        pk: pv for pk, pv in meta.items()
        if pk not in ('dimensions', 'variables')
    }
    ds.attrs.update(attrs)
    return ds
def ds2df(ds):
    """
    Convert an xarray.Dataset to a pandas.DataFrame whose attrs dictionary
    carries the Dataset's global attributes plus 'dimensions' (dimension
    names) and 'variables' (per-variable attribute dictionaries, excluding
    underscore-prefixed keys).
    """
    from collections import OrderedDict
    # Global attributes first, then structural metadata.
    meta = OrderedDict(ds.attrs)
    meta['dimensions'] = list(ds.sizes)
    # Coordinates first, then any data variables not already covered.
    varkeys = list(ds.coords)
    varkeys.extend(vk for vk in ds.data_vars if vk not in ds.coords)
    varprops = {}
    for vkey in varkeys:
        varprops[vkey] = {
            name: value
            for name, value in ds[vkey].attrs.items()
            if not name.startswith('_')
        }
    meta['variables'] = varprops
    df = ds.to_dataframe()
    df.attrs.update(meta)
    return df
def read_csv(path, as_dataset=False, comment='#', **kwds):
    """
    Reads CSV file and returns pandas.DataFrame with any metadata as properties
    in the attrs dictionary. Optionally, converts dataframe with properties to
    xarray.Dataset.

    Arguments
    ---------
    path : str
        Path to CSV with or without metadata.
    as_dataset : bool
        If False, read CSV w/ pandas.read_csv with metadata
        If True, CSV from is then converted to xarray.Dataset
    comment : str
        Comment prefix character passed to pandas.read_csv; the metadata
        header lines are expected to start with '# '.
    kwds: mappable
        Passed to pandas.read_csv as options.

    Returns
    -------
    out : pandas.DataFrame or xarray.Dataset
        If as_dataset is False, the dataframe is returned with metadata as
        attributes in the attrs property -- including dimensions and variables
        If as_dataset is True, the dataframe is converted to a dataset and
        properties are assigned to global and variables as appropriate.
    """
    import yaml
    import io
    import pandas as pd
    kwds['comment'] = comment
    metalines = []
    if isinstance(path, str):
        infile = open(path, mode='r')
        # We opened it, so we are responsible for closing it.
        needsclose = True
    elif isinstance(path, (io.TextIOBase, io.StringIO, io.BytesIO)):
        # NOTE(review): BytesIO.readline returns bytes, so the '# ' prefix
        # test below assumes a text-mode buffer -- confirm intended support.
        infile = path
        needsclose = False
    else:
        raise TypeError('path must be str, file, io.StringIO or io.BytesIO')
    try:
        # Collect the leading '# '-prefixed metadata header, then rewind so
        # pandas re-reads the whole stream (it skips comment lines itself).
        while True:
            _l = infile.readline()
            if _l.startswith('# '):
                metalines.append(_l[2:])
            else:
                break
        infile.seek(0, 0)
        df = pd.read_csv(infile, **kwds)
    finally:
        # Bug fix: the original left the file handle open (del != close).
        if needsclose:
            infile.close()
    # safe_load of an empty header returns None; normalize to a dict so the
    # .get calls below work for metadata-free CSVs.
    meta = yaml.safe_load(''.join(metalines)) or {}
    dims = meta.get('dimensions', [])
    if len(dims) > 0:
        df.set_index(dims, inplace=True)
    nan = float('nan')
    # Convert each variable's declared missing_value sentinel to NaN.
    for vark, varattrs in meta.get('variables', {}).items():
        if 'missing_value' in varattrs:
            df.replace({vark: varattrs['missing_value']}, nan, inplace=True)
    df.attrs.update(meta)
    if as_dataset is False:
        return df
    else:
        return df2ds(df)
def read_nc(path, as_dataset=True):
    """
    Open a NetCDF file, optionally converting it to a DataFrame.

    Arguments
    ---------
    path : str
    as_dataset : bool
        If True, return xarray.Dataset
        Otherwise, convert dataset to dataframe with properties a attrs

    Returns
    -------
    out : xarray.Dataset or pandas.DataFrame
        If as_dataset is True, return basic open of path.
        Otherwise, use the ds2df function to convert
    """
    import xarray as xr
    # decode_cf=False keeps raw values/attrs so the CSV round-trip is exact.
    ds = xr.open_dataset(path, decode_cf=False)
    return ds if as_dataset else ds2df(ds)
def write_output(d, outpath):
    """
    Write a DataFrame or Dataset to a CSV (with commented YAML metadata
    header) or NetCDF file.

    Arguments
    ---------
    d : pandas.DataFrame or xarray.Dataset
    outpath : str or buffer
        If out is str and endswith csv, file will be stored as a CSV file with
        metadata.
        Otherwise, file will be stored as a NetCDF file with metadata.
        Non-str buffers are always written as CSV.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If d is neither a pandas.DataFrame nor an xarray.Dataset (the
        original silently did nothing in that case).
    """
    import pandas as pd
    import xarray as xr
    import yaml
    import numpy as np

    def npfloat_representer(dumper, data):
        # PyYAML cannot represent numpy scalars; emit them as plain floats.
        return dumper.represent_float(data.item())

    # Re-registering on each call is redundant but harmless (idempotent).
    yaml.add_representer(np.float64, npfloat_representer)
    if isinstance(outpath, str):
        iscsv = outpath.endswith('.csv')
    else:
        # Buffers are always treated as CSV targets.
        iscsv = True
    if isinstance(d, pd.DataFrame):
        if iscsv:
            # Prefix every YAML line (including the trailing empty one, which
            # becomes '# ') so pandas can skip the header as comments.
            metatxt = '\n'.join([
                '# ' + _l
                for _l in yaml.dump(d.attrs, sort_keys=False).split('\n')
            ])
            if isinstance(outpath, str):
                outfile = open(outpath, 'w')
                needsclose = True
            else:
                outfile = outpath
                needsclose = False
            try:
                outfile.write(metatxt + '\n')
                outfile.flush()
                d.to_csv(outfile, na_rep=-9999.0)
            finally:
                # Bug fix: the original never closed a file it opened.
                if needsclose:
                    outfile.close()
        else:
            # NetCDF requested: convert to Dataset and recurse.
            write_output(df2ds(d), outpath)
    elif isinstance(d, xr.Dataset):
        if iscsv:
            # CSV requested: convert to DataFrame and recurse.
            write_output(ds2df(d), outpath)
        else:
            d.to_netcdf(outpath)
    else:
        raise TypeError('d must be a pandas.DataFrame or xarray.Dataset')
def exmample_round_trip():
    """
    Writes example_csv to disk in temporary directory.
    Reads example csv from disk.
    Writes example csv to disk as NetCDF.
    Reads example NetCDF and converts to new DataFrame.
    Writes new data frame to disk.
    Returns the text of the regenerated CSV for comparison to example_csv.

    NOTE(review): the name keeps the original 'exmample' spelling because it
    is the public interface (referenced by test_round_trip and the module
    docstring).
    """
    import tempfile
    with tempfile.TemporaryDirectory() as td:
        with open(f'{td}/example.csv', 'w') as efile:
            efile.write(example_csv)
        d1 = read_csv(f'{td}/example.csv', as_dataset=False)
        write_output(d1, f'{td}/example.nc')
        d2 = read_nc(f'{td}/example.nc', as_dataset=False)
        write_output(d2, f'{td}/new.csv')
        # Bug fix: close the handle explicitly instead of relying on garbage
        # collection of an anonymous open(...).read().
        with open(f'{td}/new.csv', 'r') as nfile:
            new_csv = nfile.read()
    return new_csv
def test_round_trip():
    """Verify the CSV -> NetCDF -> CSV round trip reproduces example_csv."""
    regenerated = exmample_round_trip()
    assert regenerated == example_csv
@barronh
Copy link
Author

barronh commented Dec 31, 2024

To test on Colab, run

!wget --output-document cfcsv.py https://gist.githubusercontent.com/barronh/7acedda3ab913ca9f3efbc57400ed34e/raw/cfcsv.py
!pip install coverage flake8
!coverage run -m pytest -v cfcsv.py && coverage report -m && flake8

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment