Last active
December 31, 2024 21:33
-
-
Save barronh/7acedda3ab913ca9f3efbc57400ed34e to your computer and use it in GitHub Desktop.
Climate Forecasting CSV
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__version__ = '0.1.0'
__doc__ = """
Contents
========
df2ds : function
    convert pandas.DataFrame with properties to xarray.Dataset
ds2df : function
    convert xarray.Dataset to pandas.DataFrame with properties
read_csv : function
    read CSV with metadata as attrs and optionally convert to Dataset
read_nc : function
    read NetCDF and optionally convert to a Dataframe with properties
write_output : function
    write Dataset or Dataframe to file as CSV or NetCDF
exmample_round_trip : function
    Returns text generated by converting example_csv to netCDF and back
test_round_trip : function
    Test all functionality (df2ds, ds2df, read_csv, read_nc, write_output)
example_csv : str
    Text with an example CSV with properties.

Example
=======
The example assumes the existence of an input file example.csv. From that, it
reads the data in, converts it to a netCDF file, saves it out, reads in the
netCDF file and saves it as a CSV. The CSV content matches the original file.

    open('example.csv', 'w').write(example_csv)
    d1 = read_csv('example.csv', as_dataset=False)
    write_output(d1, 'example.nc')
    d2 = read_nc('example.nc', as_dataset=False)
    write_output(d2, 'new.csv')
    d3 = read_csv('new.csv', as_dataset=False)
    print(open('example.csv', 'r').read() == open('new.csv', 'r').read())
    # Output: True

This basic round-trip is also coded up in test_round_trip.
"""
# Example CSV content: a YAML metadata header where every line is prefixed
# with '# ', followed by a plain CSV table. -9999.0 is the missing_value
# sentinel declared per-variable in the metadata.
# NOTE(review): the nested YAML indentation below ('#   time:' etc.) is
# required for yaml.safe_load and for the round trip against yaml.dump
# output -- confirm it matches the original file if this was re-typed.
example_csv = (
    """# title: Example YAML metadata
# institution: US EPA
# source: example v0.0
# history: '2024-12-31T14:01:00-0500: example metadata updated; """
    + """2024-12-31T12:33:00-0500:
#   example created file;'
# references: 'Climate and Forecast Conventions version 1.12 DOI: 10.5281/"""
    + """zenodo.14275599'
# comment: This text could be included as a comments as the first lines of"""
    + """ a CSV that
#   contains two variables called fine_particulate_matter and ozone as a """
    + """function of
#   time, latitude, and longitude
# dimensions:
# - time
# - lat
# - lon
# variables:
#   time:
#     long_name: time
#     units: seconds since 2020-01-01T00:00:00+0000
#   lat:
#     long_name: latitude
#     units: degrees_north
#   lon:
#     long_name: longitude
#     units: degrees_east
#   fine_particulate_matter:
#     long_name: fine_particulate_matter
#     units: micrograms/m**3
#     missing_value: -9999.0
#     description: Particulate matter concentration with a diameter less"""
    + """ than 2.5 micrometers
#       in units of micrograms per cubic meter.
#   ozone:
#     long_name: ozone
#     units: ppbv
#     missing_value: -9999.0
#     description: Ozone mixing ratio in units of parts per billion.
# """
    + """
time,lat,lon,fine_particulate_matter,ozone
0.0,40.0,-98.0,15.0,70.0
0.0,40.0,-97.0,15.0,-9999.0
0.0,41.0,-98.0,15.0,70.0
0.0,41.0,-97.0,15.0,70.0
3600.0,40.0,-98.0,20.0,75.0
3600.0,40.0,-97.0,20.0,75.0
3600.0,41.0,-98.0,-9999.0,75.0
3600.0,41.0,-97.0,20.0,75.0
"""
)
def df2ds(df, meta=None):
    """
    Convert a pandas.DataFrame with metadata properties to an xarray.Dataset.

    Arguments
    ---------
    df : pandas.DataFrame
        Frame whose index supplies the Dataset dimensions.
    meta : mappable or None
        Expected to have title, institution, source, history and comments.
        Additional properties:
        - dimensions (list of dimension variable names)
        - variables (dictionary of variable properties: long_name, units, etc)
        If None, df.attrs is used.

    Returns
    -------
    ds : xarray.Dataset
        Dataset with dimenions (dimensions or ['index']) with global and per
        variable properties from meta.
    """
    if meta is None:
        meta = df.attrs
    ds = df.to_xarray()
    # Attach per-variable attributes; tolerate metadata without a 'variables'
    # section and entries describing variables absent from the frame.
    for varkey, varattrs in meta.get('variables', {}).items():
        if varkey in ds:
            ds[varkey].attrs.update(varattrs)
    # Everything except the structural keys becomes a global attribute.
    attrs = {
        k: v for k, v in meta.items() if k not in ('dimensions', 'variables')
    }
    ds.attrs.update(attrs)
    return ds
def ds2df(ds):
    """
    Convert an xarray.Dataset to a pandas.DataFrame, preserving metadata.

    Arguments
    ---------
    ds : xarray.Dataset

    Returns
    -------
    df : pandas.DataFrame
        Frame indexed by the Dataset dimensions. Global attributes, a
        'dimensions' list and a per-variable 'variables' mapping are stored
        in df.attrs so the frame can round-trip through df2ds.
    """
    # Plain dict preserves insertion order (guaranteed since Python 3.7),
    # so OrderedDict is unnecessary here.
    meta = dict(ds.attrs)
    meta['dimensions'] = list(ds.sizes)
    # Record coordinates first, then any remaining data variables; attribute
    # keys beginning with '_' (e.g. _FillValue) are dropped.
    varkeys = list(ds.coords) + [k for k in ds.data_vars if k not in ds.coords]
    meta['variables'] = {
        vkey: {
            pk: pv for pk, pv in ds[vkey].attrs.items()
            if not pk.startswith('_')
        }
        for vkey in varkeys
    }
    df = ds.to_dataframe()
    df.attrs.update(meta)
    return df
def read_csv(path, as_dataset=False, comment='#', **kwds):
    """
    Reads CSV file and returns pandas.DataFrame with any metadata as properties
    in the attrs dictionary. Optionally, converts dataframe with properties to
    xarray.Dataset.

    Arguments
    ---------
    path : str or file-like
        Path to CSV with or without metadata, or an open text/bytes buffer.
    as_dataset : bool
        If False, read CSV w/ pandas.read_csv with metadata
        If True, CSV from is then converted to xarray.Dataset
    comment : str
        Comment character; header metadata lines must start with
        comment + ' ' and are parsed as YAML.
    kwds: mappable
        Passed to pandas.read_csv as options.

    Returns
    -------
    out : pandas.DataFrame or xarray.Dataset
        If as_dataset is False, the dataframe is returned with metadata as
        attributes in the attrs property -- including dimensions and variables
        If as_dataset is True, the dataframe is converted to a dataset and
        properties are assigned to global and variables as appropriate.
    """
    import yaml
    import io
    import pandas as pd
    kwds['comment'] = comment
    if isinstance(path, str):
        infile = open(path, mode='r')
        # We opened it, so we are responsible for closing it.
        closeme = True
    elif isinstance(path, (io.TextIOBase, io.StringIO, io.BytesIO)):
        infile = path
        closeme = False
    else:
        raise TypeError('path must be str, file, io.StringIO or io.BytesIO')
    try:
        # Collect the leading metadata lines; use the caller's comment
        # character rather than a hardcoded '#'.
        prefix = comment + ' '
        metalines = []
        while True:
            _l = infile.readline()
            if isinstance(_l, bytes):
                # io.BytesIO yields bytes; decode so startswith works.
                _l = _l.decode()
            if _l.startswith(prefix):
                metalines.append(_l[len(prefix):])
            else:
                break
        # Rewind so pandas re-reads from the top (it skips comment lines).
        infile.seek(0, 0)
        df = pd.read_csv(infile, **kwds)
    finally:
        if closeme:
            infile.close()
    metatxt = ''.join(metalines)
    # safe_load returns None for empty input; fall back to an empty mapping.
    meta = yaml.safe_load(metatxt) or {}
    dims = meta.get('dimensions', [])
    if len(dims) > 0:
        df.set_index(dims, inplace=True)
    # Replace per-variable missing_value sentinels with NaN.
    nan = float('nan')
    for vark, varattrs in meta.get('variables', {}).items():
        if 'missing_value' in varattrs:
            df.replace({vark: varattrs['missing_value']}, nan, inplace=True)
    df.attrs.update(meta)
    if as_dataset is False:
        return df
    else:
        return df2ds(df)
def read_nc(path, as_dataset=True):
    """
    Read a NetCDF file without CF decoding, optionally as a DataFrame.

    Arguments
    ---------
    path : str
    as_dataset : bool
        If True, return xarray.Dataset
        Otherwise, convert dataset to dataframe with properties a attrs

    Returns
    -------
    out : xarray.Dataset or pandas.DataFrame
        If as_dataset is True, return basic open of path.
        Otherwise, use the ds2df function to convert
    """
    import xarray as xr
    # decode_cf=False keeps raw values and attributes (e.g. missing_value)
    # intact so ds2df/write_output can round-trip them.
    dataset = xr.open_dataset(path, decode_cf=False)
    return dataset if as_dataset else ds2df(dataset)
def write_output(d, outpath):
    """
    Write a DataFrame or Dataset to CSV (with YAML metadata header) or NetCDF.

    Arguments
    ---------
    d : pandas.DataFrame or xarray.Dataset
    outpath : str or buffer
        If out is str and endswith csv, file will be stored as a CSV file with
        metadata. Otherwise, file will be stored as a NetCDF file with
        metadata. Non-str buffers are always written as CSV.

    Returns
    -------
    None
    """
    import pandas as pd
    import xarray as xr
    import yaml
    import numpy as np

    def npfloat_representer(dumper, data):
        # yaml cannot serialize numpy scalars directly; unwrap to Python float
        return dumper.represent_float(data.item())

    yaml.add_representer(np.float64, npfloat_representer)
    if isinstance(outpath, str):
        iscsv = outpath.endswith('.csv')
    else:
        # Buffers are always written as CSV.
        iscsv = True
    if isinstance(d, pd.DataFrame):
        if not iscsv:
            # Convert to a Dataset for NetCDF output.
            write_output(df2ds(d), outpath)
            return
        # Prefix every YAML line with '# '; the final '# ' line (from the
        # trailing newline of yaml.dump) separates metadata from data.
        metatxt = '\n'.join([
            '# ' + _l
            for _l in yaml.dump(d.attrs, sort_keys=False).split('\n')
        ])
        if isinstance(outpath, str):
            outfile = open(outpath, 'w')
            # We opened the handle, so we must close it.
            closeme = True
        else:
            outfile = outpath
            closeme = False
        try:
            outfile.write(metatxt + '\n')
            outfile.flush()
            # na_rep is documented as a str; '-9999.0' matches the
            # missing_value sentinel used in the metadata.
            d.to_csv(outfile, na_rep='-9999.0')
        finally:
            if closeme:
                outfile.close()
    elif isinstance(d, xr.Dataset):
        if iscsv:
            write_output(ds2df(d), outpath)
        else:
            d.to_netcdf(outpath)
def exmample_round_trip():
    """
    Round-trip example_csv through NetCDF and back to CSV text.

    Writes example_csv to disk in temporary directory.
    Reads example csv from disk.
    Writes example csv to disk as NetCDF.
    Reads example NetCDF and converts to new DataFrame.
    Writes new data frame to disk.

    Returns
    -------
    new_csv : str
        Content of the regenerated CSV; equal to example_csv when the
        round trip is lossless (compared by test_round_trip).
    """
    import tempfile
    with tempfile.TemporaryDirectory() as td:
        with open(f'{td}/example.csv', 'w') as efile:
            efile.write(example_csv)
        d1 = read_csv(f'{td}/example.csv', as_dataset=False)
        write_output(d1, f'{td}/example.nc')
        d2 = read_nc(f'{td}/example.nc', as_dataset=False)
        write_output(d2, f'{td}/new.csv')
        # Close the handle deterministically rather than relying on GC.
        with open(f'{td}/new.csv', 'r') as nfile:
            new_csv = nfile.read()
    return new_csv
def test_round_trip():
    """Assert the CSV -> NetCDF -> CSV round trip reproduces example_csv."""
    regenerated = exmample_round_trip()
    assert regenerated == example_csv
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
To test on Colab, run