Skip to content

Instantly share code, notes, and snippets.

@philippemiron
Last active March 24, 2022 19:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save philippemiron/c6f1509568a01d61d4bbaf16153ffa2a to your computer and use it in GitHub Desktop.
Save philippemiron/c6f1509568a01d61d4bbaf16153ffa2a to your computer and use it in GitHub Desktop.
preprocess for gdp stage-recipe
import numpy as np
import pandas as pd
from datetime import datetime
def decode_date(t):
'''
The date format is specified in 'seconds since 1970-01-01 00:00:00' but the missing values
are stored as -1e+34 which is not supported by the default parsing mechanism in xarray
This function returns replaced the missing valye by NaT and return a datetime object.
:param t: date
:return: datetime object
'''
nat_index = np.logical_or(np.isclose(t, -1e+34), np.isnan(t))
t[nat_index] = np.nan
return np.array(pd.to_datetime(t, unit='s', origin='unix'))
def fill_values(var, default=np.nan):
'''
Change fill values (-1e+34, inf, -inf) in var array to value specifed by default
'''
missing_value = np.logical_or(np.isclose(var, -1e+34), ~np.isfinite(var))
if np.any(missing_value):
var[missing_value] = default
return var
def str_to_float(value, default=np.nan):
'''
:param value: string
:return: bool
'''
try:
fvalue = float(value)
if np.isnan(fvalue):
return default
else:
return fvalue
except ValueError:
return default
def cut_str(value, max_length):
'''
Cut a string to a specific length.
:param value: string
max_length: length of the output
:return: string with max_length chars
'''
charar = np.chararray(1, max_length)
charar[:max_length] = value
return charar
def drogue_presence(lost_time, time):
'''
Create drogue status from the drogue lost time and the trajectory time
:params lost_time: timestamp of the drogue loss (or NaT)
time[obs]: observation time
:return: bool[obs]: 1 drogued, 0 undrogued
'''
if pd.isnull(lost_time) or lost_time >= time[-1]:
return np.ones_like(time, dtype='bool')
else:
return time < lost_time
def preprocess_gdp(ds):
# parse the date with custom function
ds.update({'deploy_date': (['traj'], decode_date(np.array([ds.deploy_date.data[0]])))})
ds.update({'end_date': (['traj'], decode_date(np.array([ds.end_date.data[0]])))})
ds.update({'drogue_lost_date': (['traj'], decode_date(np.array([ds.drogue_lost_date.data[0]])))})
ds.update({'time': (['obs'], decode_date(ds.time.data[0]))})
# convert fill values to nan
ds['sst'].data = fill_values(ds['sst'].data)
ds['sst1'].data = fill_values(ds['sst1'].data)
ds['sst2'].data = fill_values(ds['sst2'].data)
ds['err_sst'].data = fill_values(ds['err_sst'].data)
ds['err_sst1'].data = fill_values(ds['err_sst1'].data)
ds['err_sst2'].data = fill_values(ds['err_sst2'].data)
# convert attributes to variable
ds['location_type'] = (('traj'), [False if ds.location_type == 'Argos' else True]) # 0 for Argos, 1 for GPS
ds['DeployingShip'] = (('traj'), cut_str(ds.DeployingShip, 15))
ds['DeploymentStatus'] = (('traj'), cut_str(ds.DeploymentStatus, 15))
ds['BuoyTypeManufacturer'] = (('traj'), cut_str(ds.BuoyTypeManufacturer, 15))
ds['BuoyTypeSensorArray'] = (('traj'), cut_str(ds.BuoyTypeSensorArray, 15))
ds['CurrentProgram'] = (('traj'), np.int32([str_to_float(ds.CurrentProgram, -1)]))
ds['PurchaserFunding'] = (('traj'), cut_str(ds.PurchaserFunding, 15))
ds['SensorUpgrade'] = (('traj'), cut_str(ds.SensorUpgrade, 15))
ds['Transmissions'] = (('traj'), cut_str(ds.Transmissions, 15))
ds['DeployingCountry'] = (('traj'), cut_str(ds.DeployingCountry, 15))
ds['DeploymentComments'] = (('traj'), cut_str(ds.DeploymentComments, 15))
ds['ManufactureYear'] = (('traj'), np.int16([str_to_float(ds.ManufactureYear, -1)]))
ds['ManufactureMonth'] = (('traj'), np.int16([str_to_float(ds.ManufactureMonth, -1)]))
ds['ManufactureSensorType'] = (('traj'), cut_str(ds.ManufactureSensorType, 15))
ds['ManufactureVoltage'] = (('traj'), np.int16([str_to_float(ds.ManufactureVoltage[:-6], -1)])) # e.g. 56 V
ds['FloatDiameter'] = (('traj'), [str_to_float(ds.FloatDiameter[:-3])]) # e.g. 35.5 cm
ds['SubsfcFloatPresence'] = (('traj'), np.array([str_to_float(ds.SubsfcFloatPresence)], dtype='bool'))
ds['DrogueType'] = (('traj'), cut_str(ds.DrogueType, 7))
ds['DrogueLength'] = (('traj'), [str_to_float(ds.DrogueLength[:-2])]) # e.g. 4.8 m
ds['DrogueBallast'] = (('traj'), [str_to_float(ds.DrogueBallast[:-3])]) # e.g. 1.4 kg
ds['DragAreaAboveDrogue'] = (('traj'), [str_to_float(ds.DragAreaAboveDrogue[:-4])]) # 10.66 m^2
ds['DragAreaOfDrogue'] = (('traj'), [str_to_float(ds.DragAreaOfDrogue[:-4])]) # e.g. 416.6 m^2
ds['DragAreaRatio'] = (('traj'), [str_to_float(ds.DragAreaRatio)]) # e.g. 39.08
ds['DrogueCenterDepth'] = (('traj'), [str_to_float(ds.DrogueCenterDepth[:-2])]) # e.g. 15.0 m
ds['DrogueDetectSensor'] = (('traj'), cut_str(ds.DrogueDetectSensor, 15))
# convert type of some variable
ds['ID'] = ds['ID'].astype('int64')
ds['WMO'] = ds['WMO'].astype('int32')
ds['expno'] = ds['expno'].astype('int32')
ds['typedeath'] = ds['typedeath'].astype('int8')
ds['CurrentProgram'] = ds['CurrentProgram'].astype('int32')
ds['flg_sst'] = ds['flg_sst'].astype('int8')
ds['flg_sst1'] = ds['flg_sst1'].astype('int8')
ds['flg_sst2'] = ds['flg_sst2'].astype('int8')
# new variables
ds['ids'] = (['obs'], np.repeat(ds.ID.values, ds.dims['obs']),
{'long_name': "Trajectory index of vars['traj'] for all observations", 'units': '-'})
ds['drogue_status'] = (['obs'], drogue_presence(ds.drogue_lost_date.data, ds.time.data),
{'long_name': 'Status indicating the presence of the drogue',
'units':'-', 'flag_values':'1,0', 'flag_meanings': 'drogued, undrogued'})
# clean up global attributes
remove_attrs = ['id', 'location_type', 'wmo_platform_code',
'ncei_template_version', 'cdm_data_type', 'featureType',
'time_coverage_start', 'time_coverage_end',
'geospatial_lat_min', 'geospatial_lat_max',
'geospatial_lat_units', 'geospatial_lat_resolution',
'geospatial_lon_min', 'geospatial_lon_max',
'geospatial_lon_units', 'geospatial_lon_resolution',
'geospatial_bounds', 'date_created', 'date_modified',
'date_issued', 'licence', 'source', 'uuid', 'naming_authority',
'creator_name', 'creator_url', 'creator_email', 'comment',
'standard_name_vocabulary', 'instrument_vocabulary', 'keywords_vocabulary',
'keywords', 'DeployingShip', 'DeploymentStatus', 'BuoyTypeManufacturer',
'BuoyTypeSensorArray', 'CurrentProgram', 'PurchaserFunding', 'SensorUpgrade',
'Transmissions', 'DeployingCountry', 'DeploymentComments', 'ManufactureYear',
'ManufactureMonth', 'ManufactureSensorType', 'ManufactureVoltage', 'FloatDiameter',
'SubsfcFloatPresence', 'DrogueType', 'DrogueLength', 'DrogueBallast', 'DragAreaAboveDrogue',
'DragAreaOfDrogue', 'DragAreaRatio', 'DrogueCenterDepth', 'DrogueDetectSensor',
'interpolation_method', 'imei']
for name in remove_attrs:
del ds.attrs[name]
# add attributes
attrs = {
'title': 'Global Drifter Program hourly drifting buoy collection',
'history': 'Version 2.00. Metadata from dirall.dat and deplog.dat',
'Conventions': 'CF-1.6',
'date_created': datetime.now().isoformat(),
'publisher_name': 'GDP Drifter DAC',
'publisher_email': 'aoml.dftr@noaa.gov',
'publisher_url': 'https://www.aoml.noaa.gov/phod/gdp',
'licence': 'MIT License',
'processing_level': 'Level 2 QC by GDP drifter DAC',
'metadata_link': 'https://www.aoml.noaa.gov/phod/dac/dirall.html',
'contributor_name': 'NOAA Global Drifter Program',
'contributor_role': 'Data Acquisition Center',
'institution': 'NOAA Atlantic Oceanographic and Meteorological Laboratory',
'acknowledgement': 'Elipot et al. (2022) to be submitted. Elipot et al. (2016). Global Drifter Program quality-controlled hourly interpolated data from ocean surface drifting buoys, version 2.00. NOAA National Centers for Environmental Information. https://agupubs.onlinelibrary.wiley.com/doi/full/10.1002/2016JC011716TBA. Accessed [date].',
'summary': 'Global Drifter Program hourly data'
}
ds.attrs = attrs
# set date encoding
ds.deploy_date.encoding['units'] = 'seconds since 1970-01-01 00:00:00'
ds.end_date.encoding['units'] = 'seconds since 1970-01-01 00:00:00'
ds.drogue_lost_date.encoding['units'] = 'seconds since 1970-01-01 00:00:00'
ds.time.encoding['units'] = 'seconds since 1970-01-01 00:00:00'
# remove vars
ds = ds.drop_vars('lon360')
# set coordinates
ds = ds.set_coords(['longitude', 'latitude', 'time', 'ids']) # ID
# normally ID would be in coords but here we are removing the 'traj' dimension
ds = ds.drop_vars('ID')
# keep only [obs] variable for now
ds = ds.isel(traj=0)
ds = ds[['longitude', 'latitude', 'time', 'ids',
've', 'vn', 'err_lat', 'err_lon', 'err_ve', 'err_vn', 'gap',
'sst', 'sst1', 'sst2', 'err_sst', 'err_sst1', 'err_sst2',
'flg_sst', 'flg_sst1', 'flg_sst2', 'drogue_status']]
return ds
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment