Last active
March 24, 2022 19:47
-
-
Save philippemiron/c6f1509568a01d61d4bbaf16153ffa2a to your computer and use it in GitHub Desktop.
preprocess for gdp stage-recipe
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from datetime import datetime | |
def decode_date(t): | |
''' | |
The date format is specified in 'seconds since 1970-01-01 00:00:00' but the missing values | |
are stored as -1e+34 which is not supported by the default parsing mechanism in xarray | |
This function returns replaced the missing valye by NaT and return a datetime object. | |
:param t: date | |
:return: datetime object | |
''' | |
nat_index = np.logical_or(np.isclose(t, -1e+34), np.isnan(t)) | |
t[nat_index] = np.nan | |
return np.array(pd.to_datetime(t, unit='s', origin='unix')) | |
def fill_values(var, default=np.nan): | |
''' | |
Change fill values (-1e+34, inf, -inf) in var array to value specifed by default | |
''' | |
missing_value = np.logical_or(np.isclose(var, -1e+34), ~np.isfinite(var)) | |
if np.any(missing_value): | |
var[missing_value] = default | |
return var | |
def str_to_float(value, default=np.nan): | |
''' | |
:param value: string | |
:return: bool | |
''' | |
try: | |
fvalue = float(value) | |
if np.isnan(fvalue): | |
return default | |
else: | |
return fvalue | |
except ValueError: | |
return default | |
def cut_str(value, max_length): | |
''' | |
Cut a string to a specific length. | |
:param value: string | |
max_length: length of the output | |
:return: string with max_length chars | |
''' | |
charar = np.chararray(1, max_length) | |
charar[:max_length] = value | |
return charar | |
def drogue_presence(lost_time, time): | |
''' | |
Create drogue status from the drogue lost time and the trajectory time | |
:params lost_time: timestamp of the drogue loss (or NaT) | |
time[obs]: observation time | |
:return: bool[obs]: 1 drogued, 0 undrogued | |
''' | |
if pd.isnull(lost_time) or lost_time >= time[-1]: | |
return np.ones_like(time, dtype='bool') | |
else: | |
return time < lost_time | |
def preprocess_gdp(ds): | |
# parse the date with custom function | |
ds.update({'deploy_date': (['traj'], decode_date(np.array([ds.deploy_date.data[0]])))}) | |
ds.update({'end_date': (['traj'], decode_date(np.array([ds.end_date.data[0]])))}) | |
ds.update({'drogue_lost_date': (['traj'], decode_date(np.array([ds.drogue_lost_date.data[0]])))}) | |
ds.update({'time': (['obs'], decode_date(ds.time.data[0]))}) | |
# convert fill values to nan | |
ds['sst'].data = fill_values(ds['sst'].data) | |
ds['sst1'].data = fill_values(ds['sst1'].data) | |
ds['sst2'].data = fill_values(ds['sst2'].data) | |
ds['err_sst'].data = fill_values(ds['err_sst'].data) | |
ds['err_sst1'].data = fill_values(ds['err_sst1'].data) | |
ds['err_sst2'].data = fill_values(ds['err_sst2'].data) | |
# convert attributes to variable | |
ds['location_type'] = (('traj'), [False if ds.location_type == 'Argos' else True]) # 0 for Argos, 1 for GPS | |
ds['DeployingShip'] = (('traj'), cut_str(ds.DeployingShip, 15)) | |
ds['DeploymentStatus'] = (('traj'), cut_str(ds.DeploymentStatus, 15)) | |
ds['BuoyTypeManufacturer'] = (('traj'), cut_str(ds.BuoyTypeManufacturer, 15)) | |
ds['BuoyTypeSensorArray'] = (('traj'), cut_str(ds.BuoyTypeSensorArray, 15)) | |
ds['CurrentProgram'] = (('traj'), np.int32([str_to_float(ds.CurrentProgram, -1)])) | |
ds['PurchaserFunding'] = (('traj'), cut_str(ds.PurchaserFunding, 15)) | |
ds['SensorUpgrade'] = (('traj'), cut_str(ds.SensorUpgrade, 15)) | |
ds['Transmissions'] = (('traj'), cut_str(ds.Transmissions, 15)) | |
ds['DeployingCountry'] = (('traj'), cut_str(ds.DeployingCountry, 15)) | |
ds['DeploymentComments'] = (('traj'), cut_str(ds.DeploymentComments, 15)) | |
ds['ManufactureYear'] = (('traj'), np.int16([str_to_float(ds.ManufactureYear, -1)])) | |
ds['ManufactureMonth'] = (('traj'), np.int16([str_to_float(ds.ManufactureMonth, -1)])) | |
ds['ManufactureSensorType'] = (('traj'), cut_str(ds.ManufactureSensorType, 15)) | |
ds['ManufactureVoltage'] = (('traj'), np.int16([str_to_float(ds.ManufactureVoltage[:-6], -1)])) # e.g. 56 V | |
ds['FloatDiameter'] = (('traj'), [str_to_float(ds.FloatDiameter[:-3])]) # e.g. 35.5 cm | |
ds['SubsfcFloatPresence'] = (('traj'), np.array([str_to_float(ds.SubsfcFloatPresence)], dtype='bool')) | |
ds['DrogueType'] = (('traj'), cut_str(ds.DrogueType, 7)) | |
ds['DrogueLength'] = (('traj'), [str_to_float(ds.DrogueLength[:-2])]) # e.g. 4.8 m | |
ds['DrogueBallast'] = (('traj'), [str_to_float(ds.DrogueBallast[:-3])]) # e.g. 1.4 kg | |
ds['DragAreaAboveDrogue'] = (('traj'), [str_to_float(ds.DragAreaAboveDrogue[:-4])]) # 10.66 m^2 | |
ds['DragAreaOfDrogue'] = (('traj'), [str_to_float(ds.DragAreaOfDrogue[:-4])]) # e.g. 416.6 m^2 | |
ds['DragAreaRatio'] = (('traj'), [str_to_float(ds.DragAreaRatio)]) # e.g. 39.08 | |
ds['DrogueCenterDepth'] = (('traj'), [str_to_float(ds.DrogueCenterDepth[:-2])]) # e.g. 15.0 m | |
ds['DrogueDetectSensor'] = (('traj'), cut_str(ds.DrogueDetectSensor, 15)) | |
# convert type of some variable | |
ds['ID'] = ds['ID'].astype('int64') | |
ds['WMO'] = ds['WMO'].astype('int32') | |
ds['expno'] = ds['expno'].astype('int32') | |
ds['typedeath'] = ds['typedeath'].astype('int8') | |
ds['CurrentProgram'] = ds['CurrentProgram'].astype('int32') | |
ds['flg_sst'] = ds['flg_sst'].astype('int8') | |
ds['flg_sst1'] = ds['flg_sst1'].astype('int8') | |
ds['flg_sst2'] = ds['flg_sst2'].astype('int8') | |
# new variables | |
ds['ids'] = (['obs'], np.repeat(ds.ID.values, ds.dims['obs']), | |
{'long_name': "Trajectory index of vars['traj'] for all observations", 'units': '-'}) | |
ds['drogue_status'] = (['obs'], drogue_presence(ds.drogue_lost_date.data, ds.time.data), | |
{'long_name': 'Status indicating the presence of the drogue', | |
'units':'-', 'flag_values':'1,0', 'flag_meanings': 'drogued, undrogued'}) | |
# clean up global attributes | |
remove_attrs = ['id', 'location_type', 'wmo_platform_code', | |
'ncei_template_version', 'cdm_data_type', 'featureType', | |
'time_coverage_start', 'time_coverage_end', | |
'geospatial_lat_min', 'geospatial_lat_max', | |
'geospatial_lat_units', 'geospatial_lat_resolution', | |
'geospatial_lon_min', 'geospatial_lon_max', | |
'geospatial_lon_units', 'geospatial_lon_resolution', | |
'geospatial_bounds', 'date_created', 'date_modified', | |
'date_issued', 'licence', 'source', 'uuid', 'naming_authority', | |
'creator_name', 'creator_url', 'creator_email', 'comment', | |
'standard_name_vocabulary', 'instrument_vocabulary', 'keywords_vocabulary', | |
'keywords', 'DeployingShip', 'DeploymentStatus', 'BuoyTypeManufacturer', | |
'BuoyTypeSensorArray', 'CurrentProgram', 'PurchaserFunding', 'SensorUpgrade', | |
'Transmissions', 'DeployingCountry', 'DeploymentComments', 'ManufactureYear', | |
'ManufactureMonth', 'ManufactureSensorType', 'ManufactureVoltage', 'FloatDiameter', | |
'SubsfcFloatPresence', 'DrogueType', 'DrogueLength', 'DrogueBallast', 'DragAreaAboveDrogue', | |
'DragAreaOfDrogue', 'DragAreaRatio', 'DrogueCenterDepth', 'DrogueDetectSensor', | |
'interpolation_method', 'imei'] | |
for name in remove_attrs: | |
del ds.attrs[name] | |
# add attributes | |
attrs = { | |
'title': 'Global Drifter Program hourly drifting buoy collection', | |
'history': 'Version 2.00. Metadata from dirall.dat and deplog.dat', | |
'Conventions': 'CF-1.6', | |
'date_created': datetime.now().isoformat(), | |
'publisher_name': 'GDP Drifter DAC', | |
'publisher_email': 'aoml.dftr@noaa.gov', | |
'publisher_url': 'https://www.aoml.noaa.gov/phod/gdp', | |
'licence': 'MIT License', | |
'processing_level': 'Level 2 QC by GDP drifter DAC', | |
'metadata_link': 'https://www.aoml.noaa.gov/phod/dac/dirall.html', | |
'contributor_name': 'NOAA Global Drifter Program', | |
'contributor_role': 'Data Acquisition Center', | |
'institution': 'NOAA Atlantic Oceanographic and Meteorological Laboratory', | |
'acknowledgement': 'Elipot et al. (2022) to be submitted. Elipot et al. (2016). Global Drifter Program quality-controlled hourly interpolated data from ocean surface drifting buoys, version 2.00. NOAA National Centers for Environmental Information. https://agupubs.onlinelibrary.wiley.com/doi/full/10.1002/2016JC011716TBA. Accessed [date].', | |
'summary': 'Global Drifter Program hourly data' | |
} | |
ds.attrs = attrs | |
# set date encoding | |
ds.deploy_date.encoding['units'] = 'seconds since 1970-01-01 00:00:00' | |
ds.end_date.encoding['units'] = 'seconds since 1970-01-01 00:00:00' | |
ds.drogue_lost_date.encoding['units'] = 'seconds since 1970-01-01 00:00:00' | |
ds.time.encoding['units'] = 'seconds since 1970-01-01 00:00:00' | |
# remove vars | |
ds = ds.drop_vars('lon360') | |
# set coordinates | |
ds = ds.set_coords(['longitude', 'latitude', 'time', 'ids']) # ID | |
# normally ID would be in coords but here we are removing the 'traj' dimension | |
ds = ds.drop_vars('ID') | |
# keep only [obs] variable for now | |
ds = ds.isel(traj=0) | |
ds = ds[['longitude', 'latitude', 'time', 'ids', | |
've', 'vn', 'err_lat', 'err_lon', 'err_ve', 'err_vn', 'gap', | |
'sst', 'sst1', 'sst2', 'err_sst', 'err_sst1', 'err_sst2', | |
'flg_sst', 'flg_sst1', 'flg_sst2', 'drogue_status']] | |
return ds |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment