Created
January 31, 2024 00:43
-
-
Save robbibt/13823d0f860c54360d84ee283fc610c1 to your computer and use it in GitHub Desktop.
GESLA and ABSLMP tide gauge loading functions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob | |
import warnings | |
import datetime | |
from odc.geo.geom import BoundingBox | |
def _load_gauge_metadata(metadata_path): | |
# Load metadata | |
metadata_df = pd.read_csv(metadata_path) | |
metadata_df.columns = ( | |
metadata_df.columns.str.replace(" ", "_", regex=False) | |
.str.replace("(", "", regex=False) | |
.str.replace(")", "", regex=False) | |
.str.replace("/", "_", regex=False) | |
.str.lower() | |
) | |
metadata_df = metadata_df.set_index("site_code") | |
# Convert metadata to GeoDataFrame | |
metadata_gdf = gpd.GeoDataFrame( | |
data=metadata_df, | |
geometry=gpd.points_from_xy(metadata_df.longitude, metadata_df.latitude), | |
crs="EPSG:4326", | |
) | |
return metadata_df, metadata_gdf | |
def tide_gauge_abslmp( | |
x=None, | |
y=None, | |
site_code=None, | |
time=("2020", "2021"), | |
ahd=True, | |
site_metadata=True, | |
data_path="/gdata1/data/sea_level/abslmp/", | |
metadata_path="/gdata1/data/sea_level/ABSLMP_station_metadata_v2.csv", | |
): | |
""" | |
Load and process Australian Baseline Sea Level Monitoring Program | |
(ABSLMP) tide gauge data. | |
Parameters | |
---------- | |
x, y : tuple, optional | |
Tuples defining the x and y bounding box within which to load | |
tide gauge data, in WGS84 (degrees latitude, longitude) units. | |
Leave as None if providing a list of site codes using 'site_code'. | |
site_code : str or list of str, optional | |
ABSLMP site code(s) for which to load data. If provided, 'x' and | |
'y' will be ignored. | |
time : tuple or list of str, optional | |
Time range to consider, given as a tuple of start and end years. | |
If None, will default to all tide observations from 1991 onward. | |
Default is ("2020", "2021"). | |
ahd : bool, optional | |
Whether to correct sea level to Australian Height Datum (AHD). | |
Default is True. | |
site_metadata : bool, optional | |
Whether to add tide gauge station metadata as additional columns | |
in the output DataFrame. Defaults to True. | |
data_path : str, optional | |
Path to the raw ABSLMP data files. Default is | |
"/gdata1/data/sea_level/abslmp/". | |
metadata_path : str, optional | |
Path to the ABSLMP station metadata file. | |
Default is "/gdata1/data/sea_level/ABSLMP_station_metadata_v2.csv". | |
Returns | |
------- | |
pd.DataFrame | |
Processed ABSLMP data as a DataFrame with columns including: | |
"time": Timestamps, | |
"sea_level": Observed sea level (m), | |
"residuals": Residuals data (m), | |
and additional columns from station metadata. | |
""" | |
def _load_abslmp_dataset(path, na_value): | |
abslmp_df = ( | |
pd.read_csv( | |
path, | |
parse_dates=[" Date & UTC Time"], | |
na_values=na_value, | |
usecols=[" Date & UTC Time", "Sea Level", "Residuals"], | |
) | |
.rename( | |
{ | |
" Date & UTC Time": "time", | |
"Sea Level": "sea_level", | |
"Residuals": "residuals", | |
}, | |
axis=1, | |
) | |
.assign(site_code=path[-17:-9]) | |
.set_index("time") | |
) | |
return abslmp_df | |
# Load tide gauge metadata | |
metadata_df, metadata_gdf = _load_gauge_metadata(metadata_path) | |
# Use supplied site codes if available | |
if site_code is not None: | |
site_code = [site_code] if isinstance(site_code, str) else site_code | |
# Otherwise, use xy bounds to identify sites | |
elif x is not None: | |
bbox = BoundingBox.from_xy(x, y) | |
site_code = metadata_gdf.cx[ | |
bbox.left : bbox.right, bbox.top : bbox.bottom | |
].index | |
# Otherwise, return all available site codes | |
else: | |
site_code = metadata_df.index.to_list() | |
# Prepare times | |
if time is None: | |
time = ["1991", str(datetime.datetime.now().year)] | |
time = [time] if isinstance(time, str) else time | |
start_time = round_date_strings(time[0], round_type="start") | |
end_time = round_date_strings(time[-1], round_type="end") | |
# Identify paths to load and nodata values for each site | |
years = list(range(int(start_time[0:4]), int(end_time[0:4]) + 1)) | |
paths_na = [ | |
(glob.glob(f"{data_path}/{s}_*{y}.csv"), metadata_df.loc[s].null_value) | |
for y in years | |
for s in site_code | |
] | |
# Expand so we have a nodata value for each path, then load and | |
# combine into a single dataframe | |
paths_na = [(path, na) for paths, na in paths_na for path in paths] | |
data_df = ( | |
pd.concat([_load_abslmp_dataset(path, na_value=na) for path, na in paths_na]) | |
.loc[slice(start_time, end_time)] | |
.reset_index() | |
.set_index("site_code") | |
) | |
# Insert metadata into dataframe | |
data_df[metadata_df.columns] = metadata_df | |
# Add time to index and remove duplicates | |
data_df = data_df.set_index("time", append=True) | |
duplicates = data_df.index.duplicated() | |
if duplicates.sum() > 0: | |
warnings.warn("Duplicate timestamps were removed.") | |
data_df = data_df.loc[~duplicates] | |
# Correct to AHD (i.e. mean sea level) | |
if ahd: | |
data_df["sea_level"] -= data_df.ahd | |
# Return data | |
if not site_metadata: | |
return data_df[["sea_level", "residuals"]] | |
else: | |
return data_df | |
def tide_gauge_gesla( | |
x=None, | |
y=None, | |
site_code=None, | |
time=("2020", "2021"), | |
filter_use_flag=True, | |
site_metadata=True, | |
data_path="/gdata1/data/sea_level/gesla/", | |
metadata_path="/gdata1/data/sea_level/GESLA3_ALL 2.csv", | |
): | |
""" | |
Load and process Global Extreme Sea Level Analysis (GESLA) tide | |
gauge data. | |
Modified from original code from https://github.com/philiprt/GeslaDataset. | |
Parameters | |
---------- | |
x, y : tuple, optional | |
Tuples defining the x and y bounding box within which to load | |
tide gauge data, in WGS84 (degrees latitude, longitude) units. | |
Leave as None if providing a list of site codes using 'site_code'. | |
site_code : str or list of str, optional | |
GESLA site code(s) for which to load data. If provided, 'x' and | |
'y' will be ignored. | |
time : tuple or list of str, optional | |
Time range to consider, given as a tuple of start and end years. | |
If None, will default to all tide observations from 1800 onward. | |
Default is ("2020", "2021"). | |
filter_use_flag : bool, optional | |
Whether to filter out low quality observations with a "use_flag" | |
value of 0 (do not use). Defaults to True. | |
site_metadata : bool, optional | |
Whether to add tide gauge station metadata as additional columns | |
in the output DataFrame. Defaults to True. | |
data_path : str, optional | |
Path to the raw GESLA data files. Default is | |
"/gdata1/data/sea_level/gesla/". | |
metadata_path : str, optional | |
Path to the GESLA station metadata file. | |
Default is "/gdata1/data/sea_level/GESLA3_ALL 2.csv". | |
Returns | |
------- | |
pd.DataFrame | |
Processed GESLA data as a DataFrame with columns including: | |
"time": Timestamps, | |
"sea_level": Observed sea level (m), | |
"qc_flag": Observed sea level QC flag, | |
"use_flag": Use-in-analysis flag (1 = use, 0 = do not use), | |
and additional columns from station metadata. | |
""" | |
def _load_gesla_dataset(site, path, na_value): | |
gesla_df = ( | |
pd.read_csv( | |
path, | |
skiprows=41, | |
names=["date", "time", "sea_level", "qc_flag", "use_flag"], | |
sep="\s+", | |
parse_dates=[[0, 1]], | |
index_col=0, | |
na_values=na_value, | |
) | |
.rename_axis("time") | |
.assign(site_code=site) | |
) | |
return gesla_df | |
# Load tide gauge metadata | |
metadata_df, metadata_gdf = _load_gauge_metadata(metadata_path) | |
# Use supplied site codes if available | |
if site_code is not None: | |
site_code = [site_code] if isinstance(site_code, str) else site_code | |
# Otherwise, use xy bounds to identify sites | |
elif x is not None: | |
bbox = BoundingBox.from_xy(x, y) | |
site_code = metadata_gdf.cx[ | |
bbox.left : bbox.right, bbox.top : bbox.bottom | |
].index | |
# Otherwise, return all available site codes | |
else: | |
site_code = metadata_df.index.to_list() | |
# Prepare times | |
if time is None: | |
time = ["1800", str(datetime.datetime.now().year)] | |
time = [time] if isinstance(time, str) else time | |
start_time = round_date_strings(time[0], round_type="start") | |
end_time = round_date_strings(time[-1], round_type="end") | |
# Identify paths to load and nodata values for each site | |
metadata_df["file_name"] = data_path + metadata_df["file_name"] | |
paths_na = metadata_df.loc[site_code, ["file_name", "null_value"]] | |
# Load and combine into a single dataframe | |
data_df = ( | |
pd.concat( | |
[ | |
_load_gesla_dataset(s, p, na_value=na) | |
for s, p, na in paths_na.itertuples() | |
] | |
) | |
.sort_index() | |
.loc[slice(start_time, end_time)] | |
.reset_index() | |
.set_index("site_code") | |
) | |
# Optionally filter by use flag column | |
if filter_use_flag: | |
data_df = data_df.loc[data_df.use_flag == 1] | |
# Optionally insert metadata into dataframe | |
if site_metadata: | |
data_df[metadata_df.columns] = metadata_df.loc[site_code] | |
# Add time to index and remove duplicates | |
data_df = data_df.set_index("time", append=True) | |
duplicates = data_df.index.duplicated() | |
if duplicates.sum() > 0: | |
warnings.warn("Duplicate timestamps were removed.") | |
data_df = data_df.loc[~duplicates] | |
# Return data | |
return data_df | |
# tide_gauge_abslmp(x=(140, 160), y=(-30, -35)) | |
# tide_gauge_gesla(x=(140, 160), y=(-30, -35)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment