Skip to content

Instantly share code, notes, and snippets.

@robbibt
Created January 31, 2024 00:43
Show Gist options
  • Save robbibt/13823d0f860c54360d84ee283fc610c1 to your computer and use it in GitHub Desktop.
GESLA and ABSLMP tide gauge loading functions
import datetime
import glob
import warnings

import geopandas as gpd
import pandas as pd
from odc.geo.geom import BoundingBox

# NOTE(review): round_date_strings is called below but never imported —
# confirm its source module (likely a dea_tools helper) and import it here.
def _load_gauge_metadata(metadata_path):
    """
    Load tide gauge station metadata from a CSV file.

    Column names are normalised (spaces and slashes replaced with
    underscores, parentheses stripped, lowercased) and the table is
    indexed by "site_code".

    Parameters
    ----------
    metadata_path : str
        Path to the station metadata CSV. Must contain "site code",
        "longitude" and "latitude" columns (in any case/spacing that
        normalises to those names).

    Returns
    -------
    tuple of (pd.DataFrame, gpd.GeoDataFrame)
        The metadata indexed by site code, and the same data as a
        GeoDataFrame with WGS84 point geometries for spatial queries.
    """
    # Load metadata and normalise column names so they can be used as
    # attribute-style identifiers (e.g. "Null Value (m)" -> "null_value_m")
    metadata_df = pd.read_csv(metadata_path)
    metadata_df.columns = (
        metadata_df.columns.str.replace(" ", "_", regex=False)
        .str.replace("(", "", regex=False)
        .str.replace(")", "", regex=False)
        .str.replace("/", "_", regex=False)
        .str.lower()
    )
    metadata_df = metadata_df.set_index("site_code")

    # Convert metadata to GeoDataFrame (points in WGS84) so callers can
    # select stations with the `.cx` coordinate indexer
    metadata_gdf = gpd.GeoDataFrame(
        data=metadata_df,
        geometry=gpd.points_from_xy(metadata_df.longitude, metadata_df.latitude),
        crs="EPSG:4326",
    )

    return metadata_df, metadata_gdf
def tide_gauge_abslmp(
    x=None,
    y=None,
    site_code=None,
    time=("2020", "2021"),
    ahd=True,
    site_metadata=True,
    data_path="/gdata1/data/sea_level/abslmp/",
    metadata_path="/gdata1/data/sea_level/ABSLMP_station_metadata_v2.csv",
):
    """
    Load and process Australian Baseline Sea Level Monitoring Program
    (ABSLMP) tide gauge data.

    Parameters
    ----------
    x, y : tuple, optional
        Tuples defining the x and y bounding box within which to load
        tide gauge data, in WGS84 (degrees latitude, longitude) units.
        Leave as None if providing a list of site codes using 'site_code'.
    site_code : str or list of str, optional
        ABSLMP site code(s) for which to load data. If provided, 'x' and
        'y' will be ignored.
    time : tuple or list of str, optional
        Time range to consider, given as a tuple of start and end years.
        If None, will default to all tide observations from 1991 onward.
        Default is ("2020", "2021").
    ahd : bool, optional
        Whether to correct sea level to Australian Height Datum (AHD).
        Default is True.
    site_metadata : bool, optional
        Whether to add tide gauge station metadata as additional columns
        in the output DataFrame. Defaults to True.
    data_path : str, optional
        Path to the raw ABSLMP data files. Default is
        "/gdata1/data/sea_level/abslmp/".
    metadata_path : str, optional
        Path to the ABSLMP station metadata file.
        Default is "/gdata1/data/sea_level/ABSLMP_station_metadata_v2.csv".

    Returns
    -------
    pd.DataFrame
        Processed ABSLMP data as a DataFrame with columns including:
        "time": Timestamps,
        "sea_level": Observed sea level (m),
        "residuals": Residuals data (m),
        and additional columns from station metadata.
    """

    def _load_abslmp_dataset(path, na_value):
        # Load a single ABSLMP yearly CSV, keeping only time, sea level
        # and residuals. The eight-character site code is extracted from
        # a fixed position in the file name (assumes "<site>_...<year>.csv"
        # naming — path[-17:-9]).
        abslmp_df = (
            pd.read_csv(
                path,
                parse_dates=[" Date & UTC Time"],
                na_values=na_value,
                usecols=[" Date & UTC Time", "Sea Level", "Residuals"],
            )
            .rename(
                {
                    " Date & UTC Time": "time",
                    "Sea Level": "sea_level",
                    "Residuals": "residuals",
                },
                axis=1,
            )
            .assign(site_code=path[-17:-9])
            .set_index("time")
        )
        return abslmp_df

    # Load tide gauge metadata
    metadata_df, metadata_gdf = _load_gauge_metadata(metadata_path)

    # Use supplied site codes if available
    if site_code is not None:
        site_code = [site_code] if isinstance(site_code, str) else site_code

    # Otherwise, use xy bounds to identify sites
    elif x is not None:
        bbox = BoundingBox.from_xy(x, y)
        site_code = metadata_gdf.cx[
            bbox.left : bbox.right, bbox.top : bbox.bottom
        ].index

    # Otherwise, return all available site codes
    else:
        site_code = metadata_df.index.to_list()

    # Prepare times
    if time is None:
        time = ["1991", str(datetime.datetime.now().year)]
    time = [time] if isinstance(time, str) else time
    start_time = round_date_strings(time[0], round_type="start")
    end_time = round_date_strings(time[-1], round_type="end")

    # Identify paths to load and nodata values for each site
    years = list(range(int(start_time[0:4]), int(end_time[0:4]) + 1))
    paths_na = [
        (glob.glob(f"{data_path}/{s}_*{y}.csv"), metadata_df.loc[s].null_value)
        for y in years
        for s in site_code
    ]

    # Expand so we have a nodata value for each path, then load and
    # combine into a single dataframe. Sort by time before slicing:
    # concatenating multiple sites/years yields a non-monotonic index,
    # and `.loc` partial time slicing raises KeyError on unsorted
    # indexes (this also matches `tide_gauge_gesla`).
    paths_na = [(path, na) for paths, na in paths_na for path in paths]
    data_df = (
        pd.concat([_load_abslmp_dataset(path, na_value=na) for path, na in paths_na])
        .sort_index()
        .loc[slice(start_time, end_time)]
        .reset_index()
        .set_index("site_code")
    )

    # Insert metadata into dataframe (aligned on the site_code index)
    data_df[metadata_df.columns] = metadata_df

    # Add time to index and remove duplicates
    data_df = data_df.set_index("time", append=True)
    duplicates = data_df.index.duplicated()
    if duplicates.sum() > 0:
        warnings.warn("Duplicate timestamps were removed.")
        data_df = data_df.loc[~duplicates]

    # Correct to AHD (i.e. mean sea level) using the per-station AHD
    # offset from the metadata
    if ahd:
        data_df["sea_level"] -= data_df.ahd

    # Return data with or without station metadata columns
    if not site_metadata:
        return data_df[["sea_level", "residuals"]]
    else:
        return data_df
def tide_gauge_gesla(
    x=None,
    y=None,
    site_code=None,
    time=("2020", "2021"),
    filter_use_flag=True,
    site_metadata=True,
    data_path="/gdata1/data/sea_level/gesla/",
    metadata_path="/gdata1/data/sea_level/GESLA3_ALL 2.csv",
):
    """
    Load and process Global Extreme Sea Level Analysis (GESLA) tide
    gauge data.

    Modified from original code from https://github.com/philiprt/GeslaDataset.

    Parameters
    ----------
    x, y : tuple, optional
        Tuples defining the x and y bounding box within which to load
        tide gauge data, in WGS84 (degrees latitude, longitude) units.
        Leave as None if providing a list of site codes using 'site_code'.
    site_code : str or list of str, optional
        GESLA site code(s) for which to load data. If provided, 'x' and
        'y' will be ignored.
    time : tuple or list of str, optional
        Time range to consider, given as a tuple of start and end years.
        If None, will default to all tide observations from 1800 onward.
        Default is ("2020", "2021").
    filter_use_flag : bool, optional
        Whether to filter out low quality observations with a "use_flag"
        value of 0 (do not use). Defaults to True.
    site_metadata : bool, optional
        Whether to add tide gauge station metadata as additional columns
        in the output DataFrame. Defaults to True.
    data_path : str, optional
        Path to the raw GESLA data files. Default is
        "/gdata1/data/sea_level/gesla/".
    metadata_path : str, optional
        Path to the GESLA station metadata file.
        Default is "/gdata1/data/sea_level/GESLA3_ALL 2.csv".

    Returns
    -------
    pd.DataFrame
        Processed GESLA data as a DataFrame with columns including:
        "time": Timestamps,
        "sea_level": Observed sea level (m),
        "qc_flag": Observed sea level QC flag,
        "use_flag": Use-in-analysis flag (1 = use, 0 = do not use),
        and additional columns from station metadata.
    """

    def _load_gesla_dataset(site, path, na_value):
        # Load a single GESLA data file: 41 header rows are skipped, and
        # the separate date and time columns are combined into a single
        # "time" index.
        # NOTE(review): nested-list `parse_dates=[[0, 1]]` (column
        # merging) is deprecated and removed in pandas 3.0 — confirm
        # pandas version pin, or combine the columns manually.
        gesla_df = (
            pd.read_csv(
                path,
                skiprows=41,
                names=["date", "time", "sea_level", "qc_flag", "use_flag"],
                sep=r"\s+",
                parse_dates=[[0, 1]],
                index_col=0,
                na_values=na_value,
            )
            .rename_axis("time")
            .assign(site_code=site)
        )
        return gesla_df

    # Load tide gauge metadata
    metadata_df, metadata_gdf = _load_gauge_metadata(metadata_path)

    # Use supplied site codes if available
    if site_code is not None:
        site_code = [site_code] if isinstance(site_code, str) else site_code

    # Otherwise, use xy bounds to identify sites
    elif x is not None:
        bbox = BoundingBox.from_xy(x, y)
        site_code = metadata_gdf.cx[
            bbox.left : bbox.right, bbox.top : bbox.bottom
        ].index

    # Otherwise, return all available site codes
    else:
        site_code = metadata_df.index.to_list()

    # Prepare times
    if time is None:
        time = ["1800", str(datetime.datetime.now().year)]
    time = [time] if isinstance(time, str) else time
    start_time = round_date_strings(time[0], round_type="start")
    end_time = round_date_strings(time[-1], round_type="end")

    # Identify paths to load and nodata values for each site
    metadata_df["file_name"] = data_path + metadata_df["file_name"]
    paths_na = metadata_df.loc[site_code, ["file_name", "null_value"]]

    # Load and combine into a single dataframe; sort by time so the
    # `.loc` partial time slice works on the concatenated index
    data_df = (
        pd.concat(
            [
                _load_gesla_dataset(s, p, na_value=na)
                for s, p, na in paths_na.itertuples()
            ]
        )
        .sort_index()
        .loc[slice(start_time, end_time)]
        .reset_index()
        .set_index("site_code")
    )

    # Optionally filter by use flag column (1 = use, 0 = do not use)
    if filter_use_flag:
        data_df = data_df.loc[data_df.use_flag == 1]

    # Optionally insert metadata into dataframe
    if site_metadata:
        data_df[metadata_df.columns] = metadata_df.loc[site_code]

    # Add time to index and remove duplicates
    data_df = data_df.set_index("time", append=True)
    duplicates = data_df.index.duplicated()
    if duplicates.sum() > 0:
        warnings.warn("Duplicate timestamps were removed.")
        data_df = data_df.loc[~duplicates]

    # Return data
    return data_df
# tide_gauge_abslmp(x=(140, 160), y=(-30, -35))
# tide_gauge_gesla(x=(140, 160), y=(-30, -35))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment