Last active
February 6, 2025 01:57
-
-
Save scottstanie/ac00b77b1d5e6b9dcaa904a9d14c8fc3 to your computer and use it in GitHub Desktop.
Export dolphin, or DISP, time series rasters to a GeoParquet format ingestible by QGIS
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| # /// script | |
| # dependencies = ["geopandas", "rioxarray", "tyro", "opera_utils"] | |
| # /// | |
| """Create a GeoParquet file from multi-date InSAR displacement GeoTIFFs. | |
| Optionally mask by similarity and coherence thresholds. | |
| Example CLI usage (with tyro): | |
| pipx run export_to_qgis.py \ | |
| --timeseries-pattern "timeseries/*.tif" \ | |
| --similarity-path "interferograms/similarity.tif" \ | |
| --similarity-threshold 0.5 \ | |
| --temporal-coherence-path "interferograms/temporal_coherence.tif" \ | |
| --temporal-coherence-threshold 0.6 \ | |
| --output-file "output_displacement.parquet" | |
| """ | |
| from __future__ import annotations | |
| from typing import Literal | |
| from pathlib import Path | |
| import geopandas as gpd | |
| import rioxarray | |
| import tyro | |
| import xarray as xr | |
| from opera_utils import get_dates | |
| def _preprocess_band(ds: xr.DataArray) -> xr.DataArray: | |
| """Expand a single-band raster to have a 'time' dimension derived from the filename. | |
| Converts the DataArray from shape (y, x) -> (time, y, x). | |
| Parameters | |
| ---------- | |
| ds : xr.DataArray | |
| A 2D or 3D data array from rioxarray.open_rasterio. | |
| fname : Path | |
| The corresponding GeoTIFF filename. | |
| Returns | |
| ------- | |
| xr.DataArray | |
| DataArray with an added 'time' dimension of size 1. | |
| """ | |
| fname = ds.encoding["source"] | |
| # If multiple bands, just pick the first or consider additional logic. | |
| if "band" in ds.coords and ds.coords["band"].size > 1: | |
| ds = ds.isel(band=0) | |
| # Use the secondary date as time dimension | |
| # TODO: Ensure it's a single-reference timeseries? | |
| date = get_dates(fname)[1] | |
| return ds.expand_dims({"time": [date]}) | |
| def create_geoparquet( | |
| timeseries_pattern: str, | |
| output_file: Path, | |
| similarity: Path | None = None, | |
| similarity_threshold: float = 0.5, | |
| temporal_coherence: Path | None = None, | |
| temporal_coherence_threshold: float = 0.6, | |
| coord_dtype: str = "float32", | |
| units: Literal["meters", "millimeters"] = "millimeters", | |
| ) -> Path: | |
| """Combine multiple single-date displacement GeoTIFFs into vector format. | |
| Optionally mask by similarity and temporal coherence rasters. | |
| Parameters | |
| ---------- | |
| timeseries_pattern : str | |
| Glob pattern for input time-series GeoTIFFs, e.g. "timeseries/*.tif". | |
| output_file : Path | |
| Output file path (Parquet format). E.g., "output.parquet" | |
| similarity : [Path], default=None | |
| Path to similarity raster (e.g., "interferograms/similarity.tif"). | |
| If provided, pixels below 'similarity_threshold' will be masked out. | |
| similarity_threshold : float, default=0.5 | |
| Similarity threshold (0.0-1.0). Only used if similarity is provided. | |
| temporal_coherence : [Path], default=None | |
| Path to temporal coherence raster (e.g., "interferograms/temporal_coherence.tif") | |
| If provided, pixels below 'temporal_coherence_threshold' will be masked out. | |
| temporal_coherence_threshold : float, default=0.6 | |
| Temporal coherence threshold (0.0-1.0). | |
| Only used if temporal_coherence is provided. | |
| coord_dtype : str, default="float32" | |
| Data type to use for the x and y coordinates. | |
| If your raster is in EPSG:4326, x=longitude, y=latitude. | |
| Otherwise it could be projected coords (UTM, etc.) with int coordinates. | |
| units : Literal["meters", "millimeters"], default="millimeters" | |
| Units of the displacement values. | |
| If "millimeters", the displacement values will be multiplied by 1000. | |
| Returns | |
| ------- | |
| Path | |
| The path to the written GeoParquet file. | |
| """ | |
| # Gather all timeseries tifs | |
| tif_files = sorted(Path().glob(timeseries_pattern)) | |
| if not tif_files: | |
| raise FileNotFoundError( | |
| f"No files found matching pattern: {timeseries_pattern}" | |
| ) | |
| ds = ( | |
| xr.open_mfdataset(tif_files, preprocess=_preprocess_band, parallel=True) | |
| .rename({"band_data": "displacement"}) | |
| .drop_vars(["band"]) | |
| ) | |
| index = ["x", "y", "geometry"] | |
| # Apply optional similarity mask | |
| if similarity: | |
| sim_da = rioxarray.open_rasterio(similarity, masked=True).squeeze(drop=True) | |
| # Add similarity to the dataset | |
| ds["similarity"] = sim_da | |
| ds = ds.where(sim_da > similarity_threshold) | |
| index.append("similarity") | |
| # Apply optional temporal coherence mask | |
| if temporal_coherence: | |
| coh_da = rioxarray.open_rasterio(temporal_coherence, masked=True).squeeze( | |
| drop=True | |
| ) | |
| # Add temporal coherence to the dataset | |
| ds["temporal_coherence"] = coh_da | |
| ds = ds.where(coh_da > temporal_coherence_threshold) | |
| index.append("temporal_coherence") | |
| # Ensure we have a valid CRS in ds. If not set, you can do ds.rio.write_crs("EPSG:4326", inplace=True) | |
| crs = ds.rio.crs | |
| if not crs: | |
| raise ValueError( | |
| "No CRS found in the input rasters. Please ensure GeoTIFFs have a valid" | |
| " projection." | |
| ) | |
| # Flatten the time-series to a table: (time, y, x, displacement) | |
| # Convert from xarray to a standard DataFrame | |
| df = ds.to_dataframe().dropna().reset_index() | |
| # TODO: For UTM, do we want this as integers? | |
| # df.x = df.x.astype(int) | |
| # df.y = df.y.astype(int) | |
| df.drop(columns=["spatial_ref", "band"], inplace=True, errors="ignore") | |
| if units == "millimeters": | |
| df["displacement"] *= 1000 | |
| # Convert from (x, y) pixel coords to geometry | |
| # If your raster is in EPSG:4326, x=longitude, y=latitude. | |
| # Otherwise it could be projected coords (UTM, etc.) | |
| # We rely on the existing coordinate definitions from rioxarray. | |
| # They should be in 'x' and 'y'. | |
| geometry = gpd.points_from_xy(x=df["x"], y=df["y"], crs=crs) | |
| # Create a GeoDataFrame with the geometry | |
| gdf = gpd.GeoDataFrame(df, geometry=geometry, crs=crs) | |
| gdf.x = gdf.x.astype(coord_dtype) | |
| gdf.y = gdf.y.astype(coord_dtype) | |
| # Convert time column to the required format (DYYYYMMDD) | |
| gdf.loc[:, "date_col"] = "D" + gdf["time"].dt.strftime("%Y%m%d") | |
| # Pivot the dataframe | |
| wide_gdf = gdf.pivot( | |
| index=index, | |
| columns="date_col", | |
| values="displacement", | |
| ) | |
| # Calculate velocity (TODO: this is the simple last - first method) | |
| wide_gdf.loc[:, "velocity"] = ( | |
| gdf.groupby(["x", "y"])["displacement"].apply( | |
| lambda x: (x.iloc[-1] - x.iloc[0]) / (len(x) - 1) | |
| ) | |
| * 365.25 | |
| ) | |
| # Reset the index to make x, y, and geometry regular columns again | |
| wide_gdf = gpd.GeoDataFrame(wide_gdf.reset_index(), geometry="geometry") | |
| # Write to GeoParquet | |
| wide_gdf.to_parquet(output_file, index=False) | |
| print(f"GeoParquet written to: {output_file}") | |
| return output_file | |
| def main(): | |
| """Command-line interface for create_geoparquet using tyro.""" | |
| tyro.cli(create_geoparquet) | |
| if __name__ == "__main__": | |
| main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment