Last active
February 27, 2022 02:16
-
-
Save pbaylis/7a19861b4e4544c9eef4 to your computer and use it in GitHub Desktop.
Sample R spatial interpolation code. First fills in missing obs using CDF method, then interpolates to a grid. Useful for converting weather station to gridded data. Uses IDW but that could easily be changed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(foreign) | |
library(data.table) | |
library(zoo) | |
library(Hmisc) | |
library(gstat) | |
library(sp) | |
rm(list=ls()) | |
# Fill in missing data ---- | |
sm.thresh <- 0.9 # Min proportion of obs that must be in a station-month | |
# Load station data, extract yearmonth | |
station.dt <- data.table(read.csv("station_data.csv")) | |
station.dt$date <- as.Date(station.dt$date, "%m/%d/%y") | |
station.dt$lng <- round(station.dt$lng, digits=6) # For rounding errors | |
station.dt$lat <- round(station.dt$lat, digits=6) | |
# Create a new station ID | |
stations <- as.data.table(unique(cbind(station.dt$station_id, station.dt$lng, station.dt$lat))) | |
names(stations) <- c("station_id", "lng", "lat") | |
stations$dup <- duplicated(stations$station_id) | |
stations[, add.str:=cumsum(dup), by=c("station_id")] | |
stations$station.id <- paste(as.character(stations$station_id), as.character(stations$add.str), sep="_") | |
stations$dup <- NULL | |
stations$add.str <- NULL | |
station.dt <- merge(station.dt, stations, by=c("station_id", "lng", "lat")) | |
names(stations)[1] <- "old.station.id" # Rename old station id | |
station.dt <- subset(station.dt, select=c("station.id", "date", "z")) # Keep only important vars | |
# Create full dataset, fill in missing. | |
full.grid <- expand.grid(unique(stations$station.id), seq(min(station.dt$date), max(station.dt$date), by=1)) | |
names(full.grid) <- c("station.id", "date") | |
full.grid <- merge(full.grid, stations, by=c("station.id")) # Re-add station lnglat | |
station.dt <- merge(station.dt, full.grid, by=c("station.id", "date"), all=TRUE) | |
station.dt$ym <- as.yearmon(station.dt$date) | |
station.dt$days.in.month <- monthDays(station.dt$date) | |
# Remove if station-month contains >= 10% missing data | |
station.dt <- station.dt[,obs:=sum(!is.na(z)), by = list(station.id, ym)] | |
station.dt <- station.dt[station.dt$obs / station.dt$days.in.month > sm.thresh, ] | |
# Create a separate ECDF for each station-month, then compute probability Z <= z | |
# ecdf(z): create empirical cdf from z, (z): return prob Z <= z for z | |
station.dt <- station.dt[,pct:=ecdf(z)(z), by = list(station.id, ym)] | |
### Interpolate over every day to get missing probabilities z | |
# Subset to a single date (eventually, do this in a loop or fxn | |
interpolate.bydate <- function(dt) { | |
# Split into obs with and without data | |
dt$order <- 1:nrow(dt) | |
dt$pct.int <- NA | |
stations.havedata <- dt[!is.na(z)] | |
stations.nodata <- dt[is.na(z)] | |
# Turn station data into a spatial data frame | |
# Note: Data is not projected yet. | |
# Do I need to project it? I think gstat uses GC distance on unprojected data | |
if (nrow(stations.havedata) > 0 & nrow(stations.nodata) > 0) { | |
print("Running IDW") | |
coordinates(stations.havedata) = ~ lng + lat | |
coordinates(stations.nodata) = ~ lng + lat | |
# Compute IDW for missing data by day | |
idw <- idw(formula = pct ~ 1, locations = stations.havedata, newdata = stations.nodata) | |
stations.nodata$pct.int <- idw@data$var1.pred | |
dt <- rbind(stations.nodata, stations.havedata) | |
dt <- dt[order(dt$order),] | |
} | |
return(as.numeric(dt$pct.int)) | |
} | |
station.dt <- station.dt[, pct.int:=interpolate.bydate(.SD), by="date"] | |
# By station-month, estimate z | |
station.dt <- station.dt[, z.int:=quantile(z,pct.int, na.rm=TRUE), by=c("station.id", "ym")] | |
# Recombine | |
station.dt$z <- ifelse(is.na(station.dt$z), station.dt$z.int, station.dt$z) | |
# Check if data is complete. If not... delete station-month? | |
# Interpolate to grid by date. | |
grid.dt <- as.data.table(read.csv("grid_data.csv")) | |
interpolate.togrid.bydate <- function(from, to) { | |
# Coerce station data into a spatial data frame | |
# Note: Data is not projected yet. | |
# Do I need to project it? I think gstat uses GC distance on unprojected data | |
if (nrow(from) > 0 & nrow(to) > 0) { | |
coordinates(from) = ~ lng + lat | |
coordinates(to) = ~ lng + lat | |
# Compute IDW for missing data by day | |
idw <- idw(formula = z ~ 1, locations = from, newdata = to) | |
to$z.int <- idw@data$var1.pred | |
} | |
to$date <- from$date[1] # Date is always the same | |
return(as.data.table(to)) | |
} | |
# Interpolate to the grid | |
results <- by(station.dt, INDICES=station.dt$date, FUN=interpolate.togrid.bydate, grid.dt) | |
grid.interp.dt <- rbindlist(results) | |
# Plotting | |
# Take averages | |
station.averages <- station.dt[, list(z.mean=mean(z,na.rm=T)), by=c("station.id", "lng", "lat")] | |
grid.averages <- grid.interp.dt[, list(z.mean=mean(z.int, na.rm=T)), by=c("gridcell", "lng", "lat")] | |
# Plot the station averages as individual points, and the grid cell averages as cells. | |
# Possible plotting packages: | |
# splot | |
# ggplot2 | |
# ggplot solution, use geom_point geom_raster | |
p <- ggplot() + | |
geom_raster(data=grid.averages, aes(x=lng, y=lat, fill=z.mean)) + | |
geom_point(data=station.averages, aes(x=lng, y=lat), colour="white", size=5, shape=3) | |
p | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
station_id | lng | lat | date | z | |
---|---|---|---|---|---|
1 | -99.2 | 40.1 | 1/1/15 | 30 | |
1 | -99.2 | 40.1 | 1/2/15 | 35 | |
1 | -99.2 | 40.1 | 1/3/15 | 30 | |
1 | -99.2 | 40.1 | 1/4/15 | 40 | |
1 | -99.2 | 40.1 | 1/5/15 | 30 | |
1 | -99.2 | 40.1 | 1/6/15 | 40 | |
1 | -99.2 | 40.1 | 1/7/15 | 30 | |
1 | -99.2 | 40.1 | 1/8/15 | 35 | |
1 | -99.2 | 40.1 | 1/9/15 | 30 | |
1 | -99.2 | 40.1 | 1/10/15 | 40 | |
1 | -99.2 | 40.1 | 1/11/15 | 30 | |
1 | -99.2 | 40.1 | 1/12/15 | 40 | |
1 | -99.2 | 40.1 | 1/13/15 | 30 | |
1 | -99.2 | 40.1 | 1/14/15 | 35 | |
1 | -99.2 | 40.1 | 1/15/15 | 30 | |
1 | -99.2 | 40.1 | 1/16/15 | 40 | |
1 | -99.2 | 40.1 | 1/17/15 | 30 | |
1 | -99.2 | 40.1 | 1/18/15 | 40 | |
1 | -99.2 | 40.1 | 1/19/15 | 30 | |
1 | -99.2 | 40.1 | 1/20/15 | 35 | |
1 | -99.2 | 40.1 | 1/21/15 | 30 | |
1 | -99.2 | 40.1 | 1/22/15 | 40 | |
1 | -99.2 | 40.1 | 1/24/15 | 40 | |
1 | -99.2 | 40.1 | 1/25/15 | 30 | |
1 | -99.2 | 40.1 | 1/26/15 | 35 | |
1 | -99.2 | 40.1 | 1/27/15 | 30 | |
1 | -99.2 | 40.1 | 1/28/15 | 40 | |
1 | -99.2 | 40.1 | 1/29/15 | 30 | |
1 | -99.2 | 40.1 | 1/30/15 | 40 | |
1 | -99.2 | 40.1 | 1/31/15 | 20 | |
1 | -99.2 | 40.1 | 2/1/15 | ||
1 | -99.2 | 40.1 | 2/2/15 | 40 | |
2 | -98.6 | 39.3 | 1/1/15 | 25 | |
2 | -98.6 | 39.3 | 1/2/15 | 30 | |
2 | -98.6 | 39.3 | 1/3/15 | 23 | |
2 | -98.6 | 39.3 | 1/4/15 | 22 | |
2 | -98.6 | 39.3 | 1/5/15 | 14 | |
2 | -98.6 | 39.3 | 1/6/15 | 23 | |
2 | -98.6 | 39.3 | 1/7/15 | 93 | |
2 | -98.6 | 39.3 | 1/8/15 | 23 | |
2 | -98.6 | 39.3 | 1/9/15 | 4 | |
2 | -98.6 | 39.3 | 1/10/15 | 53 | |
2 | -98.6 | 39.3 | 1/11/15 | 39 | |
2 | -98.6 | 39.3 | 1/12/15 | 30 | |
2 | -98.6 | 39.3 | 1/13/15 | 23 | |
2 | -98.6 | 39.3 | 1/14/15 | 22 | |
2 | -98.6 | 39.3 | 1/15/15 | 14 | |
2 | -98.6 | 39.3 | 1/16/15 | 23 | |
2 | -98.6 | 39.3 | 1/17/15 | 93 | |
2 | -98.6 | 39.3 | 1/18/15 | 23 | |
2 | -98.6 | 39.3 | 1/19/15 | 4 | |
2 | -98.6 | 39.3 | 1/20/15 | 53 | |
2 | -98.6 | 39.3 | 1/21/15 | 39 | |
2 | -98.6 | 39.3 | 1/22/15 | 30 | |
2 | -98.6 | 39.3 | 1/23/15 | 34 | |
2 | -98.6 | 39.3 | 1/25/15 | 22 | |
2 | -98.6 | 39.3 | 1/26/15 | 14 | |
2 | -98.6 | 39.3 | 1/27/15 | 23 | |
2 | -98.6 | 39.3 | 1/28/15 | 93 | |
2 | -98.6 | 39.3 | 1/29/15 | 23 | |
2 | -98.6 | 39.3 | 1/30/15 | 4 | |
2 | -98.6 | 39.3 | 1/31/15 | 53 | |
2 | -98.6 | 39.3 | 2/1/15 | 90 | |
2 | -98.6 | 39.3 | 2/2/15 | 95 | |
2 | 1 | 1 | 1/1/15 | 30 | |
2 | 2 | 2 | 1/1/15 | 30 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
gridcell | lng | lat | |
---|---|---|---|
1 | -100 | 40 | |
2 | -99 | 40 | |
3 | -100 | 39 | |
4 | -99 | 39 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi,
<style> </style>I am trying to reproduce your work. However, when I am trying to increase the number of grids, I am getting an error like this:
Where am I going wrong?