Skip to content

Instantly share code, notes, and snippets.

@pbaylis
Last active February 27, 2022 02:16
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save pbaylis/7a19861b4e4544c9eef4 to your computer and use it in GitHub Desktop.
Save pbaylis/7a19861b4e4544c9eef4 to your computer and use it in GitHub Desktop.
Sample R spatial interpolation code. First fills in missing obs using CDF method, then interpolates to a grid. Useful for converting weather station to gridded data. Uses IDW but that could easily be changed.
library(foreign)
library(data.table)
library(zoo)
library(Hmisc)
library(gstat)
library(sp)
rm(list=ls())
# Fill in missing data ----
sm.thresh <- 0.9 # Min proportion of obs that must be in a station-month
# Load station data, extract yearmonth
station.dt <- data.table(read.csv("station_data.csv"))
station.dt$date <- as.Date(station.dt$date, "%m/%d/%y")
station.dt$lng <- round(station.dt$lng, digits=6) # For rounding errors
station.dt$lat <- round(station.dt$lat, digits=6)
# Create a new station ID
stations <- as.data.table(unique(cbind(station.dt$station_id, station.dt$lng, station.dt$lat)))
names(stations) <- c("station_id", "lng", "lat")
stations$dup <- duplicated(stations$station_id)
stations[, add.str:=cumsum(dup), by=c("station_id")]
stations$station.id <- paste(as.character(stations$station_id), as.character(stations$add.str), sep="_")
stations$dup <- NULL
stations$add.str <- NULL
station.dt <- merge(station.dt, stations, by=c("station_id", "lng", "lat"))
names(stations)[1] <- "old.station.id" # Rename old station id
station.dt <- subset(station.dt, select=c("station.id", "date", "z")) # Keep only important vars
# Create full dataset, fill in missing.
full.grid <- expand.grid(unique(stations$station.id), seq(min(station.dt$date), max(station.dt$date), by=1))
names(full.grid) <- c("station.id", "date")
full.grid <- merge(full.grid, stations, by=c("station.id")) # Re-add station lnglat
station.dt <- merge(station.dt, full.grid, by=c("station.id", "date"), all=TRUE)
station.dt$ym <- as.yearmon(station.dt$date)
station.dt$days.in.month <- monthDays(station.dt$date)
# Remove if station-month contains >= 10% missing data
station.dt <- station.dt[,obs:=sum(!is.na(z)), by = list(station.id, ym)]
station.dt <- station.dt[station.dt$obs / station.dt$days.in.month > sm.thresh, ]
# Create a separate ECDF for each station-month, then compute probability Z <= z
# ecdf(z): create empirical cdf from z, (z): return prob Z <= z for z
station.dt <- station.dt[,pct:=ecdf(z)(z), by = list(station.id, ym)]
### Interpolate over every day to get missing probabilities z
# Subset to a single date (eventually, do this in a loop or fxn
interpolate.bydate <- function(dt) {
# Split into obs with and without data
dt$order <- 1:nrow(dt)
dt$pct.int <- NA
stations.havedata <- dt[!is.na(z)]
stations.nodata <- dt[is.na(z)]
# Turn station data into a spatial data frame
# Note: Data is not projected yet.
# Do I need to project it? I think gstat uses GC distance on unprojected data
if (nrow(stations.havedata) > 0 & nrow(stations.nodata) > 0) {
print("Running IDW")
coordinates(stations.havedata) = ~ lng + lat
coordinates(stations.nodata) = ~ lng + lat
# Compute IDW for missing data by day
idw <- idw(formula = pct ~ 1, locations = stations.havedata, newdata = stations.nodata)
stations.nodata$pct.int <- idw@data$var1.pred
dt <- rbind(stations.nodata, stations.havedata)
dt <- dt[order(dt$order),]
}
return(as.numeric(dt$pct.int))
}
station.dt <- station.dt[, pct.int:=interpolate.bydate(.SD), by="date"]
# By station-month, estimate z
station.dt <- station.dt[, z.int:=quantile(z,pct.int, na.rm=TRUE), by=c("station.id", "ym")]
# Recombine
station.dt$z <- ifelse(is.na(station.dt$z), station.dt$z.int, station.dt$z)
# Check if data is complete. If not... delete station-month?
# Interpolate to grid by date.
grid.dt <- as.data.table(read.csv("grid_data.csv"))
interpolate.togrid.bydate <- function(from, to) {
# Coerce station data into a spatial data frame
# Note: Data is not projected yet.
# Do I need to project it? I think gstat uses GC distance on unprojected data
if (nrow(from) > 0 & nrow(to) > 0) {
coordinates(from) = ~ lng + lat
coordinates(to) = ~ lng + lat
# Compute IDW for missing data by day
idw <- idw(formula = z ~ 1, locations = from, newdata = to)
to$z.int <- idw@data$var1.pred
}
to$date <- from$date[1] # Date is always the same
return(as.data.table(to))
}
# Interpolate to the grid
results <- by(station.dt, INDICES=station.dt$date, FUN=interpolate.togrid.bydate, grid.dt)
grid.interp.dt <- rbindlist(results)
# Plotting
# Take averages
station.averages <- station.dt[, list(z.mean=mean(z,na.rm=T)), by=c("station.id", "lng", "lat")]
grid.averages <- grid.interp.dt[, list(z.mean=mean(z.int, na.rm=T)), by=c("gridcell", "lng", "lat")]
# Plot the station averages as individual points, and the grid cell averages as cells.
# Possible plotting packages:
# splot
# ggplot2
# ggplot solution, use geom_point geom_raster
p <- ggplot() +
geom_raster(data=grid.averages, aes(x=lng, y=lat, fill=z.mean)) +
geom_point(data=station.averages, aes(x=lng, y=lat), colour="white", size=5, shape=3)
p
station_id lng lat date z
1 -99.2 40.1 1/1/15 30
1 -99.2 40.1 1/2/15 35
1 -99.2 40.1 1/3/15 30
1 -99.2 40.1 1/4/15 40
1 -99.2 40.1 1/5/15 30
1 -99.2 40.1 1/6/15 40
1 -99.2 40.1 1/7/15 30
1 -99.2 40.1 1/8/15 35
1 -99.2 40.1 1/9/15 30
1 -99.2 40.1 1/10/15 40
1 -99.2 40.1 1/11/15 30
1 -99.2 40.1 1/12/15 40
1 -99.2 40.1 1/13/15 30
1 -99.2 40.1 1/14/15 35
1 -99.2 40.1 1/15/15 30
1 -99.2 40.1 1/16/15 40
1 -99.2 40.1 1/17/15 30
1 -99.2 40.1 1/18/15 40
1 -99.2 40.1 1/19/15 30
1 -99.2 40.1 1/20/15 35
1 -99.2 40.1 1/21/15 30
1 -99.2 40.1 1/22/15 40
1 -99.2 40.1 1/24/15 40
1 -99.2 40.1 1/25/15 30
1 -99.2 40.1 1/26/15 35
1 -99.2 40.1 1/27/15 30
1 -99.2 40.1 1/28/15 40
1 -99.2 40.1 1/29/15 30
1 -99.2 40.1 1/30/15 40
1 -99.2 40.1 1/31/15 20
1 -99.2 40.1 2/1/15
1 -99.2 40.1 2/2/15 40
2 -98.6 39.3 1/1/15 25
2 -98.6 39.3 1/2/15 30
2 -98.6 39.3 1/3/15 23
2 -98.6 39.3 1/4/15 22
2 -98.6 39.3 1/5/15 14
2 -98.6 39.3 1/6/15 23
2 -98.6 39.3 1/7/15 93
2 -98.6 39.3 1/8/15 23
2 -98.6 39.3 1/9/15 4
2 -98.6 39.3 1/10/15 53
2 -98.6 39.3 1/11/15 39
2 -98.6 39.3 1/12/15 30
2 -98.6 39.3 1/13/15 23
2 -98.6 39.3 1/14/15 22
2 -98.6 39.3 1/15/15 14
2 -98.6 39.3 1/16/15 23
2 -98.6 39.3 1/17/15 93
2 -98.6 39.3 1/18/15 23
2 -98.6 39.3 1/19/15 4
2 -98.6 39.3 1/20/15 53
2 -98.6 39.3 1/21/15 39
2 -98.6 39.3 1/22/15 30
2 -98.6 39.3 1/23/15 34
2 -98.6 39.3 1/25/15 22
2 -98.6 39.3 1/26/15 14
2 -98.6 39.3 1/27/15 23
2 -98.6 39.3 1/28/15 93
2 -98.6 39.3 1/29/15 23
2 -98.6 39.3 1/30/15 4
2 -98.6 39.3 1/31/15 53
2 -98.6 39.3 2/1/15 90
2 -98.6 39.3 2/2/15 95
2 1 1 1/1/15 30
2 2 2 1/1/15 30
gridcell lng lat
1 -100 40
2 -99 40
3 -100 39
4 -99 39
@vanyaliide
Copy link

Hi,
I am trying to reproduce your work. However, when I am trying to increase the number of grids, I am getting an error like this:

station.dt <- station.dt[,pct:=ecdf(z)(z), by = list(station.id, ym)]
Error in ecdf(z) : 'x' must have 1 or more non-missing values
My data looks like this:

<style> </style>
station_id lng lat date z
1 -99.2 40.1 ######## 30
1 -99.2 40.1 ######## 35
1 -99.2 40.1 ######## 30
1 -99.2 40.1 ######## 40
1 -99.2 40.1 ######## 30
1 -99.2 40.1 ######## 40
1 -99.2 40.1 ######## 30
1 -99.2 40.1 ######## 35
1 -99.2 40.1 ######## 30
1 -99.2 40.1 ######## 40
1 -99.2 40.1 ######## 30
1 -99.2 40.1 ######## 40
1 -99.2 40.1 1/13/15 30
1 -99.2 40.1 1/14/15 35
1 -99.2 40.1 1/15/15 30
1 -99.2 40.1 1/16/15 40
1 -99.2 40.1 1/17/15 30
1 -99.2 40.1 1/18/15 40
1 -99.2 40.1 1/19/15 30
1 -99.2 40.1 1/20/15 35
1 -99.2 40.1 1/21/15 30
1 -99.2 40.1 1/22/15 40
1 -99.2 40.1 1/24/15 40
1 -99.2 40.1 1/25/15 30
1 -99.2 40.1 1/26/15 35
1 -99.2 40.1 1/27/15 30
1 -99.2 40.1 1/28/15 40
1 -99.2 40.1 1/29/15 30
1 -99.2 40.1 1/30/15 40
1 -99.2 40.1 1/31/15 20
1 -99.2 40.1 ######## 30
1 -99.2 40.1 ######## 40
2 -98.6 39.3 ######## 25
2 -98.6 39.3 ######## 30
2 -98.6 39.3 ######## 23
2 -98.6 39.3 ######## 22
2 -98.6 39.3 ######## 14
2 -98.6 39.3 ######## 23
2 -98.6 39.3 ######## 93
2 -98.6 39.3 ######## 23
2 -98.6 39.3 ######## 4
2 -98.6 39.3 ######## 53
2 -98.6 39.3 ######## 39
2 -98.6 39.3 ######## 30
2 -98.6 39.3 1/13/15 23
2 -98.6 39.3 1/14/15 22
2 -98.6 39.3 1/15/15 14
2 -98.6 39.3 1/16/15 23
2 -98.6 39.3 1/17/15 93
2 -98.6 39.3 1/18/15 23
2 -98.6 39.3 1/19/15 4
2 -98.6 39.3 1/20/15 53
2 -98.6 39.3 1/21/15 39
2 -98.6 39.3 1/22/15 30
2 -98.6 39.3 1/23/15 34
2 -98.6 39.3 1/25/15 22
2 -98.6 39.3 1/26/15 14
2 -98.6 39.3 1/27/15 23
2 -98.6 39.3 1/28/15 93
2 -98.6 39.3 1/29/15 23
2 -98.6 39.3 1/30/15 4
2 -98.6 39.3 1/31/15 53
2 -98.6 39.3 ######## 90
2 -98.6 39.3 ######## 95
3 -98.1 38.3 1/13/15 23
3 -98.1 38.3 1/14/15 22
3 -98.1 38.3 1/15/15 14
3 -98.1 38.3 1/16/15 23
3 -98.1 38.3 1/17/15 93
3 -98.1 38.3 1/18/15 23
3 -98.1 38.3 1/19/15 4
3 -98.1 38.3 1/20/15 53
3 -98.1 38.3 1/21/15 39
3 -98.1 38.3 1/22/15 30
3 -98.1 38.3 1/23/15 34
3 -98.1 38.3 1/25/15 22
3 -98.1 38.3 1/26/15 14
3 -98.1 38.3 1/27/15 23
3 -98.1 38.3 1/28/15 93
3 -98.1 38.3 1/29/15 23
3 -98.1 38.3 1/30/15 4
3 -98.1 38.3 1/31/15 53
3 -98.1 38.3 ######## 90
3 -98.1 38.3 ######## 95
3 1 1 ######## 30
3 2 2 ######## 30
3 3 3 ######## 27
My grid data file looks like this: <style> </style>
gridcell lng lat
1 -100 40
2 -99 40
3 -98 40
4 -100 39
5 -99 39
6 -98 39
7 -100 38
8 -99 38
9 -98 38

Where am I going wrong?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment