pbaylis/00_spatial_interpolation.R

## 00_spatial_interpolation.R
library(foreign)
library(data.table)
library(zoo)
library(Hmisc)
library(gstat)
library(sp)
rm(list=ls())

# Fill in missing data ----
sm.thresh <- 0.9 # Min proportion of obs that must be in a station-month

# Load station data, extract yearmonth
station.dt <- data.table(read.csv("station_data.csv"))
station.dt$date <- as.Date(station.dt$date, "%m/%d/%y")
station.dt$lng <- round(station.dt$lng, digits=6) # For rounding errors
station.dt$lat <- round(station.dt$lat, digits=6)

# Create a new station ID
stations <- as.data.table(unique(cbind(station.dt$station_id, station.dt$lng, station.dt$lat)))
names(stations) <- c("station_id", "lng", "lat")
stations$dup <- duplicated(stations$station_id)
stations[, add.str:=cumsum(dup), by=c("station_id")]
stations$station.id <- paste(as.character(stations$station_id), as.character(stations$add.str), sep="_")
stations$dup <- NULL
stations$add.str <- NULL
station.dt <- merge(station.dt, stations, by=c("station_id", "lng", "lat"))
names(stations)[1] <- "old.station.id" # Rename old station id
station.dt <- subset(station.dt, select=c("station.id", "date", "z")) # Keep only important vars

# Create full dataset, fill in missing.
full.grid <- expand.grid(unique(stations$station.id), seq(min(station.dt$date), max(station.dt$date), by=1))
names(full.grid) <- c("station.id", "date")
full.grid <- merge(full.grid, stations, by=c("station.id")) # Re-add station lnglat
station.dt <- merge(station.dt, full.grid, by=c("station.id", "date"), all=TRUE)

station.dt$ym <- as.yearmon(station.dt$date)
station.dt$days.in.month <- monthDays(station.dt$date)

# Remove if station-month contains >= 10% missing data
station.dt <- station.dt[,obs:=sum(!is.na(z)), by = list(station.id, ym)]
station.dt <- station.dt[station.dt$obs / station.dt$days.in.month > sm.thresh, ]

# Create a separate ECDF for each station-month, then compute probability Z <= z
# ecdf(z): create empirical cdf from z, (z): return prob Z <= z for z
station.dt <- station.dt[,pct:=ecdf(z)(z), by = list(station.id, ym)]

### Interpolate over every day to get missing probabilities z
# Subset to a single date (eventually, do this in a loop or fxn

interpolate.bydate <- function(dt) {
  # Split into obs with and without data
  dt$order <- 1:nrow(dt)
  dt$pct.int <- NA
  stations.havedata <- dt[!is.na(z)]
  stations.nodata <- dt[is.na(z)]

  # Turn station data into a spatial data frame
  # Note: Data is not projected yet.
  # Do I need to project it? I think gstat uses GC distance on unprojected data
  if (nrow(stations.havedata) > 0 & nrow(stations.nodata) > 0) {
    print("Running IDW")
    coordinates(stations.havedata) = ~ lng + lat
    coordinates(stations.nodata) = ~ lng + lat

    # Compute IDW for missing data by day
    idw <- idw(formula = pct ~ 1, locations = stations.havedata, newdata = stations.nodata)
    stations.nodata$pct.int <- idw@data$var1.pred

    dt <- rbind(stations.nodata, stations.havedata)
    dt <- dt[order(dt$order),]
  }
  return(as.numeric(dt$pct.int))
}

station.dt <- station.dt[, pct.int:=interpolate.bydate(.SD), by="date"]

# By station-month, estimate z
station.dt <- station.dt[, z.int:=quantile(z,pct.int, na.rm=TRUE), by=c("station.id", "ym")]

# Recombine
station.dt$z <- ifelse(is.na(station.dt$z), station.dt$z.int, station.dt$z)

# Check if data is complete. If not... delete station-month?

# Interpolate to grid by date.
grid.dt <- as.data.table(read.csv("grid_data.csv"))

interpolate.togrid.bydate <- function(from, to) {
  # Coerce station data into a spatial data frame
  # Note: Data is not projected yet.
  # Do I need to project it? I think gstat uses GC distance on unprojected data
  if (nrow(from) > 0 & nrow(to) > 0) {
    coordinates(from) = ~ lng + lat
    coordinates(to) = ~ lng + lat

    # Compute IDW for missing data by day
    idw <- idw(formula = z ~ 1, locations = from, newdata = to)
    to$z.int <- idw@data$var1.pred
  }
  to$date <- from$date[1] # Date is always the same
  return(as.data.table(to))
}

# Interpolate to the grid
results <- by(station.dt, INDICES=station.dt$date, FUN=interpolate.togrid.bydate, grid.dt)
grid.interp.dt <- rbindlist(results)

# Plotting
# Take averages
station.averages <- station.dt[, list(z.mean=mean(z,na.rm=T)), by=c("station.id", "lng", "lat")]
grid.averages <- grid.interp.dt[, list(z.mean=mean(z.int, na.rm=T)), by=c("gridcell", "lng", "lat")]

# Plot the station averages as individual points, and the grid cell averages as cells.
# Possible plotting packages:
# splot
# ggplot2

# ggplot solution, use geom_point geom_raster
p <- ggplot() +
  geom_raster(data=grid.averages, aes(x=lng, y=lat, fill=z.mean)) +
  geom_point(data=station.averages, aes(x=lng, y=lat), colour="white", size=5, shape=3)

p


## 01_station_data.csv

          
            station_id
            lng
            lat
            date
            z

            
              1
              -99.2
              40.1
              1/1/15
              30

            
              1
              -99.2
              40.1
              1/2/15
              35

            
              1
              -99.2
              40.1
              1/3/15
              30

            
              1
              -99.2
              40.1
              1/4/15
              40

            
              1
              -99.2
              40.1
              1/5/15
              30

            
              1
              -99.2
              40.1
              1/6/15
              40

            
              1
              -99.2
              40.1
              1/7/15
              30

            
              1
              -99.2
              40.1
              1/8/15
              35

            
              1
              -99.2
              40.1
              1/9/15
              30

            
              1
              -99.2
              40.1
              1/10/15
              40

            
              1
              -99.2
              40.1
              1/11/15
              30

            
              1
              -99.2
              40.1
              1/12/15
              40

            
              1
              -99.2
              40.1
              1/13/15
              30

            
              1
              -99.2
              40.1
              1/14/15
              35

            
              1
              -99.2
              40.1
              1/15/15
              30

            
              1
              -99.2
              40.1
              1/16/15
              40

            
              1
              -99.2
              40.1
              1/17/15
              30

            
              1
              -99.2
              40.1
              1/18/15
              40

            
              1
              -99.2
              40.1
              1/19/15
              30

            
              1
              -99.2
              40.1
              1/20/15
              35

            
              1
              -99.2
              40.1
              1/21/15
              30

            
              1
              -99.2
              40.1
              1/22/15
              40

            
              1
              -99.2
              40.1
              1/24/15
              40

            
              1
              -99.2
              40.1
              1/25/15
              30

            
              1
              -99.2
              40.1
              1/26/15
              35

            
              1
              -99.2
              40.1
              1/27/15
              30

            
              1
              -99.2
              40.1
              1/28/15
              40

            
              1
              -99.2
              40.1
              1/29/15
              30

            
              1
              -99.2
              40.1
              1/30/15
              40

            
              1
              -99.2
              40.1
              1/31/15
              20

            
              1
              -99.2
              40.1
              2/1/15

            
              1
              -99.2
              40.1
              2/2/15
              40

            
              2
              -98.6
              39.3
              1/1/15
              25

            
              2
              -98.6
              39.3
              1/2/15
              30

            
              2
              -98.6
              39.3
              1/3/15
              23

            
              2
              -98.6
              39.3
              1/4/15
              22

            
              2
              -98.6
              39.3
              1/5/15
              14

            
              2
              -98.6
              39.3
              1/6/15
              23

            
              2
              -98.6
              39.3
              1/7/15
              93

            
              2
              -98.6
              39.3
              1/8/15
              23

            
              2
              -98.6
              39.3
              1/9/15
              4

            
              2
              -98.6
              39.3
              1/10/15
              53

            
              2
              -98.6
              39.3
              1/11/15
              39

            
              2
              -98.6
              39.3
              1/12/15
              30

            
              2
              -98.6
              39.3
              1/13/15
              23

            
              2
              -98.6
              39.3
              1/14/15
              22

            
              2
              -98.6
              39.3
              1/15/15
              14

            
              2
              -98.6
              39.3
              1/16/15
              23

            
              2
              -98.6
              39.3
              1/17/15
              93

            
              2
              -98.6
              39.3
              1/18/15
              23

            
              2
              -98.6
              39.3
              1/19/15
              4

            
              2
              -98.6
              39.3
              1/20/15
              53

            
              2
              -98.6
              39.3
              1/21/15
              39

            
              2
              -98.6
              39.3
              1/22/15
              30

            
              2
              -98.6
              39.3
              1/23/15
              34

            
              2
              -98.6
              39.3
              1/25/15
              22

            
              2
              -98.6
              39.3
              1/26/15
              14

            
              2
              -98.6
              39.3
              1/27/15
              23

            
              2
              -98.6
              39.3
              1/28/15
              93

            
              2
              -98.6
              39.3
              1/29/15
              23

            
              2
              -98.6
              39.3
              1/30/15
              4

            
              2
              -98.6
              39.3
              1/31/15
              53

            
              2
              -98.6
              39.3
              2/1/15
              90

            
              2
              -98.6
              39.3
              2/2/15
              95

            
              2
              1
              1
              1/1/15
              30

            
              2
              2
              2
              1/1/15
              30

## 02_grid_data.csv

          
            gridcell
            lng
            lat

            
              1
              -100
              40

            
              2
              -99
              40

            
              3
              -100
              39

            
              4
              -99
              39
	library(foreign)
	library(data.table)
	library(zoo)
	library(Hmisc)
	library(gstat)
	library(sp)
	rm(list=ls())

	# Fill in missing data ----
	sm.thresh <- 0.9 # Min proportion of obs that must be in a station-month

	# Load station data, extract yearmonth
	station.dt <- data.table(read.csv("station_data.csv"))
	station.dt$date <- as.Date(station.dt$date, "%m/%d/%y")
	station.dt$lng <- round(station.dt$lng, digits=6) # For rounding errors
	station.dt$lat <- round(station.dt$lat, digits=6)

	# Create a new station ID
	stations <- as.data.table(unique(cbind(station.dt$station_id, station.dt$lng, station.dt$lat)))
	names(stations) <- c("station_id", "lng", "lat")
	stations$dup <- duplicated(stations$station_id)
	stations[, add.str:=cumsum(dup), by=c("station_id")]
	stations$station.id <- paste(as.character(stations$station_id), as.character(stations$add.str), sep="_")
	stations$dup <- NULL
	stations$add.str <- NULL
	station.dt <- merge(station.dt, stations, by=c("station_id", "lng", "lat"))
	names(stations)[1] <- "old.station.id" # Rename old station id
	station.dt <- subset(station.dt, select=c("station.id", "date", "z")) # Keep only important vars

	# Create full dataset, fill in missing.
	full.grid <- expand.grid(unique(stations$station.id), seq(min(station.dt$date), max(station.dt$date), by=1))
	names(full.grid) <- c("station.id", "date")
	full.grid <- merge(full.grid, stations, by=c("station.id")) # Re-add station lnglat
	station.dt <- merge(station.dt, full.grid, by=c("station.id", "date"), all=TRUE)

	station.dt$ym <- as.yearmon(station.dt$date)
	station.dt$days.in.month <- monthDays(station.dt$date)

	# Remove if station-month contains >= 10% missing data
	station.dt <- station.dt[,obs:=sum(!is.na(z)), by = list(station.id, ym)]
	station.dt <- station.dt[station.dt$obs / station.dt$days.in.month > sm.thresh, ]

	# Create a separate ECDF for each station-month, then compute probability Z <= z
	# ecdf(z): create empirical cdf from z, (z): return prob Z <= z for z
	station.dt <- station.dt[,pct:=ecdf(z)(z), by = list(station.id, ym)]

	### Interpolate over every day to get missing probabilities z
	# Subset to a single date (eventually, do this in a loop or fxn

	interpolate.bydate <- function(dt) {
	# Split into obs with and without data
	dt$order <- 1:nrow(dt)
	dt$pct.int <- NA
	stations.havedata <- dt[!is.na(z)]
	stations.nodata <- dt[is.na(z)]

	# Turn station data into a spatial data frame
	# Note: Data is not projected yet.
	# Do I need to project it? I think gstat uses GC distance on unprojected data
	if (nrow(stations.havedata) > 0 & nrow(stations.nodata) > 0) {
	print("Running IDW")
	coordinates(stations.havedata) = ~ lng + lat
	coordinates(stations.nodata) = ~ lng + lat

	# Compute IDW for missing data by day
	idw <- idw(formula = pct ~ 1, locations = stations.havedata, newdata = stations.nodata)
	stations.nodata$pct.int <- idw@data$var1.pred

	dt <- rbind(stations.nodata, stations.havedata)
	dt <- dt[order(dt$order),]
	}
	return(as.numeric(dt$pct.int))
	}

	station.dt <- station.dt[, pct.int:=interpolate.bydate(.SD), by="date"]

	# By station-month, estimate z
	station.dt <- station.dt[, z.int:=quantile(z,pct.int, na.rm=TRUE), by=c("station.id", "ym")]

	# Recombine
	station.dt$z <- ifelse(is.na(station.dt$z), station.dt$z.int, station.dt$z)

	# Check if data is complete. If not... delete station-month?

	# Interpolate to grid by date.
	grid.dt <- as.data.table(read.csv("grid_data.csv"))

	interpolate.togrid.bydate <- function(from, to) {
	# Coerce station data into a spatial data frame
	# Note: Data is not projected yet.
	# Do I need to project it? I think gstat uses GC distance on unprojected data
	if (nrow(from) > 0 & nrow(to) > 0) {
	coordinates(from) = ~ lng + lat
	coordinates(to) = ~ lng + lat

	# Compute IDW for missing data by day
	idw <- idw(formula = z ~ 1, locations = from, newdata = to)
	to$z.int <- idw@data$var1.pred
	}
	to$date <- from$date[1] # Date is always the same
	return(as.data.table(to))
	}

	# Interpolate to the grid
	results <- by(station.dt, INDICES=station.dt$date, FUN=interpolate.togrid.bydate, grid.dt)
	grid.interp.dt <- rbindlist(results)

	# Plotting
	# Take averages
	station.averages <- station.dt[, list(z.mean=mean(z,na.rm=T)), by=c("station.id", "lng", "lat")]
	grid.averages <- grid.interp.dt[, list(z.mean=mean(z.int, na.rm=T)), by=c("gridcell", "lng", "lat")]

	# Plot the station averages as individual points, and the grid cell averages as cells.
	# Possible plotting packages:
	# splot
	# ggplot2

	# ggplot solution, use geom_point geom_raster
	p <- ggplot() +
	geom_raster(data=grid.averages, aes(x=lng, y=lat, fill=z.mean)) +
	geom_point(data=station.averages, aes(x=lng, y=lat), colour="white", size=5, shape=3)

	p
station_id	lng	lat	date	z
1	-99.2	40.1	1/1/15	30
1	-99.2	40.1	1/2/15	35
1	-99.2	40.1	1/3/15	30
1	-99.2	40.1	1/4/15	40
1	-99.2	40.1	1/5/15	30
1	-99.2	40.1	1/6/15	40
1	-99.2	40.1	1/7/15	30
1	-99.2	40.1	1/8/15	35
1	-99.2	40.1	1/9/15	30
1	-99.2	40.1	1/10/15	40
1	-99.2	40.1	1/11/15	30
1	-99.2	40.1	1/12/15	40
1	-99.2	40.1	1/13/15	30
1	-99.2	40.1	1/14/15	35
1	-99.2	40.1	1/15/15	30
1	-99.2	40.1	1/16/15	40
1	-99.2	40.1	1/17/15	30
1	-99.2	40.1	1/18/15	40
1	-99.2	40.1	1/19/15	30
1	-99.2	40.1	1/20/15	35
1	-99.2	40.1	1/21/15	30
1	-99.2	40.1	1/22/15	40
1	-99.2	40.1	1/24/15	40
1	-99.2	40.1	1/25/15	30
1	-99.2	40.1	1/26/15	35
1	-99.2	40.1	1/27/15	30
1	-99.2	40.1	1/28/15	40
1	-99.2	40.1	1/29/15	30
1	-99.2	40.1	1/30/15	40
1	-99.2	40.1	1/31/15	20
1	-99.2	40.1	2/1/15
1	-99.2	40.1	2/2/15	40
2	-98.6	39.3	1/1/15	25
2	-98.6	39.3	1/2/15	30
2	-98.6	39.3	1/3/15	23
2	-98.6	39.3	1/4/15	22
2	-98.6	39.3	1/5/15	14
2	-98.6	39.3	1/6/15	23
2	-98.6	39.3	1/7/15	93
2	-98.6	39.3	1/8/15	23
2	-98.6	39.3	1/9/15	4
2	-98.6	39.3	1/10/15	53
2	-98.6	39.3	1/11/15	39
2	-98.6	39.3	1/12/15	30
2	-98.6	39.3	1/13/15	23
2	-98.6	39.3	1/14/15	22
2	-98.6	39.3	1/15/15	14
2	-98.6	39.3	1/16/15	23
2	-98.6	39.3	1/17/15	93
2	-98.6	39.3	1/18/15	23
2	-98.6	39.3	1/19/15	4
2	-98.6	39.3	1/20/15	53
2	-98.6	39.3	1/21/15	39
2	-98.6	39.3	1/22/15	30
2	-98.6	39.3	1/23/15	34
2	-98.6	39.3	1/25/15	22
2	-98.6	39.3	1/26/15	14
2	-98.6	39.3	1/27/15	23
2	-98.6	39.3	1/28/15	93
2	-98.6	39.3	1/29/15	23
2	-98.6	39.3	1/30/15	4
2	-98.6	39.3	1/31/15	53
2	-98.6	39.3	2/1/15	90
2	-98.6	39.3	2/2/15	95
2	1	1	1/1/15	30
2	2	2	1/1/15	30
station_id	lng	lat	date	z
1	-99.2	40.1	########	30
1	-99.2	40.1	########	35
1	-99.2	40.1	########	30
1	-99.2	40.1	########	40
1	-99.2	40.1	########	30
1	-99.2	40.1	########	40
1	-99.2	40.1	########	30
1	-99.2	40.1	########	35
1	-99.2	40.1	########	30
1	-99.2	40.1	########	40
1	-99.2	40.1	########	30
1	-99.2	40.1	########	40
1	-99.2	40.1	1/13/15	30
1	-99.2	40.1	1/14/15	35
1	-99.2	40.1	1/15/15	30
1	-99.2	40.1	1/16/15	40
1	-99.2	40.1	1/17/15	30
1	-99.2	40.1	1/18/15	40
1	-99.2	40.1	1/19/15	30
1	-99.2	40.1	1/20/15	35
1	-99.2	40.1	1/21/15	30
1	-99.2	40.1	1/22/15	40
1	-99.2	40.1	1/24/15	40
1	-99.2	40.1	1/25/15	30
1	-99.2	40.1	1/26/15	35
1	-99.2	40.1	1/27/15	30
1	-99.2	40.1	1/28/15	40
1	-99.2	40.1	1/29/15	30
1	-99.2	40.1	1/30/15	40
1	-99.2	40.1	1/31/15	20
1	-99.2	40.1	########	30
1	-99.2	40.1	########	40
2	-98.6	39.3	########	25
2	-98.6	39.3	########	30
2	-98.6	39.3	########	23
2	-98.6	39.3	########	22
2	-98.6	39.3	########	14
2	-98.6	39.3	########	23
2	-98.6	39.3	########	93
2	-98.6	39.3	########	23
2	-98.6	39.3	########	4
2	-98.6	39.3	########	53
2	-98.6	39.3	########	39
2	-98.6	39.3	########	30
2	-98.6	39.3	1/13/15	23
2	-98.6	39.3	1/14/15	22
2	-98.6	39.3	1/15/15	14
2	-98.6	39.3	1/16/15	23
2	-98.6	39.3	1/17/15	93
2	-98.6	39.3	1/18/15	23
2	-98.6	39.3	1/19/15	4
2	-98.6	39.3	1/20/15	53
2	-98.6	39.3	1/21/15	39
2	-98.6	39.3	1/22/15	30
2	-98.6	39.3	1/23/15	34
2	-98.6	39.3	1/25/15	22
2	-98.6	39.3	1/26/15	14
2	-98.6	39.3	1/27/15	23
2	-98.6	39.3	1/28/15	93
2	-98.6	39.3	1/29/15	23
2	-98.6	39.3	1/30/15	4
2	-98.6	39.3	1/31/15	53
2	-98.6	39.3	########	90
2	-98.6	39.3	########	95
3	-98.1	38.3	1/13/15	23
3	-98.1	38.3	1/14/15	22
3	-98.1	38.3	1/15/15	14
3	-98.1	38.3	1/16/15	23
3	-98.1	38.3	1/17/15	93
3	-98.1	38.3	1/18/15	23
3	-98.1	38.3	1/19/15	4
3	-98.1	38.3	1/20/15	53
3	-98.1	38.3	1/21/15	39
3	-98.1	38.3	1/22/15	30
3	-98.1	38.3	1/23/15	34
3	-98.1	38.3	1/25/15	22
3	-98.1	38.3	1/26/15	14
3	-98.1	38.3	1/27/15	23
3	-98.1	38.3	1/28/15	93
3	-98.1	38.3	1/29/15	23
3	-98.1	38.3	1/30/15	4
3	-98.1	38.3	1/31/15	53
3	-98.1	38.3	########	90
3	-98.1	38.3	########	95
3	1	1	########	30
3	2	2	########	30
3	3	3	########	27