Skip to content

Instantly share code, notes, and snippets.

@mbjoseph
Created May 19, 2017 19:39
Show Gist options
  • Save mbjoseph/99cc32076bc92e0c273259d5b6c74b7a to your computer and use it in GitHub Desktop.
Save mbjoseph/99cc32076bc92e0c273259d5b6c74b7a to your computer and use it in GitHub Desktop.
Using basis functions to predict weather in Boulder, CO
library(raster)
library(tidyverse)
library(lubridate)
library(mgcv)
# Boulder temperature prediction ------------------------------------------
# Goal: use historical temperature data to predict the weather a week from today
# Fetch raw data from NOAA
data_url <- "https://www.esrl.noaa.gov/psd/boulder/data/boulderdaily.complete"
raw_dest <- "boulder_weather.txt"
download.file(data_url, destfile = raw_dest)
## Parse raw data --------------------------------------------
# The data are provided in a not-so-machine-readable format.
# Let's tidy the data so that we have a data frame with one row per day
data_lines <- readLines(raw_dest) %>%
trimws()
# find lines with numeric data
numeric_lines <- grep(pattern = "^[[:digit:]]", x = data_lines, value = TRUE)
# figure out what the column names should be
column_names <- grep(pattern = "Format\\:", x = data_lines, value = TRUE) %>%
gsub(pattern = "Format\\: ", replacement = "") %>%
strsplit(split = ", ") %>%
unlist()
# write the numeric data to a text file and read it into a data frame
numeric_dest <- "boulder_weather_data.txt"
numeric_lines %>%
writeLines(numeric_dest)
replace_nas <- function(x) {
# replaces -998 and -999 fill values with NA
ifelse(x < -997, NA, x)
}
d <- read_fwf(numeric_dest,
fwf_positions(c(1, 6, 9, 16, 23, 29, 35, 44),
c(4, 7, 10, 17, 24, 32, 40, 47),
col_names = column_names)) %>%
mutate_all(replace_nas) %>%
select(year, mon, day, tmax) %>%
mutate(ymd = paste(year, sprintf('%02d', mon), sprintf('%02d', day),
sep = "-"),
ymd = ymd(ymd)) %>%
filter(!is.na(ymd))
# create a prediction data frame with NA values to be imputed
may2017_df <- tibble(year = 2017, mon = 5, day = 1:31) %>%
mutate(ymd = paste(year, sprintf('%02d', mon), sprintf('%02d', day),
sep = "-"),
ymd = ymd(ymd))
# append to our original data frame
d <- bind_rows(d, may2017_df)
# create a numeric "x" value from ymd
d <- d %>%
mutate(numeric_day = as.numeric(ymd))
# Exploring the data ------------------------------------------------------
# subset to a smaller time range for ease of visualization
d <- d %>%
filter(year > 2007)
# visualize temperature over time
d %>%
ggplot(aes(x = ymd, y = tmax)) +
geom_point()
# hmmmm - that looks wrong
d %>%
filter(tmax < 4)
# fix some obviously wrong values
d <- d %>%
mutate(tmax = ifelse(tmax < 4, NA, tmax))
# Constructing a periodic basis expansion with trigonometry -------------
# Maybe a sin function? Just plotting a few:
plot(sin(d$numeric_day), type = "l")
plot(sin(.1 * d$numeric_day), type = "l")
plot(sin(.01 * d$numeric_day), type = "l")
# now let's dump a bunch of these into a matrix (our basis expansion)
n_basis <- 30
sin_coefs <- exp(seq(-7, -3, length.out = n_basis))
sin_basis <- matrix(nrow = nrow(d), ncol = length(sin_coefs))
for (i in seq_along(sin_coefs)) {
# each column here is a basis vector
sin_basis[, i] <- sin(sin_coefs[i] * d$numeric_day) %>%
scale %>%
c
}
# let's plot the matrix to see what our basis expansion looks like
raster::plot(raster::raster(sin_basis))
# another look:
sin_basis %>%
tbl_df %>%
gather(coef, value) %>%
mutate(numeric_day = rep(d$numeric_day, n_basis)) %>%
left_join(select(d, numeric_day, ymd)) %>%
ggplot(aes(x = ymd, y = value)) +
geom_line(alpha = .6) +
theme_bw() +
facet_wrap(~coef)
# now, name the columns to associate the coefficients
colnames(sin_basis) <- paste("coef", sin_coefs, sep = "_")
# merge the basis expansion with the original data
full_d <- bind_cols(d, tbl_df(sin_basis))
# Model fitting -----------------------------------------------------------
# We will use the basis functions as covariates, estimating a regression coef
# for each
model_formula <- as.formula(paste("tmax ~",
paste(colnames(sin_basis), collapse = "+")
)
)
m <- lm(model_formula, data = full_d)
# ^ notice that we are basically just fitting a linear model!
# make predictions from the model
pred_df <- bind_cols(full_d,
tbl_df(predict(m, full_d, interval = "confidence")))
# visualize the predictions
pred_df %>%
ggplot(aes(x = ymd, y = tmax)) +
geom_point() +
geom_line(aes(y = fit), color = "dodgerblue", size = 2) +
geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = .2, fill = "dodgerblue") +
theme_minimal()
# get the prediction for May 28, 2017
pred_df %>%
select(-starts_with("coef")) %>%
filter(year == 2017, mon == 5, day == 28)
# which basis functions had which coefficients?
plot(x = sin_coefs, y = coef(m)[-1],
xlab = "sin function coefficients",
ylab = "Estimated regression coefficients")
abline(h = 0, lty = 2)
# Using a canned basis function approach -----------------------------
# gam is "Generalize additive model"
gam_fit <- mgcv::gam(tmax ~ s(numeric_day, bs = "cs", k = n_basis),
data = full_d)
# make prediction
gam_preds <- predict(gam_fit, full_d, se.fit = TRUE)
plot(full_d$numeric_day, full_d$tmax, cex = .5,
xlab = "Numeric day", ylab = "Tmax")
lines(full_d$numeric_day, gam_preds$fit, col = "red")
lines(full_d$numeric_day, gam_preds$fit - gam_preds$se.fit, lty = 2,
col = "red")
lines(full_d$numeric_day, gam_preds$fit + gam_preds$se.fit, lty = 2,
col = "red")
# Visualize the basis functions that were used in our gam
um <- smoothCon(s(numeric_day, bs = "cs", k = n_basis), data = full_d,
knots = NULL, absorb.cons = TRUE)
X <- um[[1]]$X
# notice that these are "local" basis functions with compact support
raster::plot(raster::raster(X))
X %>%
tbl_df %>%
gather(coef, value) %>%
mutate(numeric_day = rep(d$numeric_day, ncol(X))) %>%
left_join(select(d, numeric_day, ymd)) %>%
ggplot(aes(x = ymd, y = value)) +
geom_line(alpha = .6) +
theme_bw() +
facet_wrap(~coef)
# notice the seasonal pattern in the basis coefficients
plot(coef(gam_fit)[-1], type = "b", ylab = "Basis coefficient")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment