Created
May 19, 2017 19:39
-
-
Save mbjoseph/99cc32076bc92e0c273259d5b6c74b7a to your computer and use it in GitHub Desktop.
Using basis functions to predict weather in Boulder, CO
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(raster) | |
library(tidyverse) | |
library(lubridate) | |
library(mgcv) | |
# Boulder temperature prediction ------------------------------------------ | |
# Goal: use historical temperature data to predict the weather a week from today | |
# Fetch raw data from NOAA | |
data_url <- "https://www.esrl.noaa.gov/psd/boulder/data/boulderdaily.complete" | |
raw_dest <- "boulder_weather.txt" | |
download.file(data_url, destfile = raw_dest) | |
## Parse raw data -------------------------------------------- | |
# The data are provided in a not-so-machine-readable format. | |
# Let's tidy the data so that we have a data frame with one row per day | |
data_lines <- readLines(raw_dest) %>% | |
trimws() | |
# find lines with numeric data | |
numeric_lines <- grep(pattern = "^[[:digit:]]", x = data_lines, value = TRUE) | |
# figure out what the column names should be | |
column_names <- grep(pattern = "Format\\:", x = data_lines, value = TRUE) %>% | |
gsub(pattern = "Format\\: ", replacement = "") %>% | |
strsplit(split = ", ") %>% | |
unlist() | |
# write the numeric data to a text file and read it into a data frame | |
numeric_dest <- "boulder_weather_data.txt" | |
numeric_lines %>% | |
writeLines(numeric_dest) | |
replace_nas <- function(x) { | |
# replaces -998 and -999 fill values with NA | |
ifelse(x < -997, NA, x) | |
} | |
d <- read_fwf(numeric_dest, | |
fwf_positions(c(1, 6, 9, 16, 23, 29, 35, 44), | |
c(4, 7, 10, 17, 24, 32, 40, 47), | |
col_names = column_names)) %>% | |
mutate_all(replace_nas) %>% | |
select(year, mon, day, tmax) %>% | |
mutate(ymd = paste(year, sprintf('%02d', mon), sprintf('%02d', day), | |
sep = "-"), | |
ymd = ymd(ymd)) %>% | |
filter(!is.na(ymd)) | |
# create a prediction data frame with NA values to be imputed | |
may2017_df <- tibble(year = 2017, mon = 5, day = 1:31) %>% | |
mutate(ymd = paste(year, sprintf('%02d', mon), sprintf('%02d', day), | |
sep = "-"), | |
ymd = ymd(ymd)) | |
# append to our original data frame | |
d <- bind_rows(d, may2017_df) | |
# create a numeric "x" value from ymd | |
d <- d %>% | |
mutate(numeric_day = as.numeric(ymd)) | |
# Exploring the data ------------------------------------------------------ | |
# subset to a smaller time range for ease of visualization | |
d <- d %>% | |
filter(year > 2007) | |
# visualize temperature over time | |
d %>% | |
ggplot(aes(x = ymd, y = tmax)) + | |
geom_point() | |
# hmmmm - that looks wrong | |
d %>% | |
filter(tmax < 4) | |
# fix some obviously wrong values | |
d <- d %>% | |
mutate(tmax = ifelse(tmax < 4, NA, tmax)) | |
# Constructing a periodic basis expansion with trigonometry ------------- | |
# Maybe a sin function? Just plotting a few: | |
plot(sin(d$numeric_day), type = "l") | |
plot(sin(.1 * d$numeric_day), type = "l") | |
plot(sin(.01 * d$numeric_day), type = "l") | |
# now let's dump a bunch of these into a matrix (our basis expansion) | |
n_basis <- 30 | |
sin_coefs <- exp(seq(-7, -3, length.out = n_basis)) | |
sin_basis <- matrix(nrow = nrow(d), ncol = length(sin_coefs)) | |
for (i in seq_along(sin_coefs)) { | |
# each column here is a basis vector | |
sin_basis[, i] <- sin(sin_coefs[i] * d$numeric_day) %>% | |
scale %>% | |
c | |
} | |
# let's plot the matrix to see what our basis expansion looks like | |
raster::plot(raster::raster(sin_basis)) | |
# another look: | |
sin_basis %>% | |
tbl_df %>% | |
gather(coef, value) %>% | |
mutate(numeric_day = rep(d$numeric_day, n_basis)) %>% | |
left_join(select(d, numeric_day, ymd)) %>% | |
ggplot(aes(x = ymd, y = value)) + | |
geom_line(alpha = .6) + | |
theme_bw() + | |
facet_wrap(~coef) | |
# now, name the columns to associate the coefficients | |
colnames(sin_basis) <- paste("coef", sin_coefs, sep = "_") | |
# merge the basis expansion with the original data | |
full_d <- bind_cols(d, tbl_df(sin_basis)) | |
# Model fitting ----------------------------------------------------------- | |
# We will use the basis functions as covariates, estimating a regression coef | |
# for each | |
model_formula <- as.formula(paste("tmax ~", | |
paste(colnames(sin_basis), collapse = "+") | |
) | |
) | |
m <- lm(model_formula, data = full_d) | |
# ^ notice that we are basically just fitting a linear model! | |
# make predictions from the model | |
pred_df <- bind_cols(full_d, | |
tbl_df(predict(m, full_d, interval = "confidence"))) | |
# visualize the predictions | |
pred_df %>% | |
ggplot(aes(x = ymd, y = tmax)) + | |
geom_point() + | |
geom_line(aes(y = fit), color = "dodgerblue", size = 2) + | |
geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = .2, fill = "dodgerblue") + | |
theme_minimal() | |
# get the prediction for May 28, 2017 | |
pred_df %>% | |
select(-starts_with("coef")) %>% | |
filter(year == 2017, mon == 5, day == 28) | |
# which basis functions had which coefficients? | |
plot(x = sin_coefs, y = coef(m)[-1], | |
xlab = "sin function coefficients", | |
ylab = "Estimated regression coefficients") | |
abline(h = 0, lty = 2) | |
# Using a canned basis function approach ----------------------------- | |
# gam is "Generalize additive model" | |
gam_fit <- mgcv::gam(tmax ~ s(numeric_day, bs = "cs", k = n_basis), | |
data = full_d) | |
# make prediction | |
gam_preds <- predict(gam_fit, full_d, se.fit = TRUE) | |
plot(full_d$numeric_day, full_d$tmax, cex = .5, | |
xlab = "Numeric day", ylab = "Tmax") | |
lines(full_d$numeric_day, gam_preds$fit, col = "red") | |
lines(full_d$numeric_day, gam_preds$fit - gam_preds$se.fit, lty = 2, | |
col = "red") | |
lines(full_d$numeric_day, gam_preds$fit + gam_preds$se.fit, lty = 2, | |
col = "red") | |
# Visualize the basis functions that were used in our gam | |
um <- smoothCon(s(numeric_day, bs = "cs", k = n_basis), data = full_d, | |
knots = NULL, absorb.cons = TRUE) | |
X <- um[[1]]$X | |
# notice that these are "local" basis functions with compact support | |
raster::plot(raster::raster(X)) | |
X %>% | |
tbl_df %>% | |
gather(coef, value) %>% | |
mutate(numeric_day = rep(d$numeric_day, ncol(X))) %>% | |
left_join(select(d, numeric_day, ymd)) %>% | |
ggplot(aes(x = ymd, y = value)) + | |
geom_line(alpha = .6) + | |
theme_bw() + | |
facet_wrap(~coef) | |
# notice the seasonal pattern in the basis coefficients | |
plot(coef(gam_fit)[-1], type = "b", ylab = "Basis coefficient") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment