Time series forecasting with H2O.
install.packages("timetk") | |
install.packages("tidyquant") | |
library(h2o) # Awesome ML Library | |
library(timetk) # Toolkit for working with time series in R | |
library(tidyquant) # Loads tidyverse, financial pkgs, used to get data | |
beer_sales_tbl <- tq_get("S4248SM144NCEN", get = "economic.data", from = "2010-01-01", to = "2017-10-27") | |
beer_sales_tbl %>% | |
ggplot(aes(date, price)) + | |
# Train Region | |
annotate("text", x = ymd("2012-01-01"), y = 7000, | |
color = palette_light()[[1]], label = "Train Region") + | |
# Validation Region | |
geom_rect(xmin = as.numeric(ymd("2016-01-01")), | |
xmax = as.numeric(ymd("2016-12-31")), | |
ymin = 0, ymax = Inf, alpha = 0.02, | |
fill = palette_light()[[3]]) + | |
annotate("text", x = ymd("2016-07-01"), y = 7000, | |
color = palette_light()[[1]], label = "Validation\nRegion") + | |
# Test Region | |
geom_rect(xmin = as.numeric(ymd("2017-01-01")), | |
xmax = as.numeric(ymd("2017-08-31")), | |
ymin = 0, ymax = Inf, alpha = 0.02, | |
fill = palette_light()[[4]]) + | |
annotate("text", x = ymd("2017-05-01"), y = 7000, | |
color = palette_light()[[1]], label = "Test\nRegion") + | |
# Data | |
geom_line(col = palette_light()[1]) + | |
geom_point(col = palette_light()[1]) + | |
geom_ma(ma_fun = SMA, n = 12, size = 1) + | |
# Aesthetics | |
theme_tq() + | |
scale_x_date(date_breaks = "1 year", date_labels = "%Y") + | |
labs(title = "Beer Sales: 2007 through 2017", | |
subtitle = "Train, Validation, and Test Sets Shown") | |
beer_sales_tbl %>% glimpse() | |
beer_sales_tbl_aug <- beer_sales_tbl %>% tk_augment_timeseries_signature() | |
beer_sales_tbl_aug %>% glimpse() | |
beer_sales_tbl_clean <- beer_sales_tbl_aug %>% | |
select_if(~ !is.Date(.)) %>% | |
select_if(~ !any(is.na(.))) %>% | |
mutate_if(is.ordered, ~ as.character(.) %>% as.factor) | |
beer_sales_tbl_clean %>% glimpse() | |
train_tbl <- beer_sales_tbl_clean %>% filter(year < 2016) | |
valid_tbl <- beer_sales_tbl_clean %>% filter(year == 2016) | |
test_tbl <- beer_sales_tbl_clean %>% filter(year == 2017) | |
h2o.init() | |
h2o.no_progress() | |
train_h2o <- as.h2o(train_tbl) | |
valid_h2o <- as.h2o(valid_tbl) | |
test_h2o <- as.h2o(test_tbl) | |
y <- "price" | |
x <- setdiff(names(train_h2o), y) | |
automl_models_h2o <- h2o.automl( | |
x = x, | |
y = y, | |
training_frame = train_h2o, | |
validation_frame = valid_h2o, | |
leaderboard_frame = test_h2o, | |
max_runtime_secs = 60, | |
stopping_metric = "deviance") | |
automl_leader <- automl_models_h2o@leader | |
pred_h2o <- h2o.predict(automl_leader, newdata = test_h2o) | |
h2o.performance(automl_leader, newdata = test_h2o) | |
error_tbl <- beer_sales_tbl %>% | |
filter(lubridate::year(date) == 2017) %>% | |
add_column(pred = pred_h2o %>% as.tibble() %>% pull(predict)) %>% | |
rename(actual = price) %>% | |
mutate( | |
error = actual - pred, | |
error_pct = error / actual | |
) | |
error_tbl |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment