Forecasting S&P 500
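This script rolls a cross-validation origin over monthly S&P 500 closing prices: at each step it refits a forecasting model (mean, naive, and auto.arima below) on a training window, forecasts 1 to 12 months ahead, and compares out-of-sample accuracy by horizon.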
#Function to cross-validate a time series.
cv.ts <- function(x, FUN, tsControl, xreg=NULL, ...) {

  #Load required packages
  stopifnot(is.ts(x))
  stopifnot(is.data.frame(xreg) | is.matrix(xreg) | is.null(xreg))
  stopifnot(require(forecast))
  stopifnot(require(foreach))
  stopifnot(require(plyr))

  #Load parameters from the tsControl list
  stepSize <- tsControl$stepSize
  maxHorizon <- tsControl$maxHorizon
  minObs <- tsControl$minObs
  fixedWindow <- tsControl$fixedWindow
  summaryFunc <- tsControl$summaryFunc
  preProcess <- tsControl$preProcess

  #Make sure the xreg object is long enough for the last set of forecasts
  if (!is.null(xreg)) {
    xreg <- as.matrix(xreg)
    if (nrow(xreg) < length(x)+maxHorizon) {
      warning('xreg object too short to forecast beyond the length of the time series. Appending NA values to xreg')
      nRows <- (length(x)+maxHorizon)-nrow(xreg)
      nCols <- dim(xreg)[2]
      addRows <- matrix(rep(NA, nCols*nRows), nrow=nRows, ncol=nCols)
      colnames(addRows) <- colnames(xreg)
      xreg <- rbind(xreg, addRows)
    }
  }
  #Define additional parameters
  freq <- frequency(x)
  n <- length(x)
  st <- tsp(x)[1]+(minObs-2)/freq

  #Create a matrix of actual values.
  #Rows are points in time, columns are forecast horizons
  #http://stackoverflow.com/questions/8140577/creating-a-matrix-of-future-values-for-a-time-series
  formatActuals <- function(x, maxHorizon) {
    actuals <- outer(seq_along(x), seq_len(maxHorizon), FUN="+")
    actuals <- apply(actuals, 2, function(a) x[a])
    actuals
  }
  actuals <- formatActuals(x, maxHorizon)
  actuals <- actuals[minObs:(length(x)-1),,drop=FALSE]

  #Create the sequence of forecast origins
  #Each training window is the same length if fixedWindow=TRUE
  #At each origin, calculate 'maxHorizon' forecasts ahead
  steps <- seq(1, (n-minObs), by=stepSize)
  forecasts <- foreach(i=steps, .combine=rbind, .multicombine=FALSE) %dopar% {

    if (is.null(xreg)) {
      if (fixedWindow) {
        xshort <- window(x, start=st+(i-minObs+1)/freq, end=st+i/freq)
      } else {
        xshort <- window(x, end=st+i/freq)
      }

      if (preProcess) {
        if (testObject(lambda)) {
          stop("Don't specify a lambda parameter when preProcess==TRUE")
        }
        stepLambda <- BoxCox.lambda(xshort, method='loglik')
        xshort <- BoxCox(xshort, stepLambda)
      }

      out <- FUN(xshort, h=maxHorizon, ...)

      if (preProcess) {
        out <- InvBoxCox(out, stepLambda)
      }

      return(out)

    } else {
      if (fixedWindow) {
        xshort <- window(x, start=st+(i-minObs+1)/freq, end=st+i/freq)
        xregshort <- xreg[(i):(i+minObs-1),,drop=FALSE]
      } else {
        xshort <- window(x, end=st+i/freq)
        xregshort <- xreg[1:(i+minObs-1),,drop=FALSE]
      }
      newxreg <- xreg[(i+minObs):(i+minObs-1+maxHorizon),,drop=FALSE]

      if (preProcess) {
        if (testObject(lambda)) {
          stop("Don't specify a lambda parameter when preProcess==TRUE")
        }
        stepLambda <- BoxCox.lambda(xshort, method='loglik')
        xshort <- BoxCox(xshort, stepLambda)
      }

      out <- FUN(xshort, h=maxHorizon, xreg=xregshort, newxreg=newxreg, ...)

      if (preProcess) {
        out <- InvBoxCox(out, stepLambda)
      }

      return(out)
    }
  }
  #Keep only the actuals corresponding to the forecast origins we used
  actuals <- actuals[steps,,drop=FALSE]

  #Accuracy at each horizon
  out <- data.frame(
    ldply(1:maxHorizon,
          function(horizon) {
            P <- forecasts[,horizon,drop=FALSE]
            A <- na.omit(actuals[,horizon,drop=FALSE])
            P <- P[1:length(A)]
            P <- na.omit(P)
            A <- A[1:length(P)]
            summaryFunc(P, A)
          }
    )
  )

  #Add average accuracy across all horizons
  overall <- colMeans(out)
  out <- rbind(out, overall)

  #Add a column indicating the horizon and return
  return(data.frame(horizon=c(1:maxHorizon, 'All'), out))
}
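#cv.ts returns a data.frame with one row per forecast horizon plus an 'All' row
#(the column means across horizons); the remaining columns are whatever
#summaryFunc returns (tsSummary below returns forecast::accuracy() statistics).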
#Helper functions for time series cross-validation
stopifnot(require(compiler))

#Check whether an object exists (used to guard against passing a lambda when preProcess==TRUE)
testObject <- function(object) {
  exists(as.character(substitute(object)))
}

#Summary function: one row of forecast::accuracy() statistics per horizon
tsSummary <- cmpfun(function(P, A) {
  data.frame(t(accuracy(P, A)))
})
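#Any function of (forecasts, actuals) that returns a one-row data.frame can serve
#as summaryFunc. A minimal alternative, shown only as an illustrative sketch and
#not used below:
rmseSummary <- cmpfun(function(P, A) {
  data.frame(RMSE=sqrt(mean((P - A)^2, na.rm=TRUE)))
})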
#Forecasting wrappers
meanForecast <- cmpfun(function(x, h, ...) {
  require(forecast)
  meanf(x, h, ..., level=99)$mean
})

naiveForecast <- cmpfun(function(x, h, ...) {
  require(forecast)
  naive(x, h, ..., level=99)$mean
})

auto.arimaForecast <- cmpfun(function(x, h, xreg=NULL, newxreg=NULL, ...) {
  require(forecast)
  fit <- auto.arima(x, xreg=xreg, ...)
  forecast(fit, h=h, level=99, xreg=newxreg)$mean
})
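#Other models can be plugged in with a wrapper that takes a series and a horizon
#and returns a vector of h point forecasts. For example, an ets() wrapper might
#look like this (a sketch; it is not part of the comparison below):
etsForecast <- cmpfun(function(x, h, ...) {
  require(forecast)
  fit <- ets(x, ...)
  forecast(fit, h=h, level=99)$mean
})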
#Setup
set.seed(1)
library(quantmod)
library(forecast)
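#cv.ts uses foreach's %dopar%, so its steps run in parallel only if a parallel
#backend has been registered; otherwise foreach falls back to sequential
#execution (with a warning). Registering one is optional; doParallel is one
#choice, assuming it is installed:
if (require(doParallel)) {
  registerDoParallel(cores=2)
}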
#Load data
getSymbols('^GSPC', from='1980-01-01')
#Simplify to monthly level
GSPC <- to.monthly(GSPC)
Data <- as.ts(Cl(GSPC))  #zoo converts the yearmon index to a monthly ts starting in 1980
#Set up the cross-validation controls
myControl <- list(minObs=12,         #smallest training window
                  stepSize=1,        #move the forecast origin forward 1 month at a time
                  maxHorizon=12,     #forecast up to 12 months ahead
                  fixedWindow=TRUE,  #keep the training window a fixed length
                  preProcess=FALSE,  #no Box-Cox transform
                  summaryFunc=tsSummary)
#Cross-validate 3 models (model 3 is SLOW!)
model1 <- cv.ts(Data, meanForecast, myControl)
model2 <- cv.ts(Data, naiveForecast, myControl)
model3 <- cv.ts(Data, auto.arimaForecast, myControl, ic='bic', trace=TRUE)
#Find the RMSE for each model and create a matrix with our results
models <- list(model1,model2,model3)
models <- lapply(models, function(x) x[1:12,'RMSE'])
results <- do.call(cbind,models)
colnames(results) <- c('mean','naive','ar')
#Order models by average RMSE over the first 3 months
results <- t(results)
avgRMSE <- rowMeans(results[,1:3])
results <- results[order(avgRMSE),]
print(results)
#Plot RMSE by horizon for each model
color <- 1:nrow(results)
plot(results[1,], col=1, type='l', ylim=c(min(results), max(results)),
     xlab='Forecast horizon (months)', ylab='RMSE')
for (i in color) {
  lines(results[i,], col=i)
}
legend("topleft", legend=row.names(results), col=color, lty=1)