Forecasting S&P 500
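This script rolls a cross-validation origin over monthly S&P 500 closing prices: at each step it refits a forecasting model (mean, naive, and auto.arima below) on a training window, forecasts 1 to 12 months ahead, and compares out-of-sample accuracy by horizon.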
#Function to cross-validate a time series.
cv.ts <- function(x, FUN, tsControl, xreg=NULL, ...) {

  #Load required packages
  stopifnot(is.ts(x))
  stopifnot(is.data.frame(xreg) | is.matrix(xreg) | is.null(xreg))
  stopifnot(require(forecast))
  stopifnot(require(foreach))
  stopifnot(require(plyr))

  #Load parameters from the tsControl list
  stepSize <- tsControl$stepSize
  maxHorizon <- tsControl$maxHorizon
  minObs <- tsControl$minObs
  fixedWindow <- tsControl$fixedWindow
  summaryFunc <- tsControl$summaryFunc
  preProcess <- tsControl$preProcess

  #Make sure the xreg object is long enough for the last set of forecasts
  if (!is.null(xreg)) {
    xreg <- as.matrix(xreg)
    if (nrow(xreg) < length(x)+maxHorizon) {
      warning('xreg object too short to forecast beyond the length of the time series. Appending NA values to xreg')
      nRows <- (length(x)+maxHorizon)-nrow(xreg)
      nCols <- dim(xreg)[2]
      addRows <- matrix(rep(NA, nCols*nRows), nrow=nRows, ncol=nCols)
      colnames(addRows) <- colnames(xreg)
      xreg <- rbind(xreg, addRows)
    }
  }
  #Define additional parameters
  freq <- frequency(x)
  n <- length(x)
  st <- tsp(x)[1]+(minObs-2)/freq

  #Create a matrix of actual values.
  #Rows are points in time, columns are forecast horizons
  #http://stackoverflow.com/questions/8140577/creating-a-matrix-of-future-values-for-a-time-series
  formatActuals <- function(x, maxHorizon) {
    actuals <- outer(seq_along(x), seq_len(maxHorizon), FUN="+")
    actuals <- apply(actuals, 2, function(a) x[a])
    actuals
  }
  actuals <- formatActuals(x, maxHorizon)
  actuals <- actuals[minObs:(length(x)-1),,drop=FALSE]

  #Create the sequence of forecast origins
  #Each training window is the same length if fixedWindow=TRUE
  #At each origin, calculate 'maxHorizon' forecasts ahead
  steps <- seq(1, (n-minObs), by=stepSize)
  forecasts <- foreach(i=steps, .combine=rbind, .multicombine=FALSE) %dopar% {

    if (is.null(xreg)) {
      if (fixedWindow) {
        xshort <- window(x, start=st+(i-minObs+1)/freq, end=st+i/freq)
      } else {
        xshort <- window(x, end=st+i/freq)
      }

      if (preProcess) {
        if (testObject(lambda)) {
          stop("Don't specify a lambda parameter when preProcess==TRUE")
        }
        stepLambda <- BoxCox.lambda(xshort, method='loglik')
        xshort <- BoxCox(xshort, stepLambda)
      }

      out <- FUN(xshort, h=maxHorizon, ...)

      if (preProcess) {
        out <- InvBoxCox(out, stepLambda)
      }

      return(out)

    } else {
      if (fixedWindow) {
        xshort <- window(x, start=st+(i-minObs+1)/freq, end=st+i/freq)
        xregshort <- xreg[(i):(i+minObs-1),,drop=FALSE]
      } else {
        xshort <- window(x, end=st+i/freq)
        xregshort <- xreg[1:(i+minObs-1),,drop=FALSE]
      }
      newxreg <- xreg[(i+minObs):(i+minObs-1+maxHorizon),,drop=FALSE]

      if (preProcess) {
        if (testObject(lambda)) {
          stop("Don't specify a lambda parameter when preProcess==TRUE")
        }
        stepLambda <- BoxCox.lambda(xshort, method='loglik')
        xshort <- BoxCox(xshort, stepLambda)
      }

      out <- FUN(xshort, h=maxHorizon, xreg=xregshort, newxreg=newxreg, ...)

      if (preProcess) {
        out <- InvBoxCox(out, stepLambda)
      }

      return(out)
    }
  }
  #Keep only the actuals corresponding to the forecast origins we used
  actuals <- actuals[steps,,drop=FALSE]

  #Accuracy at each horizon
  out <- data.frame(
    ldply(1:maxHorizon,
          function(horizon) {
            P <- forecasts[,horizon,drop=FALSE]
            A <- na.omit(actuals[,horizon,drop=FALSE])
            P <- P[1:length(A)]
            P <- na.omit(P)
            A <- A[1:length(P)]
            summaryFunc(P, A)
          }
    )
  )

  #Add average accuracy across all horizons
  overall <- colMeans(out)
  out <- rbind(out, overall)

  #Add a column indicating the horizon and return
  return(data.frame(horizon=c(1:maxHorizon, 'All'), out))
}
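#cv.ts returns a data.frame with one row per forecast horizon plus an 'All' row
#(the column means across horizons); the remaining columns are whatever
#summaryFunc returns (tsSummary below returns forecast::accuracy() statistics).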
#Helper functions for time series cross-validation
stopifnot(require(compiler))

#Check whether an object exists (used to guard against passing a lambda when preProcess==TRUE)
testObject <- function(object) {
  exists(as.character(substitute(object)))
}

#Summary function: one row of forecast::accuracy() statistics per horizon
tsSummary <- cmpfun(function(P, A) {
  data.frame(t(accuracy(P, A)))
})
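#Any function of (forecasts, actuals) that returns a one-row data.frame can serve
#as summaryFunc. A minimal alternative, shown only as an illustrative sketch and
#not used below:
rmseSummary <- cmpfun(function(P, A) {
  data.frame(RMSE=sqrt(mean((P - A)^2, na.rm=TRUE)))
})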
#Forecasting wrappers
meanForecast <- cmpfun(function(x, h, ...) {
  require(forecast)
  meanf(x, h, ..., level=99)$mean
})

naiveForecast <- cmpfun(function(x, h, ...) {
  require(forecast)
  naive(x, h, ..., level=99)$mean
})

auto.arimaForecast <- cmpfun(function(x, h, xreg=NULL, newxreg=NULL, ...) {
  require(forecast)
  fit <- auto.arima(x, xreg=xreg, ...)
  forecast(fit, h=h, level=99, xreg=newxreg)$mean
})
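#Other models can be plugged in with a wrapper that takes a series and a horizon
#and returns a vector of h point forecasts. For example, an ets() wrapper might
#look like this (a sketch; it is not part of the comparison below):
etsForecast <- cmpfun(function(x, h, ...) {
  require(forecast)
  fit <- ets(x, ...)
  forecast(fit, h=h, level=99)$mean
})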
#Setup
set.seed(1)
library(quantmod)
library(forecast)
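#cv.ts uses foreach's %dopar%, so its steps run in parallel only if a parallel
#backend has been registered; otherwise foreach falls back to sequential
#execution (with a warning). Registering one is optional; doParallel is one
#choice, assuming it is installed:
if (require(doParallel)) {
  registerDoParallel(cores=2)
}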
#Load data
getSymbols('^GSPC', from='1980-01-01')
#Simplify to monthly level
GSPC <- to.monthly(GSPC)
Data <- as.ts(Cl(GSPC))  #zoo converts the yearmon index to a monthly ts starting in 1980
#Set up the cross-validation controls
myControl <- list(minObs=12,         #smallest training window
                  stepSize=1,        #move the forecast origin forward 1 month at a time
                  maxHorizon=12,     #forecast up to 12 months ahead
                  fixedWindow=TRUE,  #keep the training window a fixed length
                  preProcess=FALSE,  #no Box-Cox transform
                  summaryFunc=tsSummary)
#Cross-validate 3 models (model 3 is SLOW!)
model1 <- cv.ts(Data, meanForecast, myControl)
model2 <- cv.ts(Data, naiveForecast, myControl)
model3 <- cv.ts(Data, auto.arimaForecast, myControl, ic='bic', trace=TRUE)
#Find the RMSE for each model and create a matrix with our results
models <- list(model1,model2,model3)
models <- lapply(models, function(x) x[1:12,'RMSE'])
results <- do.call(cbind,models)
colnames(results) <- c('mean','naive','ar')
#Order models by average RMSE over the first 3 months
results <- t(results)
avgRMSE <- rowMeans(results[,1:3])
results <- results[order(avgRMSE),]
print(results)
#Plot RMSE by horizon for each model
color <- 1:nrow(results)
plot(results[1,], col=1, type='l', ylim=c(min(results), max(results)),
     xlab='Forecast horizon (months)', ylab='RMSE')
for (i in color) {
  lines(results[i,], col=i)
}
legend("topleft", legend=row.names(results), col=color, lty=1)