@ivannp · Created December 1, 2012
Back-testing SVM with e1071
svmComputeOneForecast = function(
        id,
        data,
        response,
        startPoints,
        endPoints,
        len,
        history=500,
        trace=FALSE,
        kernel="radial",
        gamma=10^(-5:-1),
        cost=10^(0:2),
        sampling="cross",
        seed=1234,
        featureSelection=c("add", "prune", "all"),
        cross=10)
{
    # Determine the forecast length
    startIndex = startPoints[id]
    endIndex = endPoints[id]
    forecastLength = endIndex - startIndex + 1

    # A row in the data is responsible for the corresponding value in the
    # response. Thus, to forecast day X, we train the model on the previous
    # *history* days, and then use the features for day X to forecast.
    xtsData = data[index(data)[(startIndex-history):(startIndex-1)]]
    xtsResponse = response[index(response)[(startIndex-history):(startIndex-1)]]

    # Convert the input data and response to a matrix and a vector, respectively
    xx = as.matrix( coredata( xtsData ) )
    yy = as.vector( coredata( xtsResponse ) )

    # We need to set the seed to have reproducible results
    set.seed( seed )

    if(featureSelection[1] == "add") {
        # Add features one by one, until the error cannot be improved
        best = NULL
        bestPerf = 1e9

        # Maintained sorted; the contents are the column indexes in the original matrix
        features = c()
        availableFeatures = seq(1, ncol(xx))

        # Use a greedy approach to add features
        repeat {
            bestColIdToAdd = 0L
            # print( features )
            for(colId in 1:length(availableFeatures)) {
                # Get the matrix for the current tuning and tune
                zz = xx[,sort(c(features, availableFeatures[colId]))]
                # print( paste( sep="", "trying adding feature ", availableFeatures[colId] ) )
                newSvm = tune( svm,
                               train.x=zz,
                               train.y=yy,
                               ranges=list( gamma=gamma, cost=cost ),
                               tunecontrol=tune.control( sampling=sampling, cross=cross ),
                               kernel=kernel )

                # Check the performance improvement
                newPerf = round( newSvm$best.performance, 8 )
                # print( paste( sep="", "new performance=", newPerf ) )
                if(newPerf < bestPerf) {
                    # print( paste( sep="", "old performance=", bestPerf, ", new performance=", newPerf ) )
                    best = newSvm
                    bestPerf = newPerf
                    bestColIdToAdd = colId
                }
            }

            if(bestColIdToAdd > 0) {
                # print( paste( sep="", "improvement, adding feature ", availableFeatures[bestColIdToAdd] ) )
                # Found an improvement, update the features
                features = sort(c(features, availableFeatures[bestColIdToAdd]))
                availableFeatures = availableFeatures[-bestColIdToAdd]

                # Exit if no features are left
                if(length(availableFeatures) == 0) break
            } else {
                # No improvement, done
                break
            }
        }
    } else {
        # Train the SVM over the full feature set
        # ss = svm( x=xx, y=yy, kernel=kernel, gamma=gamma[1], cost=cost[1] )
        best = tune( svm,
                     train.x=xx,
                     train.y=yy,
                     ranges=list( gamma=gamma, cost=cost ),
                     tunecontrol=tune.control( sampling=sampling, cross=cross ),
                     kernel=kernel )
        # print( "gotBest" )
        # print( paste( sep="", "performance=", round( best$best.performance, 6 ) ) )

        # An array to keep track of the original participating features (by index)
        features = seq(1, ncol(xx))
        # print( length( features ) )

        # Use a greedy approach to prune features
        if(featureSelection[1] == "prune") {
            repeat {
                bestColIdToRemove = 0L
                # print( features )
                for(colId in 1:ncol(xx)) {
                    # Remove column colId
                    zz = xx[,-colId]
                    # print( paste( sep="", "trying without feature ", colId ) )

                    # Tune with the reduced number of columns
                    newBest = tune( svm,
                                    train.x=zz,
                                    train.y=yy,
                                    ranges=list( gamma=gamma, cost=cost ),
                                    tunecontrol=tune.control( sampling=sampling, cross=cross ),
                                    kernel=kernel )
                    # print( paste( sep="", "new performance=", round( newBest$best.performance, 6 ) ) )
                    # Note: best is updated as soon as an improvement is found, so later
                    # candidates in the same pass must beat the already-improved score
                    if(round( newBest$best.performance, 6 ) < round( best$best.performance, 6 )) {
                        best = newBest
                        bestColIdToRemove = colId
                    }
                }

                if(bestColIdToRemove > 0) {
                    # Found an improvement
                    xx = xx[,-bestColIdToRemove]
                    features = features[-bestColIdToRemove]
                    # print( paste( sep="", "improvement, removed feature ", bestColIdToRemove ) )

                    # Break if there is only a single feature left
                    if(length(features) == 1) break
                } else {
                    # No improvement, done
                    break
                }
            }
        }
    }

    # print( paste( sep="", "final features: (", paste( sep=",", collapse=",", features ), ")" ) )

    # Predict using the SVM; use only the remaining features
    xtsNewData = data[index(data)[startIndex:endIndex]]
    newData = as.matrix( coredata( xtsNewData[,features] ) )
    fore = predict( best$best.model, newData )

    if( trace ) {
        str = paste( sep="",
                     "\n", index(response)[startIndex], "\n",
                     "=======================\n",
                     " from: ", head(index(xtsResponse), 1),
                     ", to: ", tail(index(xtsResponse), 1),
                     ", length: ", length(index(xtsResponse)),
                     "\n new data: from: ", head(index(xtsNewData), 1),
                     ", to: ", tail(index(xtsNewData), 1),
                     ", length: ", NROW(xtsNewData),
                     "\n forecast length: ", forecastLength,
                     "\n best model performance: ", round( best$best.performance, 6 ),
                     "\n best model features: (", paste( collapse=",", features ), ")",
                     "\n best model gamma: ", best$best.model$gamma,
                     "\n best model cost: ", best$best.model$cost,
                     "\n forecasts: ",
                     paste( collapse=", ", round( fore, 6 ) ),
                     "\n" )
        cat( sep="", str )
    }

    return( list( index=startIndex,
                  forecasts=fore,
                  performance=best$best.performance,
                  features=features,
                  gamma=best$best.model$gamma,
                  cost=best$best.model$cost ) )
}
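The grid search inside svmComputeOneForecast is plain e1071 tuning. Here is a minimal standalone sketch of the same tune() call on random data (the 200x5 matrix is purely illustrative), showing which fields of the result the function reads:

# Illustrative standalone version of the grid search used above
require(e1071)
set.seed(1234)
x = matrix( rnorm(200*5), ncol=5 )
y = rnorm(200)
# 10-fold cross-validated grid search over gamma and cost
tt = tune( svm,
           train.x=x,
           train.y=y,
           ranges=list( gamma=10^(-5:-1), cost=10^(0:2) ),
           tunecontrol=tune.control( sampling="cross", cross=10 ),
           kernel="radial" )
tt$best.performance   # the cross-validation error driving the feature selection
tt$best.model$gamma   # the selected hyper-parameters
tt$best.model$cost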
svmComputeForecasts = function(
        data,
        response,
        history=500,
        modelPeriod="days",
        modelPeriodMultiple=1,
        trace=TRUE,
        startDate,
        endDate,
        kernel="radial",
        gamma=10^(-5:-1),
        cost=10^(0:2),
        sampling="cross",
        cross=10,
        featureSelection=c("add", "prune", "all"),
        cores)
{
    require( e1071 )
    require( xts )
    require( parallel )   # for mclapply

    stopifnot( NROW( data ) == NROW( response ) )

    len = NROW( response )

    # Determine the starting index
    if( !missing( startDate ) )
    {
        startIndex = max( len - NROW( index( data[paste( sep="", startDate, "/" )] ) ) + 1,
                          history + 2 )
    }
    else
    {
        startIndex = history + 2
    }

    # Determine the ending index
    if( missing( endDate ) )
    {
        lastIndex = len
    }
    else
    {
        lastIndex = NROW( index( data[paste( sep="", "/", endDate )] ) )
    }

    if( startIndex > lastIndex )
    {
        return( NULL )
    }

    modelPeriod = tolower( modelPeriod[1] )

    forecasts = rep( NA, len )
    gammas = rep( NA, len )
    costs = rep( NA, len )
    performances = rep( NA, len )
    features = rep( "", len )

    # Get the interesting indexes
    periods = index(data)[startIndex:lastIndex]

    # Compute the end points for each period (day, week, month, etc).
    # endpoints() returns 0 followed by the last index of each period: for
    # example, with weekly periods and startIndex=503, endPoints of c(0,5,10)
    # become startPoints of c(503,508) and endPoints of c(507,512) after the
    # adjustments below.
    endPoints = endpoints( periods, modelPeriod, modelPeriodMultiple )

    # Compute the starting points of each period, relative to the *data* index
    startPoints = endPoints + startIndex

    # Remove the last start point - it falls outside the range
    length(startPoints) = length(startPoints) - 1

    # Make the end points relative to the *data* index
    endPoints = endPoints + startIndex - 1

    # Remove the first end point - it's always zero
    endPoints = tail( endPoints, -1 )

    stopifnot( length( endPoints ) == length( startPoints ) )

    if( missing( cores ) ) {
        cores = 1
    }

    res = mclapply( seq(1, length(startPoints)),
                    svmComputeOneForecast,
                    data=data,
                    response=response,
                    startPoints=startPoints,
                    endPoints=endPoints,
                    len=len,
                    history=history,
                    trace=trace,
                    kernel=kernel,
                    gamma=gamma,
                    cost=cost,
                    sampling=sampling,
                    cross=cross,
                    featureSelection=featureSelection,
                    mc.cores=cores )
    for( ll in res )
    {
        # Prepare the indexes
        ii = ll[["index"]]
        jj = ii + NROW( ll[["forecasts"]] ) - 1

        # Copy the output
        forecasts[ii:jj] = ll[["forecasts"]]
        gammas[ii:jj] = ll[["gamma"]]
        costs[ii:jj] = ll[["cost"]]
        performances[ii:jj] = ll[["performance"]]

        # Encode the participating features as a bit mask stored in a single
        # integer. This representation limits us to a maximum of 32 features.
        features[ii:jj] = sum( 2^( ll[["features"]] - 1 ) )
    }

    # Build the +1/0/-1 trading signal from the sign of the forecast
    sigUp = ifelse( forecasts >= 0, 1, 0 )
    sigUp[is.na( sigUp )] = 0

    sigDown = ifelse( forecasts < 0, -1, 0 )
    sigDown[is.na( sigDown )] = 0

    # forecasts[is.na( forecasts )] = 0

    sig = sigUp + sigDown

    res = merge( reclass( sig, response ),
                 reclass( sigUp, response ),
                 reclass( sigDown, response ),
                 na.trim( reclass( forecasts, response ) ),
                 reclass( performances, response ),
                 reclass( gammas, response ),
                 reclass( costs, response ),
                 reclass( features, response ),
                 all=F )
    colnames( res ) = c( "Indicator", "Up", "Down", "Forecasts", "Performance",
                         "Gamma", "Cost", "Features" )

    return( res )
}
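The Features column packs the selected column indexes into a single integer bit mask: bit i-1 set means feature i participated. A minimal sketch of decoding it back; decodeFeatures is a hypothetical helper, not part of the gist, and since the column is stored as character it must be converted first:

# Hypothetical helper: decode a feature bit mask back into column indexes;
# e.g. features c(1, 3, 14) are stored as 2^0 + 2^2 + 2^13 = 8197
decodeFeatures = function(mask) {
    which( as.integer( intToBits( as.numeric( mask ) ) ) == 1 )
}
# decodeFeatures( 8197 ) returns c(1, 3, 14)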
svmFeatures = function(series)
{
    require(quantmod)                 # for Cl, Vo and ROC
    require(PerformanceAnalytics)     # for skewness and kurtosis

    close = Cl(series)
    rets = na.trim(ROC(close, type="discrete"))

    # 1-day, 2-day, 3-day, 5-day, 10-day, 20-day and 50-day returns, all
    # lagged by one day so that row t contains only data known before day t
    res = merge(na.trim(lag(rets, 1)),
                na.trim(lag(ROC(close, type="discrete", n=2), 1)),
                na.trim(lag(ROC(close, type="discrete", n=3), 1)),
                na.trim(lag(ROC(close, type="discrete", n=5), 1)),
                na.trim(lag(ROC(close, type="discrete", n=10), 1)),
                na.trim(lag(ROC(close, type="discrete", n=20), 1)),
                na.trim(lag(ROC(close, type="discrete", n=50), 1)),
                all=FALSE)

    # Add the rolling (21-day) mean, median, sd, mad, skewness and kurtosis
    res = merge(res,
                xts(na.trim(lag(rollmean(rets, k=21, align="right"), 1))),
                xts(na.trim(lag(rollmedian(rets, k=21, align="right"), 1))),
                xts(na.trim(lag(rollapply(rets, width=21, align="right", FUN=sd), 1))),
                xts(na.trim(lag(rollapply(rets, width=21, align="right", FUN=mad), 1))),
                xts(na.trim(lag(rollapply(rets, width=21, align="right", FUN=skewness), 1))),
                xts(na.trim(lag(rollapply(rets, width=21, align="right", FUN=kurtosis), 1))),
                all=FALSE)

    # Add the volume with a lag of two
    res = merge(res, xts(na.trim(lag(Vo(series), 2))), all=FALSE)

    colnames(res) = c("ROC.1", "ROC.2", "ROC.3", "ROC.5", "ROC.10", "ROC.20", "ROC.50",
                      "MEAN", "MEDIAN", "SD", "MAD", "SKEW", "KURTOSIS",
                      "VOLUME")
    return(res)
}
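A minimal end-to-end sketch tying the three functions together; the SPY symbol, the date range and the core count are illustrative placeholders, not part of the gist:

# Illustrative run: build the features and the response, then back-test
require(quantmod)
spy = getSymbols( "SPY", from="2000-01-01", auto.assign=FALSE )

# The (lagged) features and the response: each day's return
feats = svmFeatures( spy )
rets = na.trim( ROC( Cl( spy ), type="discrete" ) )

# Align the features and the response on their common dates
merged = merge( feats, rets, all=FALSE )
data = merged[,1:NCOL( feats )]
response = merged[,NCOL( merged )]

# Back-test; on Unix-likes, cores > 1 parallelizes via mclapply
res = svmComputeForecasts( data, response, history=500, featureSelection="all", cores=2 )
head( res )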
@Rizwanrustam commented:

Sir, can you please suggest code for forecasting monthly univariate data with an SVM?
