# e1071.R -- SVM-based time series forecasting utilities
# Forked from ivannp/e1071.R
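
# Computes a single period's forecast: trains an SVM (via e1071::tune) on the
# *history* rows immediately preceding startPoints[id], optionally performs
# greedy forward ("add") or backward ("prune") feature selection, and predicts
# the rows from startPoints[id] through endPoints[id].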
svmComputeOneForecast = function(
        id,
        data,
        response,
        startPoints,
        endPoints,
        len,
        history=500,
        trace=FALSE,
        kernel="radial",
        gamma=10^(-5:-1),
        cost=10^(0:2),
        sampling="cross",
        seed=1234,
        featureSelection=c("add", "prune", "all"),
        cross=10)
{
    # Determine the forecast length
    startIndex = startPoints[id]
    endIndex = endPoints[id]
    forecastLength = endIndex - startIndex + 1

    # Each row in the data produces the corresponding value in the response.
    # Thus, to forecast day X, we train the model on the previous *history*
    # days, and then use the features for day X to forecast.
    xtsData = data[index(data)[(startIndex-history):(startIndex-1)]]
    xtsResponse = response[index(response)[(startIndex-history):(startIndex-1)]]

    # Convert the input data and response to a matrix and a vector, respectively
    xx = as.matrix( coredata( xtsData ) )
    yy = as.vector( coredata( xtsResponse ) )

    # Set the seed to get reproducible results
    set.seed( seed )
    if(featureSelection[1] == "add") {
        # Add features one by one until the error cannot be improved
        best = NULL
        bestPerf = 1e9

        # Kept sorted; the contents are column indexes into the original matrix
        features = c()
        availableFeatures = seq(1, ncol(xx))

        # Use a greedy approach to add features
        repeat {
            bestColIdToAdd = 0L
            # print( features )
            for(colId in seq_along(availableFeatures)) {
                # Get the matrix for the current tuning and tune.
                # drop=FALSE keeps zz a matrix even when only one column is selected.
                zz = xx[, sort(c(features, availableFeatures[colId])), drop=FALSE]
                # print(paste(sep="", "trying adding feature ", availableFeatures[colId]))
                newSvm = tune( svm,
                               train.x=zz,
                               train.y=yy,
                               ranges=list( gamma=gamma, cost=cost ),
                               tunecontrol=tune.control( sampling=sampling, cross=cross ),
                               kernel=kernel )

                # Check the performance improvement
                newPerf = round( newSvm$best.performance, 8 )
                # print( paste( sep="", "new performance=", newPerf ) )
                if(newPerf < bestPerf) {
                    # print( paste( sep="", "old performance=", bestPerf, ", new performance=", newPerf ) )
                    best = newSvm
                    bestPerf = newPerf
                    bestColIdToAdd = colId
                }
            }

            if(bestColIdToAdd > 0) {
                # print( paste( sep="", "improvement, adding feature ", availableFeatures[bestColIdToAdd] ) )
                # Found an improvement, update the features
                features = sort(c(features, availableFeatures[bestColIdToAdd]))
                availableFeatures = availableFeatures[-bestColIdToAdd]

                # Exit if no features are left
                if(length(availableFeatures) == 0) break
            } else {
                # No improvement, done
                break
            }
        }
    } else {
        # Train the SVM
        # ss = svm( x=xx, y=yy, kernel=kernel, gamma=gamma[1], cost=cost[1] )
        best = tune( svm,
                     train.x=xx,
                     train.y=yy,
                     ranges=list( gamma=gamma, cost=cost ),
                     tunecontrol=tune.control( sampling=sampling, cross=cross ),
                     kernel=kernel )
        # print( "gotBest" )
        # print( paste( sep="", "performance=", round( best$best.performance, 6 ) ) )

        # An array to keep track of the original participating features (by index)
        features = seq(1, ncol(xx))
        # print( length( features ) )

        # Use a greedy approach to prune features
        if(featureSelection[1] == "prune") {
            repeat {
                bestColIdToRemove = 0L
                # print( features )
                for(colId in 1:ncol(xx)) {
                    # Remove column colId; drop=FALSE keeps zz a matrix
                    zz = xx[, -colId, drop=FALSE]
                    # print( paste( sep="", "trying without feature ", colId ) )

                    # Tune with the reduced set of columns
                    newBest = tune( svm,
                                    train.x=zz,
                                    train.y=yy,
                                    ranges=list( gamma=gamma, cost=cost ),
                                    tunecontrol=tune.control( sampling=sampling, cross=cross ),
                                    kernel=kernel )
                    # print( paste( sep="", "new performance=", round( newBest$best.performance, 6 ) ) )
                    if(round( newBest$best.performance, 6 ) < round( best$best.performance, 6 )) {
                        best = newBest
                        bestColIdToRemove = colId
                    }
                }

                if(bestColIdToRemove > 0) {
                    # Found an improvement
                    xx = xx[, -bestColIdToRemove, drop=FALSE]
                    features = features[-bestColIdToRemove]
                    # print( paste( sep="", "improvement, removed feature ", bestColIdToRemove ) )

                    # Break if there is only a single feature left
                    if(length(features) == 1) break
                } else {
                    # No improvement, done
                    break
                }
            }
        }
    }
    # print( paste( sep="", "final features: (", paste( sep=",", collapse=",", features ), ")" ) )

    # Predict using the SVM, using only the remaining features
    xtsNewData = data[index(data)[startIndex:endIndex]]
    newData = as.matrix( coredata( xtsNewData[,features] ) )
    fore = predict( best$best.model, newData )

    if( trace ) {
        str = paste( sep="",
                     "\n", index(response)[startIndex], "\n",
                     "=======================\n",
                     " from: ", head(index(xtsResponse),1),
                     ", to: ", tail(index(xtsResponse),1),
                     ", length: ", length(index(xtsResponse)),
                     "\n new data: from: ", head(index(xtsNewData), 1),
                     ", to: ", tail(index(xtsNewData), 1),
                     ", length: ", NROW(xtsNewData),
                     "\n forecast length: ", forecastLength,
                     "\n best model performance: ", round( best$best.performance, 6 ),
                     "\n best model features: (", paste( collapse=",", features ), ")",
                     "\n best model gamma: ", best$best.model$gamma,
                     "\n best model cost: ", best$best.model$cost,
                     "\n forecasts: ",
                     paste( collapse=", ", round( fore, 6 ) ),
                     "\n" )
        cat( sep="", str )
    }

    return( list( index=startIndex,
                  forecasts=fore,
                  performance=best$best.performance,
                  features=features,
                  gamma=best$best.model$gamma,
                  cost=best$best.model$cost ) )
}
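
# Walks the input series period by period (days, weeks, months, etc.),
# dispatching one svmComputeOneForecast call per period (in parallel via
# mclapply) and assembling the results into a single xts object: the trading
# signal, the raw forecasts, the tuned parameters and a bit mask of the
# selected features.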
svmComputeForecasts = function(
        data,
        response,
        history=500,
        modelPeriod="days",
        modelPeriodMultiple=1,
        trace=TRUE,
        startDate,
        endDate,
        kernel="radial",
        gamma=10^(-5:-1),
        cost=10^(0:2),
        sampling="cross",
        cross=10,
        featureSelection=c("add", "prune", "all"),
        cores)
{
    require( e1071 )
    require( xts )       # index, endpoints, na.trim (via zoo)
    require( quantmod )  # reclass
    require( parallel )  # mclapply

    stopifnot( NROW( data ) == NROW( response ) )

    len = NROW( response )

    # Determine the starting index
    if( !missing( startDate ) )
    {
        startIndex = max( len - NROW( index( data[paste( sep="", startDate, "/" )] ) ) + 1,
                          history + 2 )
    }
    else
    {
        startIndex = history + 2
    }

    # Determine the ending index
    if( missing( endDate ) )
    {
        lastIndex = len
    }
    else
    {
        lastIndex = NROW( index( data[paste( sep="", "/", endDate )] ) )
    }

    if( startIndex > lastIndex )
    {
        return( NULL )
    }

    modelPeriod = tolower( modelPeriod[1] )

    forecasts = rep( NA, len )
    gammas = rep( NA, len )
    costs = rep( NA, len )
    performances = rep( NA, len )
    features = rep( "", len )

    # Get the interesting indexes
    periods = index(data)[startIndex:lastIndex]

    # Compute the end points for each period (day, week, month, etc.)
    endPoints = endpoints( periods, modelPeriod, modelPeriodMultiple )

    # Compute the starting point of each period, relative to the *data* index
    startPoints = endPoints + startIndex

    # Remove the last start point - it's outside the data range
    length(startPoints) = length(startPoints) - 1

    # Make the end points relative to the *data* index
    endPoints = endPoints + startIndex - 1

    # Remove the first end point - it's always zero
    endPoints = tail( endPoints, -1 )

    stopifnot( length( endPoints ) == length( startPoints ) )

    if( missing( cores ) ) {
        cores = 1
    }
    # Forward trace, sampling and cross to the worker; the original hard-coded
    # trace=TRUE and silently ignored the sampling and cross arguments.
    res = mclapply( seq(1, length(startPoints)),
                    svmComputeOneForecast,
                    data=data,
                    response=response,
                    startPoints=startPoints,
                    endPoints=endPoints,
                    len=len,
                    history=history,
                    trace=trace,
                    kernel=kernel,
                    gamma=gamma,
                    cost=cost,
                    sampling=sampling,
                    cross=cross,
                    featureSelection=featureSelection,
                    mc.cores=cores )
    for( ll in res )
    {
        # Prepare the indexes
        ii = ll[["index"]]
        jj = ii + NROW( ll[["forecasts"]] ) - 1

        # Copy the output
        forecasts[ii:jj] = ll[["forecasts"]]
        gammas[ii:jj] = ll[["gamma"]]
        costs[ii:jj] = ll[["cost"]]
        performances[ii:jj] = ll[["performance"]]

        # Encode the participating features as a bit mask stored in a single
        # integer. This representation limits us to at most 32 features.
        features[ii:jj] = sum( 2^( ll[["features"]] - 1 ) )
    }

    sigUp = ifelse( forecasts >= 0, 1, 0 )
    sigUp[is.na( sigUp )] = 0

    sigDown = ifelse( forecasts < 0, -1, 0 )
    sigDown[is.na( sigDown )] = 0

    # forecasts[is.na( forecasts )] = 0
    sig = sigUp + sigDown

    res = merge( reclass( sig, response ),
                 reclass( sigUp, response ),
                 reclass( sigDown, response ),
                 na.trim( reclass( forecasts, response ) ),
                 reclass( performances, response ),
                 reclass( gammas, response ),
                 reclass( costs, response ),
                 reclass( features, response ),
                 all=FALSE )
    colnames( res ) = c( "Indicator", "Up", "Down", "Forecasts", "Performance",
                         "Gamma", "Cost", "Features" )

    return( res )
}
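
# The "Features" column above stores the selected feature indexes encoded as a
# bit mask (sum of 2^(index-1)). A minimal decoding sketch; the helper name
# svmDecodeFeatures is ours, not part of the original gist. For example, a
# mask of 5 = 2^0 + 2^2 decodes back to features 1 and 3.
svmDecodeFeatures = function(mask)
{
    which( as.integer( intToBits( as.integer( mask ) ) ) == 1L )
}

# Builds the feature matrix used as the *data* argument above: lagged
# multi-horizon returns, rolling 21-day moments of the daily returns, and
# twice-lagged volume.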
svmFeatures = function(series)
{
    require(quantmod)              # Cl, Vo, lag.xts, and ROC via TTR
    require(PerformanceAnalytics)  # skewness, kurtosis

    close = Cl(series)
    rets = na.trim(ROC(close, type="discrete"))

    # 1-day, 2-day, 3-day, 5-day, 10-day, 20-day and 50-day returns, each
    # lagged by one day so that a row contains only information known before it
    res = merge(na.trim(lag(rets, 1)),
                na.trim(lag(ROC(close, type="discrete", n=2), 1)),
                na.trim(lag(ROC(close, type="discrete", n=3), 1)),
                na.trim(lag(ROC(close, type="discrete", n=5), 1)),
                na.trim(lag(ROC(close, type="discrete", n=10), 1)),
                na.trim(lag(ROC(close, type="discrete", n=20), 1)),
                na.trim(lag(ROC(close, type="discrete", n=50), 1)),
                all=FALSE)

    # Add the rolling 21-day mean, median, sd, mad, skewness and kurtosis
    res = merge(res,
                xts(na.trim(lag(rollmean(rets, k=21, align="right"), 1))),
                xts(na.trim(lag(rollmedian(rets, k=21, align="right"), 1))),
                xts(na.trim(lag(rollapply(rets, width=21, align="right", FUN=sd), 1))),
                xts(na.trim(lag(rollapply(rets, width=21, align="right", FUN=mad), 1))),
                xts(na.trim(lag(rollapply(rets, width=21, align="right", FUN=skewness), 1))),
                xts(na.trim(lag(rollapply(rets, width=21, align="right", FUN=kurtosis), 1))),
                all=FALSE)

    # Add volume with a lag of two days
    res = merge(res, xts(na.trim(lag(Vo(series), 2))), all=FALSE)

    colnames(res) = c("ROC.1", "ROC.2", "ROC.3", "ROC.5", "ROC.10", "ROC.20", "ROC.50",
                      "MEAN", "MEDIAN", "SD", "MAD", "SKEW", "KURTOSIS",
                      "VOLUME")
    return(res)
}
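
# A minimal end-to-end usage sketch (kept commented out so the file can be
# sourced cleanly); the symbol, dates and core count are illustrative
# placeholders, and quantmod is assumed to be installed:
#
#   spy = getSymbols("SPY", from="1994-01-01", auto.assign=FALSE)
#   feats = svmFeatures(spy)
#   rets = na.trim(ROC(Cl(spy), type="discrete"))
#   merged = merge(feats, rets, all=FALSE)
#   res = svmComputeForecasts(merged[, 1:ncol(feats)],
#                             merged[, ncol(feats) + 1],
#                             startDate="2010-01-01",
#                             featureSelection="all",
#                             cores=2)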