Created
December 31, 2020 15:04
-
-
Save stefanjwojcik/444e9a25c11eb96f382696e94c88743d to your computer and use it in GitHub Desktop.
Playing around with forecasting gains in SP 500 time series data with XGBoost classifiers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using AlphaVantage | |
using DataFrames | |
using StatPlots | |
using Dates | |
using TimeSeries | |
using Statistics | |
using MLJ | |
using LossFunctions | |
## Decide on some time series to model: | |
# - or something like the E-S&P, where one could | |
# - create target as the % change in the market | |
# - create lags for 365 days, which will be fed into the lasso | |
# - create binaries for days of the week and months, which will be fed into the lasso | |
# - decide on which pct change to model | |
# - use a lasso to select proper lags to estimate the likelihood of significant increase | |
# - use data at a day scale to make estimates | |
# - use purged cross-validation | |
# - goal is to show mean-reversion properties after controlling for trend | |
#gr(size=(800,470)) | |
# Get daily S&P 500 data | |
spy = time_series_daily("SPY", outputsize="full", datatype="csv"); | |
# Convert to a DataFrame | |
data = DataFrame(spy[1]); | |
# Add column names | |
data = DataFrames.rename(data, Symbol.(vcat(spy[2]...))); | |
# Convert timestamp column to Date type | |
data[!, :timestamp] = Dates.Date.(data[!, :timestamp]); | |
data[!, :close] = Float64.(data[!, :close]) | |
data[!, :volume] = Int64.(data[!, :volume]) | |
# create a timearray | |
ta = TimeArray(data[!, [:timestamp, :close, :volume]]; timestamp=:timestamp) | |
# Subtract the values of prior to get the de-seasoned values | |
# or use the seasonal values to predict the effect of seasons | |
## Get the percent change in each measure | |
pch = percentchange(ta, padding=true) | |
# Extend lag function to return an array of lags of arbitrary size | |
rangelag = function(ta::TimeArray{Float64,1,Date,Array{Float64,1}}, lagrange::UnitRange{Int64}; padding=true) | |
n_lags = maximum(lagrange) | |
lagarray = zeros(size(ta)[1], n_lags) | |
# create lag array - using lags up to thirty | |
for x in lagrange | |
lagarray[:, x] .= values(lag(ta, x, padding=padding)) | |
end | |
# the replacement name | |
string_name = string(colnames(ta)[1]) | |
# The final result | |
range_lagarray = lagarray |> | |
(df -> DataFrame(df, Symbol.([string_name*"_l$x" for x in 1:size(lagarray)[2]]))) |> | |
(df -> insertcols!(df, 1, :timestamp => timestamp(ta))) | |
lagarray_ts = TimeArray(range_lagarray, timestamp = :timestamp) | |
return lagarray_ts | |
end | |
## Create lags of closing value | |
closing_lags = rangelag(pch[:close], 1:365) | |
## Create lags of volume | |
volume_lags = rangelag(pch[:volume], 1:365) | |
# Get the seasonal component for day of week, and the month | |
# create a bunch of binaries for each day of the week and calendar month 0 | |
get_weekly_binaries = function(ta::TimeArray) | |
weekday_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"] | |
weekday_dates = [ timestamp(when(ta, dayname, "$day")) for day in weekday_names] | |
week_binaries = [timestamp(ta) .∈ (weekday_dates[x], ) for x in 1:size(weekday_dates)[1]] | |
useable_array = hcat(week_binaries...) | |
return useable_array | |
end | |
## Monthly binaries | |
get_monthly_binaries = function(ta::TimeArray) | |
month_names = [Dates.monthname(x) for x in 1:12] | |
month_dates = [timestamp(when(ta, monthname, "$mo")) for mo in month_names] | |
month_binaries = [timestamp(ta) .∈ (month_dates[x], ) for x in 1:size(month_dates)[1]] | |
useable_array = hcat(month_binaries...) | |
return useable_array | |
end | |
#Create the actual weekly binaries | |
seasonal_weeklies = get_weekly_binaries(pch) | |
seasonal_weeklies = DataFrame(seasonal_weeklies, [Dates.dayname(x) for x in 1:5]) | |
seasonal_monthlies = get_monthly_binaries(pch) | |
seasonal_monthlies = DataFrame(seasonal_monthlies, [Dates.monthname(x) for x in 1:12]) | |
## Create the final target, based on the distribution of percent_change | |
closing_quantiles = quantile( DataFrame(dropnan(pch)).close, [.25, .75]) | |
gen_target = function(target_array, quants) | |
res = [] | |
for x in target_array | |
if (x <= quants[1]) | |
push!(res, "25th") | |
elseif (x <= quants[2]) | |
push!(res, "50th") | |
else | |
push!(res, "75th") | |
end | |
end | |
return res | |
end | |
my_target = gen_target(values(pch.close), closing_quantiles) | |
my_target = DataFrame(my_target = string.(my_target)) | |
## Collect everything into a dataframe for ML | |
df_full = hcat(my_target, seasonal_monthlies, seasonal_weeklies, | |
DataFrame(closing_lags)[:, 2:end], | |
DataFrame(volume_lags)[:, 2:end]) | |
df_full.timestamp = timestamp(closing_lags) | |
df_full.my_target = categorical(df_full.my_target) | |
## Filter out the missing values | |
df_full_nomissing = | |
filter(:close_l365 => x -> !any(f -> f(x), (ismissing, isnothing, isnan)), df_full) |> | |
(df -> select(df, Not(:timestamp))) |> | |
(df -> coerce(df, Count=>Continuous)) | |
## Create outcome matrix | |
fullY, fullX = unpack(df_full_nomissing, ==(:my_target), colname -> true) | |
# Create training and testing data | |
trainrows, testrows = partition(1:nrow(df_full_nomissing), .7) | |
# Specify train test pairs | |
fold1 = 1:6; fold2 = 7:12; | |
# write a function to do windowed cross-validation | |
gen_ts_cv_pairs = function(trainsize, window_size) | |
cv_pairs = [] #output object | |
for it in 1:(trainsize-window_size*2) | |
trainset = it:window_size+it | |
testset = window_size+it+1:window_size*2+it | |
push!(cv_pairs, (trainset, testset)) | |
end | |
return cv_pairs | |
end | |
# Create resampling pairs | |
#resampling_pairs = Tuple.(gen_ts_cv_pairs(length(trainrows), 800)) | |
resampling_pairs = Tuple.([ (1:2312, 2313:3470), (1156:3470, 1:1155) ]) | |
## | |
### 20 meta models | |
@load XGBoostClassifier() | |
xgb = XGBoostClassifier() | |
r_eta = range(xgb, :eta, lower=.01, upper=.4) | |
tm = TunedModel(model=xgb, tuning=Grid(resolution=20), | |
resampling=CV(nfolds=5, shuffle=true, rng=1234), ranges=r_eta, | |
measure=cross_entropy) | |
mtm = machine(tm, fullX, fullY) | |
fit!(mtm, rows=trainrows) | |
yhat = predict(mtm, rows=trainrows) | |
mce = cross_entropy(yhat, fullY[trainrows]) |> mean | |
accuracy(predict_mode(mtm, rows=testrows), fullY[testrows]) | |
############### SVM Classifier | |
@load SVMLinearClassifier | |
svm = SVMLinearClassifier() | |
r_c = range(svm, :C, lower=.1, upper=1) | |
tm = TunedModel(model=svm, tuning=Grid(resolution=2), | |
resampling=resampling_pairs, ranges=r_c) | |
svtm = machine(svm, fullX, fullY) | |
fit!(svtm, rows=trainrows) | |
accuracy(predict(svtm, rows=testrows), fullY[testrows]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment