@stefanjwojcik
Created December 31, 2020 15:04
Playing around with forecasting gains in S&P 500 time series data with XGBoost classifiers
using AlphaVantage
using DataFrames
using StatsPlots
using Dates
using TimeSeries
using Statistics
using MLJ
using LossFunctions
## Decide on some time series to model:
# - something like the E-mini S&P (the code below uses daily SPY data), for which one could:
# - create the target from the daily % change in the market
# - create lags of up to 365 days, to be fed into the model
# - create binaries for day of week and calendar month, also fed into the model
# - decide which part of the pct-change distribution to model
# - use a lasso to select informative lags and estimate the likelihood of a significant increase
#   (the code below ends up using XGBoost and SVM classifiers instead)
# - use data at a daily scale to make estimates
# - use purged cross-validation (a simplified sketch follows this preamble)
# - goal is to show mean-reversion properties after controlling for trend
#gr(size=(800,470))
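# The purged cross-validation mentioned above is only sketched here for reference.
# This helper (purged_cv_pairs and its purge parameter) is illustrative and is NOT
# used below; the tuning further down uses shuffled CV and hand-built folds instead.
# Idea: hold out contiguous test blocks and drop a purge buffer of observations on
# either side from the training indices, so lagged features spanning the train/test
# boundary cannot leak information.
function purged_cv_pairs(n::Int, nfolds::Int; purge::Int=5)
    fold_size = n ÷ nfolds
    pairs = Tuple{Vector{Int},Vector{Int}}[]
    for k in 1:nfolds
        test_start = (k - 1) * fold_size + 1
        test_stop  = k == nfolds ? n : k * fold_size
        test_idx   = collect(test_start:test_stop)
        # keep training indices strictly outside the purged window around the test block
        train_idx  = [i for i in 1:n if i < test_start - purge || i > test_stop + purge]
        push!(pairs, (train_idx, test_idx))
    end
    return pairs
end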
# Get daily S&P 500 data
spy = time_series_daily("SPY", outputsize="full", datatype="csv");
# Convert to a DataFrame
data = DataFrame(spy[1]);
# Add column names
data = DataFrames.rename(data, Symbol.(vcat(spy[2]...)));
# Convert timestamp column to Date type
data[!, :timestamp] = Dates.Date.(data[!, :timestamp]);
data[!, :close] = Float64.(data[!, :close])
data[!, :volume] = Int64.(data[!, :volume])
# create a timearray
ta = TimeArray(data[!, [:timestamp, :close, :volume]]; timestamp=:timestamp)
# One could subtract prior-period values to de-season the series,
# or use the seasonal binaries built below to model the effect of seasons directly.
## Get the percent change in each column (close and volume)
pch = percentchange(ta, padding=true)
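# Quick illustration (toy data, not part of the original analysis) of what
# percentchange(..., padding=true) produces: simple returns (p_t - p_{t-1}) / p_{t-1},
# with NaN in the first row so the result keeps the same length as the input.
toy_prices = TimeArray(collect(Date(2020, 1, 1):Day(1):Date(2020, 1, 4)), [100.0, 101.0, 99.0, 99.0])
values(percentchange(toy_prices, padding=true))  # ≈ [NaN, 0.01, -0.0198, 0.0]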
# Extend lag function to return an array of lags over an arbitrary range
rangelag = function(ta::TimeArray{Float64,1,Date,Array{Float64,1}}, lagrange::UnitRange{Int64}; padding=true)
    n_lags = maximum(lagrange)
    lagarray = zeros(size(ta)[1], n_lags)
    # fill one column per lag in the requested range (NaN-padded at the start)
    for x in lagrange
        lagarray[:, x] .= values(lag(ta, x, padding=padding))
    end
    # base name for the generated lag columns, taken from the input series
    string_name = string(colnames(ta)[1])
    # assemble a DataFrame of lags, re-attach the timestamps, and convert back to a TimeArray
    range_lagarray = lagarray |>
        (df -> DataFrame(df, Symbol.([string_name*"_l$x" for x in 1:size(lagarray)[2]]))) |>
        (df -> insertcols!(df, 1, :timestamp => timestamp(ta)))
    lagarray_ts = TimeArray(range_lagarray, timestamp = :timestamp)
    return lagarray_ts
end
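# Sanity check on toy data (toy_series/toy_lags are illustrative, not part of the gist):
# three lags of a short series; the first rows of each lag column are NaN because padding=true.
toy_series = TimeArray(collect(Date(2020, 1, 1):Day(1):Date(2020, 1, 6)), [1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
toy_lags = rangelag(toy_series, 1:3)
# values(toy_lags) is 6 x 3: column 1 is the series shifted by one day, column 2 by two days, etc.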
## Create lags of closing value
closing_lags = rangelag(pch[:close], 1:365)
## Create lags of volume
volume_lags = rangelag(pch[:volume], 1:365)
# Get the seasonal component for day of week and month:
# create a set of binary indicators for each trading weekday and each calendar month
get_weekly_binaries = function(ta::TimeArray)
    weekday_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"]
    # timestamps falling on each weekday
    weekday_dates = [timestamp(when(ta, dayname, "$day")) for day in weekday_names]
    # one boolean column per weekday
    week_binaries = [timestamp(ta) .∈ (weekday_dates[x],) for x in 1:size(weekday_dates)[1]]
    useable_array = hcat(week_binaries...)
    return useable_array
end
## Monthly binaries
get_monthly_binaries = function(ta::TimeArray)
    month_names = [Dates.monthname(x) for x in 1:12]
    # timestamps falling in each calendar month
    month_dates = [timestamp(when(ta, monthname, "$mo")) for mo in month_names]
    # one boolean column per month
    month_binaries = [timestamp(ta) .∈ (month_dates[x],) for x in 1:size(month_dates)[1]]
    useable_array = hcat(month_binaries...)
    return useable_array
end
# Create the weekly and monthly binary indicators and label their columns
seasonal_weeklies = get_weekly_binaries(pch)
seasonal_weeklies = DataFrame(seasonal_weeklies, [Dates.dayname(x) for x in 1:5])
seasonal_monthlies = get_monthly_binaries(pch)
seasonal_monthlies = DataFrame(seasonal_monthlies, [Dates.monthname(x) for x in 1:12])
## Create the final target, based on the distribution of percent_change
closing_quantiles = quantile( DataFrame(dropnan(pch)).close, [.25, .75])
# Bucket each daily return: "25th" = at or below the 25th percentile,
# "50th" = between the 25th and 75th percentiles, "75th" = above the 75th percentile
gen_target = function(target_array, quants)
    res = []
    for x in target_array
        if (x <= quants[1])
            push!(res, "25th")
        elseif (x <= quants[2])
            push!(res, "50th")
        else
            push!(res, "75th")
        end
    end
    return res
end
my_target = gen_target(values(pch.close), closing_quantiles)
my_target = DataFrame(my_target = string.(my_target))
## Collect everything into a dataframe for ML
df_full = hcat(my_target, seasonal_monthlies, seasonal_weeklies,
               DataFrame(closing_lags)[:, 2:end],
               DataFrame(volume_lags)[:, 2:end])
df_full.timestamp = timestamp(closing_lags)
df_full.my_target = categorical(df_full.my_target)
## Filter out rows where the longest lag is NaN/missing, drop the timestamp, and coerce counts to Continuous
df_full_nomissing =
    filter(:close_l365 => x -> !any(f -> f(x), (ismissing, isnothing, isnan)), df_full) |>
    (df -> select(df, Not(:timestamp))) |>
    (df -> coerce(df, Count=>Continuous))
## Create outcome matrix
fullY, fullX = unpack(df_full_nomissing, ==(:my_target), colname -> true)
# Create training and testing data
trainrows, testrows = partition(1:nrow(df_full_nomissing), .7)
# Toy example of hand-specified train/test fold indices (not used directly below)
fold1 = 1:6; fold2 = 7:12;
# Write a function to do windowed (rolling-origin) cross-validation
gen_ts_cv_pairs = function(trainsize, window_size)
    cv_pairs = [] # output object
    # roll a training window forward one observation at a time,
    # testing on the window_size observations that immediately follow it
    for it in 1:(trainsize-window_size*2)
        trainset = it:window_size+it
        testset = window_size+it+1:window_size*2+it
        push!(cv_pairs, (trainset, testset))
    end
    return cv_pairs
end
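# Illustration with toy sizes (not used below): with a training size of 10 and a
# window of 3, the function rolls a 4-observation training window forward one step
# at a time and tests on the 3 observations that immediately follow it:
gen_ts_cv_pairs(10, 3)
# 4-element Vector{Any}:
#  (1:4, 5:7)
#  (2:5, 6:8)
#  (3:6, 7:9)
#  (4:7, 8:10)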
# Create resampling pairs
#resampling_pairs = Tuple.(gen_ts_cv_pairs(length(trainrows), 800))
resampling_pairs = Tuple.([ (1:2312, 2313:3470), (1156:3470, 1:1155) ])
##
### Tune an XGBoost classifier over 20 candidate values of eta (20 "meta" models)
@load XGBoostClassifier
xgb = XGBoostClassifier()
r_eta = range(xgb, :eta, lower=.01, upper=.4)
tm = TunedModel(model=xgb, tuning=Grid(resolution=20),
                resampling=CV(nfolds=5, shuffle=true, rng=1234), ranges=r_eta,
                measure=cross_entropy)
mtm = machine(tm, fullX, fullY)
fit!(mtm, rows=trainrows)
yhat = predict(mtm, rows=trainrows)
mce = cross_entropy(yhat, fullY[trainrows]) |> mean
accuracy(predict_mode(mtm, rows=testrows), fullY[testrows])
############### SVM Classifier
@load SVMLinearClassifier
svm = SVMLinearClassifier()
r_c = range(svm, :C, lower=.1, upper=1)
tm = TunedModel(model=svm, tuning=Grid(resolution=2),
                resampling=resampling_pairs, ranges=r_c)
# wrap the tuned model (not the bare svm) in a machine, mirroring the XGBoost section
svtm = machine(tm, fullX, fullY)
fit!(svtm, rows=trainrows)
accuracy(predict(svtm, rows=testrows), fullY[testrows])