@stefanjwojcik
Created December 31, 2020 15:04
Playing around with forecasting gains in S&P 500 time series data with XGBoost classifiers
using AlphaVantage
using DataFrames
using StatsPlots
using Dates
using TimeSeries
using Statistics
using MLJ
using LossFunctions
## Decide on some time series to model:
# - something like the E-mini S&P (the code below uses daily SPY data), for which one could:
# - create the target from the daily % change in the market
# - create lags of up to 365 days, to be fed into the model
# - create binaries for day of week and calendar month, also fed into the model
# - decide which part of the pct-change distribution to model
# - use a lasso to select informative lags and estimate the likelihood of a significant increase
#   (the code below ends up using XGBoost and SVM classifiers instead)
# - use data at a daily scale to make estimates
# - use purged cross-validation (a simplified sketch follows this preamble)
# - goal is to show mean-reversion properties after controlling for trend
#gr(size=(800,470))
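# The purged cross-validation mentioned above is only sketched here for reference.
# This helper (purged_cv_pairs and its purge parameter) is illustrative and is NOT
# used below; the tuning further down uses shuffled CV and hand-built folds instead.
# Idea: hold out contiguous test blocks and drop a purge buffer of observations on
# either side from the training indices, so lagged features spanning the train/test
# boundary cannot leak information.
function purged_cv_pairs(n::Int, nfolds::Int; purge::Int=5)
    fold_size = n ÷ nfolds
    pairs = Tuple{Vector{Int},Vector{Int}}[]
    for k in 1:nfolds
        test_start = (k - 1) * fold_size + 1
        test_stop  = k == nfolds ? n : k * fold_size
        test_idx   = collect(test_start:test_stop)
        # keep training indices strictly outside the purged window around the test block
        train_idx  = [i for i in 1:n if i < test_start - purge || i > test_stop + purge]
        push!(pairs, (train_idx, test_idx))
    end
    return pairs
end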
# Get daily S&P 500 data
spy = time_series_daily("SPY", outputsize="full", datatype="csv");
# Convert to a DataFrame
data = DataFrame(spy[1]);
# Add column names
data = DataFrames.rename(data, Symbol.(vcat(spy[2]...)));
# Convert timestamp column to Date type
data[!, :timestamp] = Dates.Date.(data[!, :timestamp]);
data[!, :close] = Float64.(data[!, :close])
data[!, :volume] = Int64.(data[!, :volume])
# create a timearray
ta = TimeArray(data[!, [:timestamp, :close, :volume]]; timestamp=:timestamp)
# One could subtract prior-period values to de-season the series,
# or use the seasonal binaries built below to model the effect of seasons directly.
## Get the percent change in each column (close and volume)
pch = percentchange(ta, padding=true)
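# Quick illustration (toy data, not part of the original analysis) of what
# percentchange(..., padding=true) produces: simple returns (p_t - p_{t-1}) / p_{t-1},
# with NaN in the first row so the result keeps the same length as the input.
toy_prices = TimeArray(collect(Date(2020, 1, 1):Day(1):Date(2020, 1, 4)), [100.0, 101.0, 99.0, 99.0])
values(percentchange(toy_prices, padding=true))  # ≈ [NaN, 0.01, -0.0198, 0.0]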
# Extend lag function to return an array of lags over an arbitrary range
rangelag = function(ta::TimeArray{Float64,1,Date,Array{Float64,1}}, lagrange::UnitRange{Int64}; padding=true)
    n_lags = maximum(lagrange)
    lagarray = zeros(size(ta)[1], n_lags)
    # fill one column per lag in the requested range (NaN-padded at the start)
    for x in lagrange
        lagarray[:, x] .= values(lag(ta, x, padding=padding))
    end
    # base name for the generated lag columns, taken from the input series
    string_name = string(colnames(ta)[1])
    # assemble a DataFrame of lags, re-attach the timestamps, and convert back to a TimeArray
    range_lagarray = lagarray |>
        (df -> DataFrame(df, Symbol.([string_name*"_l$x" for x in 1:size(lagarray)[2]]))) |>
        (df -> insertcols!(df, 1, :timestamp => timestamp(ta)))
    lagarray_ts = TimeArray(range_lagarray, timestamp = :timestamp)
    return lagarray_ts
end
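# Sanity check on toy data (toy_series/toy_lags are illustrative, not part of the gist):
# three lags of a short series; the first rows of each lag column are NaN because padding=true.
toy_series = TimeArray(collect(Date(2020, 1, 1):Day(1):Date(2020, 1, 6)), [1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
toy_lags = rangelag(toy_series, 1:3)
# values(toy_lags) is 6 x 3: column 1 is the series shifted by one day, column 2 by two days, etc.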
## Create lags of closing value
closing_lags = rangelag(pch[:close], 1:365)
## Create lags of volume
volume_lags = rangelag(pch[:volume], 1:365)
# Get the seasonal component for day of week and month:
# create a set of binary indicators for each trading weekday and each calendar month
get_weekly_binaries = function(ta::TimeArray)
    weekday_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"]
    # timestamps falling on each weekday
    weekday_dates = [timestamp(when(ta, dayname, "$day")) for day in weekday_names]
    # one boolean column per weekday
    week_binaries = [timestamp(ta) .∈ (weekday_dates[x],) for x in 1:size(weekday_dates)[1]]
    useable_array = hcat(week_binaries...)
    return useable_array
end
## Monthly binaries
get_monthly_binaries = function(ta::TimeArray)
    month_names = [Dates.monthname(x) for x in 1:12]
    # timestamps falling in each calendar month
    month_dates = [timestamp(when(ta, monthname, "$mo")) for mo in month_names]
    # one boolean column per month
    month_binaries = [timestamp(ta) .∈ (month_dates[x],) for x in 1:size(month_dates)[1]]
    useable_array = hcat(month_binaries...)
    return useable_array
end
# Create the weekly and monthly binary indicators and label their columns
seasonal_weeklies = get_weekly_binaries(pch)
seasonal_weeklies = DataFrame(seasonal_weeklies, [Dates.dayname(x) for x in 1:5])
seasonal_monthlies = get_monthly_binaries(pch)
seasonal_monthlies = DataFrame(seasonal_monthlies, [Dates.monthname(x) for x in 1:12])
## Create the final target, based on the distribution of percent_change
closing_quantiles = quantile( DataFrame(dropnan(pch)).close, [.25, .75])
# Bucket each daily return: "25th" = at or below the 25th percentile,
# "50th" = between the 25th and 75th percentiles, "75th" = above the 75th percentile
gen_target = function(target_array, quants)
    res = []
    for x in target_array
        if (x <= quants[1])
            push!(res, "25th")
        elseif (x <= quants[2])
            push!(res, "50th")
        else
            push!(res, "75th")
        end
    end
    return res
end
my_target = gen_target(values(pch.close), closing_quantiles)
my_target = DataFrame(my_target = string.(my_target))
## Collect everything into a dataframe for ML
df_full = hcat(my_target, seasonal_monthlies, seasonal_weeklies,
               DataFrame(closing_lags)[:, 2:end],
               DataFrame(volume_lags)[:, 2:end])
df_full.timestamp = timestamp(closing_lags)
df_full.my_target = categorical(df_full.my_target)
## Filter out rows where the longest lag is NaN/missing, drop the timestamp, and coerce counts to Continuous
df_full_nomissing =
    filter(:close_l365 => x -> !any(f -> f(x), (ismissing, isnothing, isnan)), df_full) |>
    (df -> select(df, Not(:timestamp))) |>
    (df -> coerce(df, Count=>Continuous))
## Create outcome matrix
fullY, fullX = unpack(df_full_nomissing, ==(:my_target), colname -> true)
# Create training and testing data
trainrows, testrows = partition(1:nrow(df_full_nomissing), .7)
# Toy example of hand-specified train/test fold indices (not used directly below)
fold1 = 1:6; fold2 = 7:12;
# Write a function to do windowed (rolling-origin) cross-validation
gen_ts_cv_pairs = function(trainsize, window_size)
    cv_pairs = [] # output object
    # roll a training window forward one observation at a time,
    # testing on the window_size observations that immediately follow it
    for it in 1:(trainsize-window_size*2)
        trainset = it:window_size+it
        testset = window_size+it+1:window_size*2+it
        push!(cv_pairs, (trainset, testset))
    end
    return cv_pairs
end
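# Illustration with toy sizes (not used below): with a training size of 10 and a
# window of 3, the function rolls a 4-observation training window forward one step
# at a time and tests on the 3 observations that immediately follow it:
gen_ts_cv_pairs(10, 3)
# 4-element Vector{Any}:
#  (1:4, 5:7)
#  (2:5, 6:8)
#  (3:6, 7:9)
#  (4:7, 8:10)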
# Create resampling pairs
#resampling_pairs = Tuple.(gen_ts_cv_pairs(length(trainrows), 800))
resampling_pairs = Tuple.([ (1:2312, 2313:3470), (1156:3470, 1:1155) ])
##
### Tune an XGBoost classifier over 20 candidate values of eta (20 "meta" models)
@load XGBoostClassifier
xgb = XGBoostClassifier()
r_eta = range(xgb, :eta, lower=.01, upper=.4)
tm = TunedModel(model=xgb, tuning=Grid(resolution=20),
                resampling=CV(nfolds=5, shuffle=true, rng=1234), ranges=r_eta,
                measure=cross_entropy)
mtm = machine(tm, fullX, fullY)
fit!(mtm, rows=trainrows)
yhat = predict(mtm, rows=trainrows)
mce = cross_entropy(yhat, fullY[trainrows]) |> mean
accuracy(predict_mode(mtm, rows=testrows), fullY[testrows])
############### SVM Classifier
@load SVMLinearClassifier
svm = SVMLinearClassifier()
r_c = range(svm, :C, lower=.1, upper=1)
tm = TunedModel(model=svm, tuning=Grid(resolution=2),
                resampling=resampling_pairs, ranges=r_c)
# wrap the tuned model (not the bare svm) in a machine, mirroring the XGBoost section
svtm = machine(tm, fullX, fullY)
fit!(svtm, rows=trainrows)
accuracy(predict(svtm, rows=testrows), fullY[testrows])