'''
A simple neural network attempt at price prediction - it does not do a good
job at all, and therefore SHOULD NOT BE USED FOR ANY REAL TRADING/INVESTING
'''
import numba as nb
import numpy as np
import pandas as pd
import yfinance as yf
from typing import Tuple
# NN imports
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
# Imports for evaluating the network
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Global variables
EPOCHS = 10
BATCH_SIZE = 8
VALIDATION = 0.1
LEARN_RATE = 1e-3
TRAIN_SPLIT = 0.8
FEAT_LENGTH = 25
FEAT_COLS = ['Open', 'Low', 'High', 'Close', 'Volume']
TICKERS = ['SPY', 'TSLA', 'AAPL', 'GOOG', 'AMZN']
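# Each NN input sample consists of FEAT_LENGTH lagged returns for each of the
# FEAT_COLS, so the input layer sees FEAT_LENGTH*len(FEAT_COLS) = 125 values
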
def time_series(
        df: pd.DataFrame,
        col: pd.Series,
        name: str,
        ) -> pd.DataFrame:
    '''
    Form the lagged columns for this feature
    '''
    return df.assign(**{
        f'{name}_t-{lag}': col.shift(lag)
        for lag in range(0, FEAT_LENGTH)
    })
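
# As an example, with name = 'feat_Close_ret' and FEAT_LENGTH = 3, time_series
# would add the columns 'feat_Close_ret_t-0', 'feat_Close_ret_t-1' and
# 'feat_Close_ret_t-2', holding today's return and the two before it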
def get_lagged_returns(df: pd.DataFrame) -> pd.DataFrame:
'''
For each of the feature cols, find the returns and then form the lagged
time-series as new columns
'''
for col in FEAT_COLS:
return_col = df[col]/df[col].shift(1)-1
df = time_series(df, return_col, f'feat_{col}_ret')
return df
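
# The shift in the return calculation leaves a NaN in the first row of each
# ticker, and the lags then spread NaNs through the first FEAT_LENGTH rows;
# all of these rows are removed by the dropna in get_nn_data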
def get_classification(df: pd.DataFrame) -> pd.DataFrame:
'''
Get the classifications for the NN, which are as follows:
0 = The closing price goes down tomorrow
1 = The closing price goes up tomorrow
'''
df['close_tomorrow'] = df['Close'].shift(-1)
conditions = [
df['close_tomorrow'] <= df['Close'],
df['close_tomorrow'] > df['Close'],
]
df['classification'] = np.select(
condlist = conditions,
choicelist = [0, 1],
)
return df
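
# The last row of each ticker has no tomorrow's close, so its close_tomorrow
# is NaN and the row is also removed by the dropna in get_nn_data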
@nb.jit(nopython = True)
def explicit_heat_smooth(vals: np.ndarray, t_end: float) -> np.ndarray:
    '''
    Smooth a time-series by evolving it under the 1d heat equation, using an
    explicit finite-difference scheme with the two end points held fixed
    '''
    k = 0.1 # Time spacing
    # Set up the initial condition
    P = vals
    t = 0
    while t < t_end:
        # Update the interior points: P_i <- P_i + k*(P_{i+1} - 2*P_i + P_{i-1})
        P = k*(P[2:] + P[:-2]) + P[1:-1]*(1-2*k)
        # Pin the original boundary values back onto the smoothed interior
        P = np.hstack((
            np.array([vals[0]]),
            P,
            np.array([vals[-1]]),
        ))
        t += k
    return P
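
# The explicit scheme above is numerically stable provided k <= 0.5 (with unit
# grid spacing), which k = 0.1 satisfies; larger values of t_end apply the
# smoothing for longer and so flatten the series more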
def apply_smoothening(vals: np.ndarray) -> np.ndarray:
    '''
    Apply the time-series smoothing to all feature columns. Each row of vals
    holds len(FEAT_COLS) contiguous blocks of FEAT_LENGTH lagged returns, and
    each block is smoothed independently.
    '''
    for row in range(vals.shape[0]):
        for idx in range(0, FEAT_LENGTH*len(FEAT_COLS), FEAT_LENGTH):
            vals[row, idx:idx+FEAT_LENGTH] = explicit_heat_smooth(
                vals = vals[row, idx:idx+FEAT_LENGTH],
                t_end = 5,
            )
    return vals
def get_nn_data() -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    '''
    For all tickers, deduce the NN features and classifications, and then
    return the outputs as four numpy arrays (x_train, y_train, x_test, y_test)
    '''
    dfs = []
    for ticker in TICKERS:
        df = yf.download(ticker).reset_index()
        df = get_lagged_returns(df)
        df = get_classification(df)
        # We may end up with some divisions by 0 when calculating the returns,
        # so to prevent any rows with this slipping in, we replace any infs
        # with nan values and remove all rows with nan values in them
        dfs.append(
            df
            .replace([np.inf, -np.inf], np.nan)
            .dropna()
            [[col for col in df.columns if 'feat_' in col] + ['classification']]
        )
    nn_values = pd.concat(dfs).values
    # Shuffle the rows (features and labels together) before splitting, so
    # that the NN does not learn anything from the order of the samples
    np.random.shuffle(nn_values)
    x_values = apply_smoothening(nn_values[:, :-1])
    y_values = nn_values[:, -1]
    # Split into training and test data
    split_idx = int(TRAIN_SPLIT*nn_values.shape[0])
    return (
        x_values[:split_idx, :], # x_train
        y_values[:split_idx], # y_train
        x_values[split_idx:, :], # x_test
        y_values[split_idx:], # y_test
    )
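
# Note: shuffling before the split means test samples can sit right next to
# training samples in time, so the test accuracy is an optimistic estimate;
# a chronological train/test split would give a more realistic evaluation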
def get_model(x_train: np.ndarray) -> Sequential:
    '''
    Generate the NN model that we are going to train
    '''
    # Without an explicit activation the Dense layers are linear, and a stack
    # of linear layers collapses to a single linear map - so use relu on the
    # hidden layers
    return Sequential([
        Dense(128, activation = 'relu', input_shape = (x_train.shape[1], )),
        Dense(64, activation = 'relu'),
        Dense(64, activation = 'relu'),
        Dense(1, activation = 'sigmoid'),
    ])
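
# The single sigmoid output is the model's probability that tomorrow's close
# is higher, which pairs with the binary_crossentropy loss compiled below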
def evaluate_training(
        model: Sequential,
        x_test: np.ndarray,
        y_test: np.ndarray,
        ):
    '''
    Produce confusion matrices to evaluate the training on the testing data.
    '''
    score = model.evaluate(
        x_test,
        y_test,
        verbose = 0,
    )
    print("Test loss:", score[0])
    print("Test accuracy:", score[1])
    # Threshold the sigmoid outputs and flatten to a 1d array of 0/1 labels
    pred = (model.predict(x_test) >= 0.5).astype(int).flatten()
    cm = confusion_matrix(
        y_true = y_test,
        y_pred = pred,
    )
    # The scaled confusion matrix gives a view where each column is scaled
    # by the total sum of elements in that column
    cm_scaled = cm/cm.sum(axis = 0)
    unscaled = ConfusionMatrixDisplay(confusion_matrix = cm)
    unscaled.plot()
    unscaled.ax_.set_title('Unscaled confusion matrix')
    scaled = ConfusionMatrixDisplay(confusion_matrix = cm_scaled)
    scaled.plot()
    scaled.ax_.set_title('Scaled confusion matrix')
    plt.show()
    return
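
# With sklearn's convention (rows = true labels, columns = predictions), the
# diagonal of the column-scaled matrix is the precision of each predicted class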
if __name__ == '__main__':
x_train, y_train, x_test, y_test = get_nn_data()
model = get_model(x_train)
model.compile(
loss = 'binary_crossentropy',
optimizer = Adam(learning_rate = LEARN_RATE),
metrics = ['accuracy']
)
model.fit(
x_train,
y_train,
epochs = EPOCHS,
batch_size = BATCH_SIZE,
validation_split = VALIDATION,
)
evaluate_training(model, x_test, y_test)