'''
A simple neural network attempt at price prediction - it does not do a good
job at all, and therefore SHOULD NOT BE USED FOR ANY REAL TRADING/INVESTING
'''
import numba as nb
import numpy as np
import pandas as pd
import yfinance as yf
from typing import Tuple
# NN imports
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
# Imports for evaluating the network
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Global variables
EPOCHS = 10
BATCH_SIZE = 8
VALIDATION = 0.1
LEARN_RATE = 1e-3
TRAIN_SPLIT = 0.8
FEAT_LENGTH = 25
FEAT_COLS = ['Open', 'Low', 'High', 'Close', 'Volume']
TICKERS = ['SPY', 'TSLA', 'AAPL', 'GOOG', 'AMZN']
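# Each NN input sample consists of FEAT_LENGTH lagged returns for each of the
# FEAT_COLS, so the input layer sees FEAT_LENGTH*len(FEAT_COLS) = 125 values
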
def time_series(
        df: pd.DataFrame,
        col: pd.Series,
        name: str,
        ) -> pd.DataFrame:
    '''
    Form the lagged columns for this feature
    '''
    return df.assign(**{
        f'{name}_t-{lag}': col.shift(lag)
        for lag in range(0, FEAT_LENGTH)
    })
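
# As an example, with name = 'feat_Close_ret' and FEAT_LENGTH = 3, time_series
# would add the columns 'feat_Close_ret_t-0', 'feat_Close_ret_t-1' and
# 'feat_Close_ret_t-2', holding today's return and the two before it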
def get_lagged_returns(df: pd.DataFrame) -> pd.DataFrame:
'''
For each of the feature cols, find the returns and then form the lagged
time-series as new columns
'''
for col in FEAT_COLS:
return_col = df[col]/df[col].shift(1)-1
df = time_series(df, return_col, f'feat_{col}_ret')
return df
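
# The shift in the return calculation leaves a NaN in the first row of each
# ticker, and the lags then spread NaNs through the first FEAT_LENGTH rows;
# all of these rows are removed by the dropna in get_nn_data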
def get_classification(df: pd.DataFrame) -> pd.DataFrame:
'''
Get the classifications for the NN, which are as follows:
0 = The closing price goes down tomorrow
1 = The closing price goes up tomorrow
'''
df['close_tomorrow'] = df['Close'].shift(-1)
conditions = [
df['close_tomorrow'] <= df['Close'],
df['close_tomorrow'] > df['Close'],
]
df['classification'] = np.select(
condlist = conditions,
choicelist = [0, 1],
)
return df
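
# The last row of each ticker has no tomorrow's close, so its close_tomorrow
# is NaN and the row is also removed by the dropna in get_nn_data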
@nb.jit(nopython = True)
def explicit_heat_smooth(vals: np.ndarray, t_end: float) -> np.ndarray:
    '''
    Smooth a time-series by evolving it under the 1d heat equation, using an
    explicit finite-difference scheme with the two end points held fixed
    '''
    k = 0.1 # Time spacing
    # Set up the initial condition
    P = vals
    t = 0
    while t < t_end:
        # Update the interior points: P_i <- P_i + k*(P_{i+1} - 2*P_i + P_{i-1})
        P = k*(P[2:] + P[:-2]) + P[1:-1]*(1-2*k)
        # Pin the original boundary values back onto the smoothed interior
        P = np.hstack((
            np.array([vals[0]]),
            P,
            np.array([vals[-1]]),
        ))
        t += k
    return P
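
# The explicit scheme above is numerically stable provided k <= 0.5 (with unit
# grid spacing), which k = 0.1 satisfies; larger values of t_end apply the
# smoothing for longer and so flatten the series more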
def apply_smoothening(vals: np.ndarray) -> np.ndarray:
    '''
    Apply the time-series smoothing to all feature columns. Each row of vals
    holds len(FEAT_COLS) contiguous blocks of FEAT_LENGTH lagged returns, and
    each block is smoothed independently.
    '''
    for row in range(vals.shape[0]):
        for idx in range(0, FEAT_LENGTH*len(FEAT_COLS), FEAT_LENGTH):
            vals[row, idx:idx+FEAT_LENGTH] = explicit_heat_smooth(
                vals = vals[row, idx:idx+FEAT_LENGTH],
                t_end = 5,
            )
    return vals
def get_nn_data() -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    '''
    For all tickers, deduce the NN features and classifications, and then
    return the outputs as four numpy arrays (x_train, y_train, x_test, y_test)
    '''
    dfs = []
    for ticker in TICKERS:
        df = yf.download(ticker).reset_index()
        df = get_lagged_returns(df)
        df = get_classification(df)
        # We may end up with some divisions by 0 when calculating the returns,
        # so to prevent any rows with this slipping in, we replace any infs
        # with nan values and remove all rows with nan values in them
        dfs.append(
            df
            .replace([np.inf, -np.inf], np.nan)
            .dropna()
            [[col for col in df.columns if 'feat_' in col] + ['classification']]
        )
    nn_values = pd.concat(dfs).values
    # Shuffle the rows (features and labels together) before splitting, so
    # that the NN does not learn anything from the order of the samples
    np.random.shuffle(nn_values)
    x_values = apply_smoothening(nn_values[:, :-1])
    y_values = nn_values[:, -1]
    # Split into training and test data
    split_idx = int(TRAIN_SPLIT*nn_values.shape[0])
    return (
        x_values[:split_idx, :], # x_train
        y_values[:split_idx], # y_train
        x_values[split_idx:, :], # x_test
        y_values[split_idx:], # y_test
    )
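
# Note: shuffling before the split means test samples can sit right next to
# training samples in time, so the test accuracy is an optimistic estimate;
# a chronological train/test split would give a more realistic evaluation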
def get_model(x_train: np.ndarray) -> Sequential:
    '''
    Generate the NN model that we are going to train
    '''
    # Without an explicit activation the Dense layers are linear, and a stack
    # of linear layers collapses to a single linear map - so use relu on the
    # hidden layers
    return Sequential([
        Dense(128, activation = 'relu', input_shape = (x_train.shape[1], )),
        Dense(64, activation = 'relu'),
        Dense(64, activation = 'relu'),
        Dense(1, activation = 'sigmoid'),
    ])
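
# The single sigmoid output is the model's probability that tomorrow's close
# is higher, which pairs with the binary_crossentropy loss compiled below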
def evaluate_training(
        model: Sequential,
        x_test: np.ndarray,
        y_test: np.ndarray,
        ):
    '''
    Produce confusion matrices to evaluate the training on the testing data.
    '''
    score = model.evaluate(
        x_test,
        y_test,
        verbose = 0,
    )
    print("Test loss:", score[0])
    print("Test accuracy:", score[1])
    # Threshold the sigmoid outputs and flatten to a 1d array of 0/1 labels
    pred = (model.predict(x_test) >= 0.5).astype(int).flatten()
    cm = confusion_matrix(
        y_true = y_test,
        y_pred = pred,
    )
    # The scaled confusion matrix gives a view where each column is scaled
    # by the total sum of elements in that column
    cm_scaled = cm/cm.sum(axis = 0)
    unscaled = ConfusionMatrixDisplay(confusion_matrix = cm)
    unscaled.plot()
    unscaled.ax_.set_title('Unscaled confusion matrix')
    scaled = ConfusionMatrixDisplay(confusion_matrix = cm_scaled)
    scaled.plot()
    scaled.ax_.set_title('Scaled confusion matrix')
    plt.show()
    return
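
# With sklearn's convention (rows = true labels, columns = predictions), the
# diagonal of the column-scaled matrix is the precision of each predicted class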
if __name__ == '__main__':
x_train, y_train, x_test, y_test = get_nn_data()
model = get_model(x_train)
model.compile(
loss = 'binary_crossentropy',
optimizer = Adam(learning_rate = LEARN_RATE),
metrics = ['accuracy']
)
model.fit(
x_train,
y_train,
epochs = EPOCHS,
batch_size = BATCH_SIZE,
validation_split = VALIDATION,
)
evaluate_training(model, x_test, y_test)