Created
April 19, 2023 13:08
-
-
Save riqbal-k/f6cb4c53425e4b0a97369503fe893738 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "f4ed2ffd", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/tmp/ipykernel_2472429/1036767244.py:10: FutureWarning: The pandas.datetime class is deprecated and will be removed from pandas in a future version. Import from datetime module instead.\n", | |
" from pandas import datetime\n" | |
] | |
} | |
], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import matplotlib.pyplot as plt\n", | |
"import datetime as dt\n", | |
"import alpha_vantage as av\n", | |
"import statsmodels\n", | |
"from pytrends.request import TrendReq\n", | |
"from datetime import datetime\n", | |
"from alpha_vantage.timeseries import TimeSeries\n", | |
"from pandas import datetime\n", | |
"import math, time\n", | |
"import itertools\n", | |
"import datetime\n", | |
"from operator import itemgetter\n", | |
"from sklearn.metrics import mean_squared_error\n", | |
"from sklearn.preprocessing import MinMaxScaler\n", | |
"from math import sqrt\n", | |
"import torch\n", | |
"import os" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "8f2bca13", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.model_selection import cross_val_score\n", | |
"from sklearn.model_selection import RepeatedKFold\n", | |
"from sklearn.linear_model import Lasso\n", | |
"from sklearn.preprocessing import StandardScaler\n", | |
"import statsmodels.api as sm\n", | |
"from statsmodels.api import OLS\n", | |
"from sklearn.linear_model import LinearRegression\n", | |
"from sklearn.linear_model import LassoCV\n", | |
"from sklearn.metrics import r2_score" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "7a285b32", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'/home/mnf13'" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"os.getcwd()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "d42bc0a4", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Importing the dataset\n", | |
"data = pd.read_csv('ftse100.csv')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "f5a7265a", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>#RIC</th>\n", | |
" <th>Domain</th>\n", | |
" <th>Date-Time</th>\n", | |
" <th>Type</th>\n", | |
" <th>Last</th>\n", | |
" <th>Volume</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-03-14T15:37:00.000000000Z</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>299.5</td>\n", | |
" <td>25000.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-03-20T14:51:00.000000000Z</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>20.0</td>\n", | |
" <td>100000.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-05-14T13:49:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>30.0</td>\n", | |
" <td>6000.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-10-16T16:04:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>25.0</td>\n", | |
" <td>50000.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-10-23T14:59:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>30.0</td>\n", | |
" <td>23000.0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" #RIC Domain Date-Time Type \\\n", | |
"0 AAF.L Market Price 1996-03-14T15:37:00.000000000Z Intraday 1Min \n", | |
"1 AAF.L Market Price 1996-03-20T14:51:00.000000000Z Intraday 1Min \n", | |
"2 AAF.L Market Price 1996-05-14T13:49:00.000000000+01 Intraday 1Min \n", | |
"3 AAF.L Market Price 1996-10-16T16:04:00.000000000+01 Intraday 1Min \n", | |
"4 AAF.L Market Price 1996-10-23T14:59:00.000000000+01 Intraday 1Min \n", | |
"\n", | |
" Last Volume \n", | |
"0 299.5 25000.0 \n", | |
"1 20.0 100000.0 \n", | |
"2 30.0 6000.0 \n", | |
"3 25.0 50000.0 \n", | |
"4 30.0 23000.0 " | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"data.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "56bf44be", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>#RIC</th>\n", | |
" <th>Domain</th>\n", | |
" <th>Date-Time</th>\n", | |
" <th>Type</th>\n", | |
" <th>Last</th>\n", | |
" <th>Volume</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>1048570</th>\n", | |
" <td>AAL.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>2009-04-06T14:21:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>1332.0</td>\n", | |
" <td>16166.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1048571</th>\n", | |
" <td>AAL.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>2009-04-06T14:22:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>1330.0</td>\n", | |
" <td>19021.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1048572</th>\n", | |
" <td>AAL.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>2009-04-06T14:23:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>1332.0</td>\n", | |
" <td>45759.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1048573</th>\n", | |
" <td>AAL.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>2009-04-06T14:24:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>1331.0</td>\n", | |
" <td>13481.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1048574</th>\n", | |
" <td>AAL.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>2009-04-06T14:25:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>1333.0</td>\n", | |
" <td>16301.0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" #RIC Domain Date-Time Type \\\n", | |
"1048570 AAL.L Market Price 2009-04-06T14:21:00.000000000+01 Intraday 1Min \n", | |
"1048571 AAL.L Market Price 2009-04-06T14:22:00.000000000+01 Intraday 1Min \n", | |
"1048572 AAL.L Market Price 2009-04-06T14:23:00.000000000+01 Intraday 1Min \n", | |
"1048573 AAL.L Market Price 2009-04-06T14:24:00.000000000+01 Intraday 1Min \n", | |
"1048574 AAL.L Market Price 2009-04-06T14:25:00.000000000+01 Intraday 1Min \n", | |
"\n", | |
" Last Volume \n", | |
"1048570 1332.0 16166.0 \n", | |
"1048571 1330.0 19021.0 \n", | |
"1048572 1332.0 45759.0 \n", | |
"1048573 1331.0 13481.0 \n", | |
"1048574 1333.0 16301.0 " | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"data.tail()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "f510c6fc", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Calculate the daily returns\n", | |
"data['cobit_returns'] = data['Last'].pct_change()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "78e55916", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>#RIC</th>\n", | |
" <th>Domain</th>\n", | |
" <th>Date-Time</th>\n", | |
" <th>Type</th>\n", | |
" <th>Last</th>\n", | |
" <th>Volume</th>\n", | |
" <th>cobit_returns</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-03-14T15:37:00.000000000Z</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>299.5</td>\n", | |
" <td>25000.0</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-03-20T14:51:00.000000000Z</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>20.0</td>\n", | |
" <td>100000.0</td>\n", | |
" <td>-0.933222</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-05-14T13:49:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>30.0</td>\n", | |
" <td>6000.0</td>\n", | |
" <td>0.500000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-10-16T16:04:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>25.0</td>\n", | |
" <td>50000.0</td>\n", | |
" <td>-0.166667</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-10-23T14:59:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>30.0</td>\n", | |
" <td>23000.0</td>\n", | |
" <td>0.200000</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" #RIC Domain Date-Time Type \\\n", | |
"0 AAF.L Market Price 1996-03-14T15:37:00.000000000Z Intraday 1Min \n", | |
"1 AAF.L Market Price 1996-03-20T14:51:00.000000000Z Intraday 1Min \n", | |
"2 AAF.L Market Price 1996-05-14T13:49:00.000000000+01 Intraday 1Min \n", | |
"3 AAF.L Market Price 1996-10-16T16:04:00.000000000+01 Intraday 1Min \n", | |
"4 AAF.L Market Price 1996-10-23T14:59:00.000000000+01 Intraday 1Min \n", | |
"\n", | |
" Last Volume cobit_returns \n", | |
"0 299.5 25000.0 NaN \n", | |
"1 20.0 100000.0 -0.933222 \n", | |
"2 30.0 6000.0 0.500000 \n", | |
"3 25.0 50000.0 -0.166667 \n", | |
"4 30.0 23000.0 0.200000 " | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"data.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "3fa2604f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"data['cobit_lag1'] = data['cobit_returns'].shift(1)\n", | |
"data['cobit_lag2'] = data['cobit_returns'].shift(2)\n", | |
"data['cobit_lag3'] = data['cobit_returns'].shift(3)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "d8662ebd", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"data = data[[\"cobit_returns\",\"cobit_lag1\",\"cobit_lag2\",\"cobit_lag3\"]]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "66825e77", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"cobit_returns 1\n", | |
"cobit_lag1 2\n", | |
"cobit_lag2 3\n", | |
"cobit_lag3 4\n", | |
"dtype: int64" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Check for invalid values\n", | |
"data.isnull().sum()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "fd77cda7", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"data = data.fillna(method='ffill').fillna(method='bfill')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "08398678", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"cobit_returns 0\n", | |
"cobit_lag1 0\n", | |
"cobit_lag2 0\n", | |
"cobit_lag3 0\n", | |
"dtype: int64\n" | |
] | |
} | |
], | |
"source": [ | |
"# Check for invalid values\n", | |
"print(np.isnan(data).sum())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "189136b6", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"786431" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"##Split data into train and test\n", | |
"# Total dataset length\n", | |
"dataset_length = data.shape[0]\n", | |
"# Training dataset length\n", | |
"split = int(dataset_length * 0.75)\n", | |
"split" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "f380a52f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(786431, 4)\n", | |
"(262144, 4)\n" | |
] | |
} | |
], | |
"source": [ | |
"# Splitiing the X and y into train and test datasets\n", | |
"train = data[:split]\n", | |
"test = data[split:]\n", | |
"# Print the size of the train and test dataset\n", | |
"print(train.shape)\n", | |
"print(test.shape)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"id": "22d95e15", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#create x and y variables for both test and train data\n", | |
"x_train = train.drop('cobit_returns', axis=1)\n", | |
"y_train = train['cobit_returns']\n", | |
"x_test = test.drop('cobit_returns', axis=1)\n", | |
"y_test = test['cobit_returns']" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "b509bc72", | |
"metadata": {}, | |
"source": [ | |
"# Lasso-Lasso-Out of smapkle predictions" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"id": "dac9974d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Lasso on train data\n", | |
"alpha = 0.1\n", | |
"lasso = Lasso(alpha=alpha)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"id": "44305077", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0.6847539398318078\n", | |
"0.007022121544042248\n" | |
] | |
} | |
], | |
"source": [ | |
"#prediction on test data\n", | |
"#check r2 on train data\n", | |
"model_lasso = Lasso(alpha=0.01)\n", | |
"\n", | |
"model_lasso.fit(x_test, y_test) \n", | |
"\n", | |
"pred_test_lasso= model_lasso.predict(x_test)\n", | |
"\n", | |
"print(np.sqrt(mean_squared_error(y_test,pred_test_lasso)))\n", | |
"\n", | |
"print(r2_score(y_test, pred_test_lasso))\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "c61c1b32", | |
"metadata": {}, | |
"source": [ | |
"# Benchmark-Lasso-Out of smaple predictions" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"id": "6a9cb29c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Define the time window and lag parameters\n", | |
"est_window = 30 # minutes\n", | |
"pred_window = 3 # minutes\n", | |
"lags = 3 # number of lags to include in OLS regression" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"id": "b59937a5", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Create the OLS predictors matrix for the training data\n", | |
"ols_predictors = []\n", | |
"for i in range(lags):\n", | |
" ols_predictors.append(train['cobit_returns'].shift(i+1))\n", | |
"ols_predictors = pd.concat(ols_predictors, axis=1)\n", | |
"ols_predictors = ols_predictors.dropna()\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 45, | |
"id": "0d6867b4", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>cobit_returns</th>\n", | |
" <th>cobit_returns</th>\n", | |
" <th>cobit_returns</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>0.500000</td>\n", | |
" <td>-0.933222</td>\n", | |
" <td>-0.933222</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>-0.166667</td>\n", | |
" <td>0.500000</td>\n", | |
" <td>-0.933222</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>0.200000</td>\n", | |
" <td>-0.166667</td>\n", | |
" <td>0.500000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>-0.100000</td>\n", | |
" <td>0.200000</td>\n", | |
" <td>-0.166667</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>0.018519</td>\n", | |
" <td>-0.100000</td>\n", | |
" <td>0.200000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>786426</th>\n", | |
" <td>0.002088</td>\n", | |
" <td>-0.001584</td>\n", | |
" <td>0.000334</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>786427</th>\n", | |
" <td>-0.000417</td>\n", | |
" <td>0.002088</td>\n", | |
" <td>-0.001584</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>786428</th>\n", | |
" <td>-0.000417</td>\n", | |
" <td>-0.000417</td>\n", | |
" <td>0.002088</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>786429</th>\n", | |
" <td>-0.000417</td>\n", | |
" <td>-0.000417</td>\n", | |
" <td>-0.000417</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>786430</th>\n", | |
" <td>-0.002503</td>\n", | |
" <td>-0.000417</td>\n", | |
" <td>-0.000417</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>786428 rows × 3 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" cobit_returns cobit_returns cobit_returns\n", | |
"3 0.500000 -0.933222 -0.933222\n", | |
"4 -0.166667 0.500000 -0.933222\n", | |
"5 0.200000 -0.166667 0.500000\n", | |
"6 -0.100000 0.200000 -0.166667\n", | |
"7 0.018519 -0.100000 0.200000\n", | |
"... ... ... ...\n", | |
"786426 0.002088 -0.001584 0.000334\n", | |
"786427 -0.000417 0.002088 -0.001584\n", | |
"786428 -0.000417 -0.000417 0.002088\n", | |
"786429 -0.000417 -0.000417 -0.000417\n", | |
"786430 -0.002503 -0.000417 -0.000417\n", | |
"\n", | |
"[786428 rows x 3 columns]" | |
] | |
}, | |
"execution_count": 45, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ols_predictors" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 70, | |
"id": "71dfdbe5", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Create the OLS response vector for the training data\n", | |
"ols_response = train['cobit_returns'].shift(-3)\n", | |
"ols_response = ols_response.dropna()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 71, | |
"id": "5e7c471b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"LinearRegression()" | |
] | |
}, | |
"execution_count": 71, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Fit the OLS regression model\n", | |
"ols_model = LinearRegression()\n", | |
"ols_model.fit(ols_predictors, ols_response)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 72, | |
"id": "5f62ed0c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Create the LASSO predictors matrix for the prediction data\n", | |
"lasso_predictors = []\n", | |
"for i in range(lags):\n", | |
" lasso_predictors.append(test['cobit_returns'].shift(i+1))\n", | |
"lasso_predictors = pd.concat(lasso_predictors, axis=1)\n", | |
"lasso_predictors = lasso_predictors.dropna()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 52, | |
"id": "6e076fce", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>cobit_returns</th>\n", | |
" <th>cobit_returns</th>\n", | |
" <th>cobit_returns</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>786434</th>\n", | |
" <td>-0.000418</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000836</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>786435</th>\n", | |
" <td>0.000000</td>\n", | |
" <td>-0.000418</td>\n", | |
" <td>0.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>786436</th>\n", | |
" <td>0.000318</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>-0.000418</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>786437</th>\n", | |
" <td>0.000936</td>\n", | |
" <td>0.000318</td>\n", | |
" <td>0.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>786438</th>\n", | |
" <td>-0.001253</td>\n", | |
" <td>0.000936</td>\n", | |
" <td>0.000318</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1048570</th>\n", | |
" <td>-0.004478</td>\n", | |
" <td>-0.002976</td>\n", | |
" <td>0.005988</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1048571</th>\n", | |
" <td>-0.001499</td>\n", | |
" <td>-0.004478</td>\n", | |
" <td>-0.002976</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1048572</th>\n", | |
" <td>-0.001502</td>\n", | |
" <td>-0.001499</td>\n", | |
" <td>-0.004478</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1048573</th>\n", | |
" <td>0.001504</td>\n", | |
" <td>-0.001502</td>\n", | |
" <td>-0.001499</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1048574</th>\n", | |
" <td>-0.000751</td>\n", | |
" <td>0.001504</td>\n", | |
" <td>-0.001502</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>262141 rows × 3 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" cobit_returns cobit_returns cobit_returns\n", | |
"786434 -0.000418 0.000000 0.000836\n", | |
"786435 0.000000 -0.000418 0.000000\n", | |
"786436 0.000318 0.000000 -0.000418\n", | |
"786437 0.000936 0.000318 0.000000\n", | |
"786438 -0.001253 0.000936 0.000318\n", | |
"... ... ... ...\n", | |
"1048570 -0.004478 -0.002976 0.005988\n", | |
"1048571 -0.001499 -0.004478 -0.002976\n", | |
"1048572 -0.001502 -0.001499 -0.004478\n", | |
"1048573 0.001504 -0.001502 -0.001499\n", | |
"1048574 -0.000751 0.001504 -0.001502\n", | |
"\n", | |
"[262141 rows x 3 columns]" | |
] | |
}, | |
"execution_count": 52, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"lasso_predictors" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 73, | |
"id": "5ba67670", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Apply the OLS coefficients to the LASSO predictors matrix\n", | |
"lasso_predictors = lasso_predictors.mul(ols_model.coef_, axis=1).sum(axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 54, | |
"id": "2121007f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"786434 -1.460073e-09\n", | |
"786435 1.728927e-09\n", | |
"786436 2.079490e-10\n", | |
"786437 -5.795036e-09\n", | |
"786438 8.075894e-10\n", | |
" ... \n", | |
"1048570 8.973827e-09\n", | |
"1048571 3.801197e-08\n", | |
"1048572 3.191100e-08\n", | |
"1048573 5.216834e-09\n", | |
"1048574 3.582629e-09\n", | |
"Length: 262141, dtype: float64" | |
] | |
}, | |
"execution_count": 54, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"lasso_predictors" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 55, | |
"id": "8b33b0cf", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Normalize the LASSO predictors to have mean zero and unit variance\n", | |
"lasso_predictors = (lasso_predictors - lasso_predictors.mean()) / lasso_predictors.std()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 59, | |
"id": "5eeb2c90", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Create the LASSO response vector for the prediction data\n", | |
"lasso_response = test['cobit_returns'].shift(-3)\n", | |
"lasso_response = lasso_response.dropna()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 57, | |
"id": "f69cf6de", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"786431 0.000000\n", | |
"786432 -0.000418\n", | |
"786433 0.000000\n", | |
"786434 0.000318\n", | |
"786435 0.000936\n", | |
" ... \n", | |
"1048569 -0.001499\n", | |
"1048570 -0.001502\n", | |
"1048571 0.001504\n", | |
"1048572 -0.000751\n", | |
"1048573 0.001503\n", | |
"Name: cobit_returns, Length: 262143, dtype: float64" | |
] | |
}, | |
"execution_count": 57, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"lasso_response" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 60, | |
"id": "2f1db3b1", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"LassoCV(cv=5)" | |
] | |
}, | |
"execution_count": 60, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Fit the LASSO regression model\n", | |
"lasso_model = LassoCV(cv=5)\n", | |
"lasso_model.fit(lasso_predictors.values.reshape(-1, 1), lasso_response)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 74, | |
"id": "0729020d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Make the 1-minute-ahead return forecast for each stock\n", | |
"forecasts = []\n", | |
"for i in range(lags, len(test)):\n", | |
" lasso_predictor = test.iloc[i-lags:i]['cobit_returns'].mul(ols_model.coef_).sum()\n", | |
" lasso_predictor = (lasso_predictor - lasso_predictors.mean()) / lasso_predictors.std()\n", | |
" forecast = ols_model.intercept_ + lasso_model.predict([[lasso_predictor]])\n", | |
" forecasts.append(forecast[0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 76, | |
"id": "043df139", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Out-of-sample R-squared: -86.9036\n" | |
] | |
} | |
], | |
"source": [ | |
"# Evaluate the out-of-sample performance\n", | |
"y_true = test['cobit_returns'].shift(-3).dropna()\n", | |
"y_pred = pd.Series(forecasts)\n", | |
"r2 = r2_score(y_true, y_pred)\n", | |
"print(f\"Out-of-sample R-squared: {r2:.4f}\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "997c70ca", | |
"metadata": {}, | |
"source": [ | |
"# Autoregressive Models" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 87, | |
"id": "43638853", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import statsmodels.api as sm\n", | |
"from sklearn.metrics import r2_score, mean_absolute_error" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 88, | |
"id": "74d6056e", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Importing the dataset\n", | |
"data = pd.read_csv('ftse100.csv')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 89, | |
"id": "49ad1cde", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>#RIC</th>\n", | |
" <th>Domain</th>\n", | |
" <th>Date-Time</th>\n", | |
" <th>Type</th>\n", | |
" <th>Last</th>\n", | |
" <th>Volume</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-03-14T15:37:00.000000000Z</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>299.5</td>\n", | |
" <td>25000.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-03-20T14:51:00.000000000Z</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>20.0</td>\n", | |
" <td>100000.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-05-14T13:49:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>30.0</td>\n", | |
" <td>6000.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-10-16T16:04:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>25.0</td>\n", | |
" <td>50000.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-10-23T14:59:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>30.0</td>\n", | |
" <td>23000.0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" #RIC Domain Date-Time Type \\\n", | |
"0 AAF.L Market Price 1996-03-14T15:37:00.000000000Z Intraday 1Min \n", | |
"1 AAF.L Market Price 1996-03-20T14:51:00.000000000Z Intraday 1Min \n", | |
"2 AAF.L Market Price 1996-05-14T13:49:00.000000000+01 Intraday 1Min \n", | |
"3 AAF.L Market Price 1996-10-16T16:04:00.000000000+01 Intraday 1Min \n", | |
"4 AAF.L Market Price 1996-10-23T14:59:00.000000000+01 Intraday 1Min \n", | |
"\n", | |
" Last Volume \n", | |
"0 299.5 25000.0 \n", | |
"1 20.0 100000.0 \n", | |
"2 30.0 6000.0 \n", | |
"3 25.0 50000.0 \n", | |
"4 30.0 23000.0 " | |
] | |
}, | |
"execution_count": 89, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"data.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 91, | |
"id": "a4a8b2ff", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Calculate the cobit daily returns\n", | |
"data['returns'] = data['Last'].pct_change()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 92, | |
"id": "bdc4e75e", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# create lagged return variables\n", | |
"data['lag_1'] = data['returns'].shift(1)\n", | |
"data['lag_2'] = data['returns'].shift(2)\n", | |
"data['lag_3'] = data['returns'].shift(3)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 93, | |
"id": "9bbcc4c0", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#fill in missing values\n", | |
"data = data.fillna(method='ffill').fillna(method='bfill')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 95, | |
"id": "e6652999", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(786431, 10)\n", | |
"(262144, 10)\n" | |
] | |
} | |
], | |
"source": [ | |
"# Splitiing the X and y into train and test datasets\n", | |
"train = data[:split]\n", | |
"test = data[split:]\n", | |
"# Print the size of the train and test dataset\n", | |
"print(train.shape)\n", | |
"print(test.shape)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 96, | |
"id": "39231b16", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# fit autoregressive model to training data\n", | |
"model = sm.OLS(train['returns'], train[['lag_1', 'lag_2', 'lag_3']])\n", | |
"results = model.fit()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 97, | |
"id": "86af4e2d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# make predictions on test data\n", | |
"predictions = results.predict(test_data[['lag_1', 'lag_2', 'lag_3']])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 98, | |
"id": "cbd014f4", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# calculate R-squared and MAE\n", | |
"r2 = r2_score(test_data['returns'], predictions)\n", | |
"mae = mean_absolute_error(test_data['returns'], predictions)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 99, | |
"id": "dc4edc09", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"R-squared: -0.00981647420755305\n", | |
"MAE: 0.0014406552381127644\n" | |
] | |
} | |
], | |
"source": [ | |
"print(\"R-squared:\", r2)\n", | |
"print(\"MAE:\", mae)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "18ee7074", | |
"metadata": {}, | |
"source": [ | |
"# AR(1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 101, | |
"id": "07958e49", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"R-squared: -0.00981647414726261\n", | |
"MAE: 0.001440655238121796\n" | |
] | |
} | |
], | |
"source": [ | |
"# fit autoregressive model to training data\n", | |
"model = sm.OLS(train['returns'], train[['lag_1']])\n", | |
"results = model.fit()\n", | |
"# make predictions on test data\n", | |
"predictions = results.predict(test_data[['lag_1']])\n", | |
"# calculate R-squared and MAE\n", | |
"r2 = r2_score(test_data['returns'], predictions)\n", | |
"mae = mean_absolute_error(test_data['returns'], predictions)\n", | |
"print(\"R-squared:\", r2)\n", | |
"print(\"MAE:\", mae)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "7e3e9e00", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.7" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment