Created
April 19, 2023 12:45
-
-
Save riqbal-k/59f32a8482e1b2739ba479dc1e051ff6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 142, | |
"id": "1f0641fc", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/tmp/ipykernel_2403777/1036767244.py:10: FutureWarning: The pandas.datetime class is deprecated and will be removed from pandas in a future version. Import from datetime module instead.\n", | |
" from pandas import datetime\n" | |
] | |
} | |
], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import matplotlib.pyplot as plt\n", | |
"import datetime as dt\n", | |
"import alpha_vantage as av\n", | |
"import statsmodels\n", | |
"from pytrends.request import TrendReq\n", | |
"from datetime import datetime\n", | |
"from alpha_vantage.timeseries import TimeSeries\n", | |
"from pandas import datetime\n", | |
"import math, time\n", | |
"import itertools\n", | |
"import datetime\n", | |
"from operator import itemgetter\n", | |
"from sklearn.metrics import mean_squared_error\n", | |
"from sklearn.preprocessing import MinMaxScaler\n", | |
"from math import sqrt\n", | |
"import torch\n", | |
"import os" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "2fb69b17", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Import the os module\n", | |
"import os" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "3a886f2b", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.model_selection import cross_val_score\n", | |
"from sklearn.model_selection import RepeatedKFold\n", | |
"from sklearn.linear_model import Lasso" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "ececf4bc", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.preprocessing import StandardScaler" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"id": "0c585520", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import statsmodels.api as sm" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "6218f844", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'/home/mnf13'" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Show the current working directory; relative paths used later resolve against it\n", | |
"os.getcwd()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 159, | |
"id": "f6043974", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Importing the dataset\n", | |
"data = pd.read_csv('ftse100.csv')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 72, | |
"id": "936c05ae", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>#RIC</th>\n", | |
" <th>Domain</th>\n", | |
" <th>Date-Time</th>\n", | |
" <th>Type</th>\n", | |
" <th>Last</th>\n", | |
" <th>Volume</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-03-14T15:37:00.000000000Z</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>299.5</td>\n", | |
" <td>25000.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-03-20T14:51:00.000000000Z</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>20.0</td>\n", | |
" <td>100000.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-05-14T13:49:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>30.0</td>\n", | |
" <td>6000.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-10-16T16:04:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>25.0</td>\n", | |
" <td>50000.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-10-23T14:59:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>30.0</td>\n", | |
" <td>23000.0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" #RIC Domain Date-Time Type \\\n", | |
"0 AAF.L Market Price 1996-03-14T15:37:00.000000000Z Intraday 1Min \n", | |
"1 AAF.L Market Price 1996-03-20T14:51:00.000000000Z Intraday 1Min \n", | |
"2 AAF.L Market Price 1996-05-14T13:49:00.000000000+01 Intraday 1Min \n", | |
"3 AAF.L Market Price 1996-10-16T16:04:00.000000000+01 Intraday 1Min \n", | |
"4 AAF.L Market Price 1996-10-23T14:59:00.000000000+01 Intraday 1Min \n", | |
"\n", | |
" Last Volume \n", | |
"0 299.5 25000.0 \n", | |
"1 20.0 100000.0 \n", | |
"2 30.0 6000.0 \n", | |
"3 25.0 50000.0 \n", | |
"4 30.0 23000.0 " | |
] | |
}, | |
"execution_count": 72, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Preview the first rows of `data`\n", | |
"data.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 164, | |
"id": "19db7407", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# convert Date-Time to datetime and set as index\n", | |
"data['Date-Time'] = pd.to_datetime(data['Date-Time'], format='%Y-%m-%d %H:%M:%S')\n", | |
"data.set_index('Date-Time', inplace=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 160, | |
"id": "83726341", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>#RIC</th>\n", | |
" <th>Domain</th>\n", | |
" <th>Date-Time</th>\n", | |
" <th>Type</th>\n", | |
" <th>Last</th>\n", | |
" <th>Volume</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-03-14T15:37:00.000000000Z</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>299.5</td>\n", | |
" <td>25000.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-03-20T14:51:00.000000000Z</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>20.0</td>\n", | |
" <td>100000.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-05-14T13:49:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>30.0</td>\n", | |
" <td>6000.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-10-16T16:04:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>25.0</td>\n", | |
" <td>50000.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-10-23T14:59:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>30.0</td>\n", | |
" <td>23000.0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" #RIC Domain Date-Time Type \\\n", | |
"0 AAF.L Market Price 1996-03-14T15:37:00.000000000Z Intraday 1Min \n", | |
"1 AAF.L Market Price 1996-03-20T14:51:00.000000000Z Intraday 1Min \n", | |
"2 AAF.L Market Price 1996-05-14T13:49:00.000000000+01 Intraday 1Min \n", | |
"3 AAF.L Market Price 1996-10-16T16:04:00.000000000+01 Intraday 1Min \n", | |
"4 AAF.L Market Price 1996-10-23T14:59:00.000000000+01 Intraday 1Min \n", | |
"\n", | |
" Last Volume \n", | |
"0 299.5 25000.0 \n", | |
"1 20.0 100000.0 \n", | |
"2 30.0 6000.0 \n", | |
"3 25.0 50000.0 \n", | |
"4 30.0 23000.0 " | |
] | |
}, | |
"execution_count": 160, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Preview the first rows of `data` again\n", | |
"data.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 161, | |
"id": "54370b6d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Calculate the daily returns\n", | |
"data['cobit_returns'] = data['Last'].pct_change()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 162, | |
"id": "25c13cc3", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>#RIC</th>\n", | |
" <th>Domain</th>\n", | |
" <th>Date-Time</th>\n", | |
" <th>Type</th>\n", | |
" <th>Last</th>\n", | |
" <th>Volume</th>\n", | |
" <th>cobit_returns</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-03-14T15:37:00.000000000Z</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>299.5</td>\n", | |
" <td>25000.0</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-03-20T14:51:00.000000000Z</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>20.0</td>\n", | |
" <td>100000.0</td>\n", | |
" <td>-0.933222</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-05-14T13:49:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>30.0</td>\n", | |
" <td>6000.0</td>\n", | |
" <td>0.500000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-10-16T16:04:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>25.0</td>\n", | |
" <td>50000.0</td>\n", | |
" <td>-0.166667</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-10-23T14:59:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>30.0</td>\n", | |
" <td>23000.0</td>\n", | |
" <td>0.200000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1048570</th>\n", | |
" <td>AAL.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>2009-04-06T14:21:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>1332.0</td>\n", | |
" <td>16166.0</td>\n", | |
" <td>-0.001499</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1048571</th>\n", | |
" <td>AAL.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>2009-04-06T14:22:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>1330.0</td>\n", | |
" <td>19021.0</td>\n", | |
" <td>-0.001502</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1048572</th>\n", | |
" <td>AAL.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>2009-04-06T14:23:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>1332.0</td>\n", | |
" <td>45759.0</td>\n", | |
" <td>0.001504</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1048573</th>\n", | |
" <td>AAL.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>2009-04-06T14:24:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>1331.0</td>\n", | |
" <td>13481.0</td>\n", | |
" <td>-0.000751</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1048574</th>\n", | |
" <td>AAL.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>2009-04-06T14:25:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>1333.0</td>\n", | |
" <td>16301.0</td>\n", | |
" <td>0.001503</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>1048575 rows × 7 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" #RIC Domain Date-Time Type \\\n", | |
"0 AAF.L Market Price 1996-03-14T15:37:00.000000000Z Intraday 1Min \n", | |
"1 AAF.L Market Price 1996-03-20T14:51:00.000000000Z Intraday 1Min \n", | |
"2 AAF.L Market Price 1996-05-14T13:49:00.000000000+01 Intraday 1Min \n", | |
"3 AAF.L Market Price 1996-10-16T16:04:00.000000000+01 Intraday 1Min \n", | |
"4 AAF.L Market Price 1996-10-23T14:59:00.000000000+01 Intraday 1Min \n", | |
"... ... ... ... ... \n", | |
"1048570 AAL.L Market Price 2009-04-06T14:21:00.000000000+01 Intraday 1Min \n", | |
"1048571 AAL.L Market Price 2009-04-06T14:22:00.000000000+01 Intraday 1Min \n", | |
"1048572 AAL.L Market Price 2009-04-06T14:23:00.000000000+01 Intraday 1Min \n", | |
"1048573 AAL.L Market Price 2009-04-06T14:24:00.000000000+01 Intraday 1Min \n", | |
"1048574 AAL.L Market Price 2009-04-06T14:25:00.000000000+01 Intraday 1Min \n", | |
"\n", | |
" Last Volume cobit_returns \n", | |
"0 299.5 25000.0 NaN \n", | |
"1 20.0 100000.0 -0.933222 \n", | |
"2 30.0 6000.0 0.500000 \n", | |
"3 25.0 50000.0 -0.166667 \n", | |
"4 30.0 23000.0 0.200000 \n", | |
"... ... ... ... \n", | |
"1048570 1332.0 16166.0 -0.001499 \n", | |
"1048571 1330.0 19021.0 -0.001502 \n", | |
"1048572 1332.0 45759.0 0.001504 \n", | |
"1048573 1331.0 13481.0 -0.000751 \n", | |
"1048574 1333.0 16301.0 0.001503 \n", | |
"\n", | |
"[1048575 rows x 7 columns]" | |
] | |
}, | |
"execution_count": 162, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Display `data` to inspect the new 'cobit_returns' column (pandas shows a truncated view)\n", | |
"data" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "bb16d1be", | |
"metadata": {}, | |
"source": [ | |
"# Linear regression for one-minute-ahead predictions" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 166, | |
"id": "b7c4b06f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Create lagged versions of the COBIT returns\n", | |
"data['cobit_lag1'] = data['cobit_returns'].shift(1)\n", | |
"data['cobit_lag2'] = data['cobit_returns'].shift(2)\n", | |
"data['cobit_lag3'] = data['cobit_returns'].shift(3)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 167, | |
"id": "d5882034", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>#RIC</th>\n", | |
" <th>Domain</th>\n", | |
" <th>Type</th>\n", | |
" <th>Last</th>\n", | |
" <th>Volume</th>\n", | |
" <th>cobit_returns</th>\n", | |
" <th>cobit_lag1</th>\n", | |
" <th>cobit_lag2</th>\n", | |
" <th>cobit_lag3</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Date-Time</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>1996-03-14 15:37:00+00:00</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>299.5</td>\n", | |
" <td>25000.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1996-03-20 14:51:00+00:00</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>20.0</td>\n", | |
" <td>100000.0</td>\n", | |
" <td>-0.933222</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1996-05-14 13:49:00+01:00</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>30.0</td>\n", | |
" <td>6000.0</td>\n", | |
" <td>0.500000</td>\n", | |
" <td>-0.933222</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1996-10-16 16:04:00+01:00</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>25.0</td>\n", | |
" <td>50000.0</td>\n", | |
" <td>-0.166667</td>\n", | |
" <td>0.500000</td>\n", | |
" <td>-0.933222</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1996-10-23 14:59:00+01:00</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>30.0</td>\n", | |
" <td>23000.0</td>\n", | |
" <td>0.200000</td>\n", | |
" <td>-0.166667</td>\n", | |
" <td>0.500000</td>\n", | |
" <td>-0.933222</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2009-04-06 14:21:00+01:00</th>\n", | |
" <td>AAL.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>1332.0</td>\n", | |
" <td>16166.0</td>\n", | |
" <td>-0.001499</td>\n", | |
" <td>-0.004478</td>\n", | |
" <td>-0.002976</td>\n", | |
" <td>0.005988</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2009-04-06 14:22:00+01:00</th>\n", | |
" <td>AAL.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>1330.0</td>\n", | |
" <td>19021.0</td>\n", | |
" <td>-0.001502</td>\n", | |
" <td>-0.001499</td>\n", | |
" <td>-0.004478</td>\n", | |
" <td>-0.002976</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2009-04-06 14:23:00+01:00</th>\n", | |
" <td>AAL.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>1332.0</td>\n", | |
" <td>45759.0</td>\n", | |
" <td>0.001504</td>\n", | |
" <td>-0.001502</td>\n", | |
" <td>-0.001499</td>\n", | |
" <td>-0.004478</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2009-04-06 14:24:00+01:00</th>\n", | |
" <td>AAL.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>1331.0</td>\n", | |
" <td>13481.0</td>\n", | |
" <td>-0.000751</td>\n", | |
" <td>0.001504</td>\n", | |
" <td>-0.001502</td>\n", | |
" <td>-0.001499</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2009-04-06 14:25:00+01:00</th>\n", | |
" <td>AAL.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>1333.0</td>\n", | |
" <td>16301.0</td>\n", | |
" <td>0.001503</td>\n", | |
" <td>-0.000751</td>\n", | |
" <td>0.001504</td>\n", | |
" <td>-0.001502</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>1048575 rows × 9 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" #RIC Domain Type Last \\\n", | |
"Date-Time \n", | |
"1996-03-14 15:37:00+00:00 AAF.L Market Price Intraday 1Min 299.5 \n", | |
"1996-03-20 14:51:00+00:00 AAF.L Market Price Intraday 1Min 20.0 \n", | |
"1996-05-14 13:49:00+01:00 AAF.L Market Price Intraday 1Min 30.0 \n", | |
"1996-10-16 16:04:00+01:00 AAF.L Market Price Intraday 1Min 25.0 \n", | |
"1996-10-23 14:59:00+01:00 AAF.L Market Price Intraday 1Min 30.0 \n", | |
"... ... ... ... ... \n", | |
"2009-04-06 14:21:00+01:00 AAL.L Market Price Intraday 1Min 1332.0 \n", | |
"2009-04-06 14:22:00+01:00 AAL.L Market Price Intraday 1Min 1330.0 \n", | |
"2009-04-06 14:23:00+01:00 AAL.L Market Price Intraday 1Min 1332.0 \n", | |
"2009-04-06 14:24:00+01:00 AAL.L Market Price Intraday 1Min 1331.0 \n", | |
"2009-04-06 14:25:00+01:00 AAL.L Market Price Intraday 1Min 1333.0 \n", | |
"\n", | |
" Volume cobit_returns cobit_lag1 cobit_lag2 \\\n", | |
"Date-Time \n", | |
"1996-03-14 15:37:00+00:00 25000.0 NaN NaN NaN \n", | |
"1996-03-20 14:51:00+00:00 100000.0 -0.933222 NaN NaN \n", | |
"1996-05-14 13:49:00+01:00 6000.0 0.500000 -0.933222 NaN \n", | |
"1996-10-16 16:04:00+01:00 50000.0 -0.166667 0.500000 -0.933222 \n", | |
"1996-10-23 14:59:00+01:00 23000.0 0.200000 -0.166667 0.500000 \n", | |
"... ... ... ... ... \n", | |
"2009-04-06 14:21:00+01:00 16166.0 -0.001499 -0.004478 -0.002976 \n", | |
"2009-04-06 14:22:00+01:00 19021.0 -0.001502 -0.001499 -0.004478 \n", | |
"2009-04-06 14:23:00+01:00 45759.0 0.001504 -0.001502 -0.001499 \n", | |
"2009-04-06 14:24:00+01:00 13481.0 -0.000751 0.001504 -0.001502 \n", | |
"2009-04-06 14:25:00+01:00 16301.0 0.001503 -0.000751 0.001504 \n", | |
"\n", | |
" cobit_lag3 \n", | |
"Date-Time \n", | |
"1996-03-14 15:37:00+00:00 NaN \n", | |
"1996-03-20 14:51:00+00:00 NaN \n", | |
"1996-05-14 13:49:00+01:00 NaN \n", | |
"1996-10-16 16:04:00+01:00 NaN \n", | |
"1996-10-23 14:59:00+01:00 -0.933222 \n", | |
"... ... \n", | |
"2009-04-06 14:21:00+01:00 0.005988 \n", | |
"2009-04-06 14:22:00+01:00 -0.002976 \n", | |
"2009-04-06 14:23:00+01:00 -0.004478 \n", | |
"2009-04-06 14:24:00+01:00 -0.001499 \n", | |
"2009-04-06 14:25:00+01:00 -0.001502 \n", | |
"\n", | |
"[1048575 rows x 9 columns]" | |
] | |
}, | |
"execution_count": 167, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Display `data` with the three lag columns added\n", | |
"data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 99, | |
"id": "dcc3863c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"data = data[[\"cobit_returns\",\"cobit_lag1\",\"cobit_lag2\",\"cobit_lag3\"]]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 108, | |
"id": "b1f44743", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>cobit_returns</th>\n", | |
" <th>cobit_lag1</th>\n", | |
" <th>cobit_lag2</th>\n", | |
" <th>cobit_lag3</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Date-Time</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>1996-03-14 15:37:00+00:00</th>\n", | |
" <td>-0.933222</td>\n", | |
" <td>-0.933222</td>\n", | |
" <td>-0.933222</td>\n", | |
" <td>-0.933222</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1996-03-20 14:51:00+00:00</th>\n", | |
" <td>-0.933222</td>\n", | |
" <td>-0.933222</td>\n", | |
" <td>-0.933222</td>\n", | |
" <td>-0.933222</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1996-05-14 13:49:00+01:00</th>\n", | |
" <td>0.500000</td>\n", | |
" <td>-0.933222</td>\n", | |
" <td>-0.933222</td>\n", | |
" <td>-0.933222</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1996-10-16 16:04:00+01:00</th>\n", | |
" <td>-0.166667</td>\n", | |
" <td>0.500000</td>\n", | |
" <td>-0.933222</td>\n", | |
" <td>-0.933222</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1996-10-23 14:59:00+01:00</th>\n", | |
" <td>0.200000</td>\n", | |
" <td>-0.166667</td>\n", | |
" <td>0.500000</td>\n", | |
" <td>-0.933222</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2009-04-06 14:21:00+01:00</th>\n", | |
" <td>-0.001499</td>\n", | |
" <td>-0.004478</td>\n", | |
" <td>-0.002976</td>\n", | |
" <td>0.005988</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2009-04-06 14:22:00+01:00</th>\n", | |
" <td>-0.001502</td>\n", | |
" <td>-0.001499</td>\n", | |
" <td>-0.004478</td>\n", | |
" <td>-0.002976</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2009-04-06 14:23:00+01:00</th>\n", | |
" <td>0.001504</td>\n", | |
" <td>-0.001502</td>\n", | |
" <td>-0.001499</td>\n", | |
" <td>-0.004478</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2009-04-06 14:24:00+01:00</th>\n", | |
" <td>-0.000751</td>\n", | |
" <td>0.001504</td>\n", | |
" <td>-0.001502</td>\n", | |
" <td>-0.001499</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2009-04-06 14:25:00+01:00</th>\n", | |
" <td>0.001503</td>\n", | |
" <td>-0.000751</td>\n", | |
" <td>0.001504</td>\n", | |
" <td>-0.001502</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>1048575 rows × 4 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" cobit_returns cobit_lag1 cobit_lag2 cobit_lag3\n", | |
"Date-Time \n", | |
"1996-03-14 15:37:00+00:00 -0.933222 -0.933222 -0.933222 -0.933222\n", | |
"1996-03-20 14:51:00+00:00 -0.933222 -0.933222 -0.933222 -0.933222\n", | |
"1996-05-14 13:49:00+01:00 0.500000 -0.933222 -0.933222 -0.933222\n", | |
"1996-10-16 16:04:00+01:00 -0.166667 0.500000 -0.933222 -0.933222\n", | |
"1996-10-23 14:59:00+01:00 0.200000 -0.166667 0.500000 -0.933222\n", | |
"... ... ... ... ...\n", | |
"2009-04-06 14:21:00+01:00 -0.001499 -0.004478 -0.002976 0.005988\n", | |
"2009-04-06 14:22:00+01:00 -0.001502 -0.001499 -0.004478 -0.002976\n", | |
"2009-04-06 14:23:00+01:00 0.001504 -0.001502 -0.001499 -0.004478\n", | |
"2009-04-06 14:24:00+01:00 -0.000751 0.001504 -0.001502 -0.001499\n", | |
"2009-04-06 14:25:00+01:00 0.001503 -0.000751 0.001504 -0.001502\n", | |
"\n", | |
"[1048575 rows x 4 columns]" | |
] | |
}, | |
"execution_count": 108, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Display the reduced frame (target plus three lag columns)\n", | |
"data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 109, | |
"id": "934a2acd", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Re-wrap `data` with the DataFrame constructor.\n", | |
"# NOTE(review): `data` already appears to be a DataFrame at this point, so this is\n", | |
"# presumably redundant - confirm and consider removing the cell.\n", | |
"data = pd.DataFrame(data)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 110, | |
"id": "3f06a894", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>cobit_returns</th>\n", | |
" <th>cobit_lag1</th>\n", | |
" <th>cobit_lag2</th>\n", | |
" <th>cobit_lag3</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Date-Time</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>1996-03-14 15:37:00+00:00</th>\n", | |
" <td>-0.933222</td>\n", | |
" <td>-0.933222</td>\n", | |
" <td>-0.933222</td>\n", | |
" <td>-0.933222</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1996-03-20 14:51:00+00:00</th>\n", | |
" <td>-0.933222</td>\n", | |
" <td>-0.933222</td>\n", | |
" <td>-0.933222</td>\n", | |
" <td>-0.933222</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1996-05-14 13:49:00+01:00</th>\n", | |
" <td>0.500000</td>\n", | |
" <td>-0.933222</td>\n", | |
" <td>-0.933222</td>\n", | |
" <td>-0.933222</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1996-10-16 16:04:00+01:00</th>\n", | |
" <td>-0.166667</td>\n", | |
" <td>0.500000</td>\n", | |
" <td>-0.933222</td>\n", | |
" <td>-0.933222</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1996-10-23 14:59:00+01:00</th>\n", | |
" <td>0.200000</td>\n", | |
" <td>-0.166667</td>\n", | |
" <td>0.500000</td>\n", | |
" <td>-0.933222</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2009-04-06 14:21:00+01:00</th>\n", | |
" <td>-0.001499</td>\n", | |
" <td>-0.004478</td>\n", | |
" <td>-0.002976</td>\n", | |
" <td>0.005988</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2009-04-06 14:22:00+01:00</th>\n", | |
" <td>-0.001502</td>\n", | |
" <td>-0.001499</td>\n", | |
" <td>-0.004478</td>\n", | |
" <td>-0.002976</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2009-04-06 14:23:00+01:00</th>\n", | |
" <td>0.001504</td>\n", | |
" <td>-0.001502</td>\n", | |
" <td>-0.001499</td>\n", | |
" <td>-0.004478</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2009-04-06 14:24:00+01:00</th>\n", | |
" <td>-0.000751</td>\n", | |
" <td>0.001504</td>\n", | |
" <td>-0.001502</td>\n", | |
" <td>-0.001499</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2009-04-06 14:25:00+01:00</th>\n", | |
" <td>0.001503</td>\n", | |
" <td>-0.000751</td>\n", | |
" <td>0.001504</td>\n", | |
" <td>-0.001502</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>1048575 rows × 4 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" cobit_returns cobit_lag1 cobit_lag2 cobit_lag3\n", | |
"Date-Time \n", | |
"1996-03-14 15:37:00+00:00 -0.933222 -0.933222 -0.933222 -0.933222\n", | |
"1996-03-20 14:51:00+00:00 -0.933222 -0.933222 -0.933222 -0.933222\n", | |
"1996-05-14 13:49:00+01:00 0.500000 -0.933222 -0.933222 -0.933222\n", | |
"1996-10-16 16:04:00+01:00 -0.166667 0.500000 -0.933222 -0.933222\n", | |
"1996-10-23 14:59:00+01:00 0.200000 -0.166667 0.500000 -0.933222\n", | |
"... ... ... ... ...\n", | |
"2009-04-06 14:21:00+01:00 -0.001499 -0.004478 -0.002976 0.005988\n", | |
"2009-04-06 14:22:00+01:00 -0.001502 -0.001499 -0.004478 -0.002976\n", | |
"2009-04-06 14:23:00+01:00 0.001504 -0.001502 -0.001499 -0.004478\n", | |
"2009-04-06 14:24:00+01:00 -0.000751 0.001504 -0.001502 -0.001499\n", | |
"2009-04-06 14:25:00+01:00 0.001503 -0.000751 0.001504 -0.001502\n", | |
"\n", | |
"[1048575 rows x 4 columns]" | |
] | |
}, | |
"execution_count": 110, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Display `data` again after the DataFrame re-wrap\n", | |
"data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 101, | |
"id": "024fd39a", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"cobit_returns 1\n", | |
"cobit_lag1 2\n", | |
"cobit_lag2 3\n", | |
"cobit_lag3 4\n", | |
"dtype: int64" | |
] | |
}, | |
"execution_count": 101, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Check for invalid values\n", | |
"data.isnull().sum()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 102, | |
"id": "2a50cd93", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"data = data.fillna(method='ffill').fillna(method='bfill')\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 103, | |
"id": "ab0f28c5", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"cobit_returns 0\n", | |
"cobit_lag1 0\n", | |
"cobit_lag2 0\n", | |
"cobit_lag3 0\n", | |
"dtype: int64\n" | |
] | |
} | |
], | |
"source": [ | |
"# Check for invalid values\n", | |
"print(np.isnan(data).sum())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 104, | |
"id": "3514c850", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"786431" | |
] | |
}, | |
"execution_count": 104, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"##Split data into train and test\n", | |
"# Total dataset length\n", | |
"dataset_length = data.shape[0]\n", | |
"# Training dataset length\n", | |
"split = int(dataset_length * 0.75)\n", | |
"split\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 106, | |
"id": "dc5e63d9", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(786431, 4)\n", | |
"(262144, 4)\n" | |
] | |
} | |
], | |
"source": [ | |
"# Splitting the data into train and test datasets\n", | |
"train = data[:split]\n", | |
"test = data[split:]\n", | |
"# Print the size of the train and test dataset\n", | |
"print(train.shape)\n", | |
"print(test.shape)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 111, | |
"id": "a9f6c983", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#create x and y variables for both test and train data\n", | |
"x_train = train.drop('cobit_returns', axis=1)\n", | |
"y_train = train['cobit_returns']\n", | |
"x_valid = test.drop('cobit_returns', axis=1)\n", | |
"y_valid = test['cobit_returns']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 113, | |
"id": "a6f4cce0", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Fit the OLS model on the training data\n", | |
"# NOTE(review): x_train holds only the lag columns (no constant column is added),\n", | |
"# so this regression has no intercept and the summary reports uncentered R-squared.\n", | |
"model = OLS(y_train, x_train).fit()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 114, | |
"id": "851f4b1d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" OLS Regression Results \n", | |
"=======================================================================================\n", | |
"Dep. Variable: cobit_returns R-squared (uncentered): 0.000\n", | |
"Model: OLS Adj. R-squared (uncentered): -0.000\n", | |
"Method: Least Squares F-statistic: 1.103e-07\n", | |
"Date: Thu, 13 Apr 2023 Prob (F-statistic): 1.00\n", | |
"Time: 23:10:19 Log-Likelihood: -7.4506e+06\n", | |
"No. Observations: 786431 AIC: 1.490e+07\n", | |
"Df Residuals: 786428 BIC: 1.490e+07\n", | |
"Df Model: 3 \n", | |
"Covariance Type: nonrobust \n", | |
"==============================================================================\n", | |
" coef std err t P>|t| [0.025 0.975]\n", | |
"------------------------------------------------------------------------------\n", | |
"cobit_lag1 -6.485e-07 0.001 -0.001 1.000 -0.002 0.002\n", | |
"cobit_lag2 2.135e-10 0.001 1.89e-07 1.000 -0.002 0.002\n", | |
"cobit_lag3 9.858e-10 0.001 8.74e-07 1.000 -0.002 0.002\n", | |
"================================================================================\n", | |
"Omnibus: 5291579.778 Durbin-Watson: 2.000\n", | |
"Prob(Omnibus): 0.000 Jarque-Bera (JB): 7898864025013409.000\n", | |
"Skew: 666.114 Prob(JB): 0.00\n", | |
"Kurtosis: 490974.213 Cond. No. 1.00\n", | |
"================================================================================\n", | |
"\n", | |
"Notes:\n", | |
"[1] R² is computed without centering (uncentered) since the model does not contain a constant.\n", | |
"[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n" | |
] | |
} | |
], | |
"source": [ | |
"# Print a summary of the model\n", | |
"print(model.summary())" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "376b0f72", | |
"metadata": {}, | |
"source": [ | |
"# LASSO Model for predictions" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 143, | |
"id": "38341102", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Importing the dataset\n", | |
"data = pd.read_csv('ftse100.csv')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 144, | |
"id": "5be2e40c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# calculate the returns\n", | |
"data['return'] = data['Last'].pct_change()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 145, | |
"id": "57a55326", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"data.dropna(inplace=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 146, | |
"id": "6e3e4804", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# split data into train and test sets\n", | |
"train_size = int(len(data) * 0.8)\n", | |
"train = data.iloc[:train_size]\n", | |
"test = data.iloc[train_size:]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 147, | |
"id": "931378ec", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# create rolling window features for train set\n", | |
"window_size = 30 # 30 minutes\n", | |
"X_train = []\n", | |
"y_train = []\n", | |
"for i in range(window_size, len(train)):\n", | |
" X_train.append(train.iloc[i-window_size:i]['return'].values)\n", | |
" y_train.append(train.iloc[i]['Last'])\n", | |
"X_train, y_train = np.array(X_train), np.array(y_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 149, | |
"id": "56ad86c4", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Lasso(alpha=0.1)" | |
] | |
}, | |
"execution_count": 149, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#Create the Lasso model\n", | |
"lasso = Lasso(alpha=0.1)\n", | |
"\n", | |
"# Train the model\n", | |
"lasso.fit(X_train, y_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 150, | |
"id": "89bd1ab1", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# create rolling window features for test set\n", | |
"window_size = 30 # 3 minutes\n", | |
"X_test = []\n", | |
"y_test = []\n", | |
"for i in range(window_size, len(test)):\n", | |
" X_test.append(test.iloc[i-window_size:i]['return'].values)\n", | |
" y_test.append(test.iloc[i]['Last'])\n", | |
"X_test, y_test = np.array(X_test), np.array(y_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 153, | |
"id": "305a576c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# make predictions for test set\n", | |
"y_pred = lasso.predict(X_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 152, | |
"id": "c95a0323", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# calculate the mean absolute error\n", | |
"mae = np.mean(np.abs(y_test - y_pred))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 131, | |
"id": "20ff5901", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1132.848751805076" | |
] | |
}, | |
"execution_count": 131, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"mae" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 160, | |
"id": "e615c7e1", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Mean absolute error: 1132.87\n" | |
] | |
} | |
], | |
"source": [ | |
"print(f'Mean absolute error: {mae:.2f}')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 156, | |
"id": "d63a9b72", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.metrics import r2_score" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 157, | |
"id": "20e82abc", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Lasso(alpha=0.1)\n", | |
"r^2 on test data : -0.410549\n" | |
] | |
} | |
], | |
"source": [ | |
"r2_score_lasso = r2_score(y_test,y_pred)\n", | |
"print(lasso)\n", | |
"print(\"r^2 on test data : %f\" % r2_score_lasso)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "3e4314d4", | |
"metadata": {}, | |
"source": [ | |
"## Another way of fitting LASSO" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 171, | |
"id": "2e877a6c", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Lasso(alpha=0.1)" | |
] | |
}, | |
"execution_count": 171, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Train the model\n", | |
"lasso.fit(X_train, y_train)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 174, | |
"id": "b688bc93", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Lasso(alpha=0.1)" | |
] | |
}, | |
"execution_count": 174, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Fit the Lasso model\n", | |
"lasso_model = Lasso(alpha=0.1)\n", | |
"lasso_model.fit(X_train, y_train)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 175, | |
"id": "285f6dfa", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Get the estimated coefficients\n", | |
"coefficients = lasso_model.coef_" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 177, | |
"id": "b3709b52", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([0.00000000e+00, 5.27432406e-05, 5.21421806e-05, 5.37356777e-05])" | |
] | |
}, | |
"execution_count": 177, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"coefficients" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 178, | |
"id": "caa16594", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Get the feature names from the original dataframe\n", | |
"feature_names = df.columns[:-1]\n", | |
"\n", | |
"# Pair feature names with their coefficients\n", | |
"coef_pairs = zip(feature_names, lasso_model.coef_)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 179, | |
"id": "20cd034b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"#RIC: 0.0\n", | |
"Domain: 5.2743240560278315e-05\n", | |
"Date-Time: 5.2142180572714973e-05\n", | |
"Type: 5.3735677737564e-05\n" | |
] | |
} | |
], | |
"source": [ | |
"# Print the coefficients\n", | |
"for feature, coef in coef_pairs:\n", | |
" print(f\"{feature}: {coef}\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 180, | |
"id": "3bd0c430", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[0.00000000e+00 5.27432406e-05 5.21421806e-05 5.37356777e-05]\n" | |
] | |
} | |
], | |
"source": [ | |
"# Get estimated coefficients\n", | |
"coef = lasso.coef_\n", | |
"print(coef)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "18ecdd68", | |
"metadata": {}, | |
"source": [ | |
"# Out of sample forecast-Benchmark-Lasso" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "a624d03d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"from sklearn.linear_model import LinearRegression, LassoCV\n", | |
"from sklearn.metrics import r2_score" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "5a2f2436", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Importing the dataset\n", | |
"data = pd.read_csv('ftse100.csv')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "e04e4311", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Define the time window and lag parameters\n", | |
"est_window = 30 # minutes\n", | |
"pred_window = 3 # minutes\n", | |
"lags = 3 # number of lags to include in OLS regression" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "47620047", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>#RIC</th>\n", | |
" <th>Domain</th>\n", | |
" <th>Date-Time</th>\n", | |
" <th>Type</th>\n", | |
" <th>Last</th>\n", | |
" <th>Volume</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-03-14T15:37:00.000000000Z</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>299.5</td>\n", | |
" <td>25000.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-03-20T14:51:00.000000000Z</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>20.0</td>\n", | |
" <td>100000.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-05-14T13:49:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>30.0</td>\n", | |
" <td>6000.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-10-16T16:04:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>25.0</td>\n", | |
" <td>50000.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-10-23T14:59:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>30.0</td>\n", | |
" <td>23000.0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" #RIC Domain Date-Time Type \\\n", | |
"0 AAF.L Market Price 1996-03-14T15:37:00.000000000Z Intraday 1Min \n", | |
"1 AAF.L Market Price 1996-03-20T14:51:00.000000000Z Intraday 1Min \n", | |
"2 AAF.L Market Price 1996-05-14T13:49:00.000000000+01 Intraday 1Min \n", | |
"3 AAF.L Market Price 1996-10-16T16:04:00.000000000+01 Intraday 1Min \n", | |
"4 AAF.L Market Price 1996-10-23T14:59:00.000000000+01 Intraday 1Min \n", | |
"\n", | |
" Last Volume \n", | |
"0 299.5 25000.0 \n", | |
"1 20.0 100000.0 \n", | |
"2 30.0 6000.0 \n", | |
"3 25.0 50000.0 \n", | |
"4 30.0 23000.0 " | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"data.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "85b54311", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Calculate bar-to-bar returns of 'Last' (rows are 1-minute intraday bars, not daily data)\n", | |
"data['returns'] = data['Last'].pct_change()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"id": "951f1cf0", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>#RIC</th>\n", | |
" <th>Domain</th>\n", | |
" <th>Date-Time</th>\n", | |
" <th>Type</th>\n", | |
" <th>Last</th>\n", | |
" <th>Volume</th>\n", | |
" <th>returns</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-03-14T15:37:00.000000000Z</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>299.5</td>\n", | |
" <td>25000.0</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-03-20T14:51:00.000000000Z</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>20.0</td>\n", | |
" <td>100000.0</td>\n", | |
" <td>-0.933222</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-05-14T13:49:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>30.0</td>\n", | |
" <td>6000.0</td>\n", | |
" <td>0.500000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-10-16T16:04:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>25.0</td>\n", | |
" <td>50000.0</td>\n", | |
" <td>-0.166667</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-10-23T14:59:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>30.0</td>\n", | |
" <td>23000.0</td>\n", | |
" <td>0.200000</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" #RIC Domain Date-Time Type \\\n", | |
"0 AAF.L Market Price 1996-03-14T15:37:00.000000000Z Intraday 1Min \n", | |
"1 AAF.L Market Price 1996-03-20T14:51:00.000000000Z Intraday 1Min \n", | |
"2 AAF.L Market Price 1996-05-14T13:49:00.000000000+01 Intraday 1Min \n", | |
"3 AAF.L Market Price 1996-10-16T16:04:00.000000000+01 Intraday 1Min \n", | |
"4 AAF.L Market Price 1996-10-23T14:59:00.000000000+01 Intraday 1Min \n", | |
"\n", | |
" Last Volume returns \n", | |
"0 299.5 25000.0 NaN \n", | |
"1 20.0 100000.0 -0.933222 \n", | |
"2 30.0 6000.0 0.500000 \n", | |
"3 25.0 50000.0 -0.166667 \n", | |
"4 30.0 23000.0 0.200000 " | |
] | |
}, | |
"execution_count": 17, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"data.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"id": "6af6ebda", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"786431" | |
] | |
}, | |
"execution_count": 18, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"##Split data into train and test\n", | |
"# Total dataset length\n", | |
"dataset_length = data.shape[0]\n", | |
"# Training dataset length\n", | |
"split = int(dataset_length * 0.75)\n", | |
"split" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"id": "1bf2a7c0", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(786431, 7)\n", | |
"(262144, 7)\n" | |
] | |
} | |
], | |
"source": [ | |
"# Splitting the data into train and test datasets\n", | |
"train = data[:split]\n", | |
"test = data[split:]\n", | |
"# Print the size of the train and test dataset\n", | |
"print(train.shape)\n", | |
"print(test.shape)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"id": "fdcbfe4e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>#RIC</th>\n", | |
" <th>Domain</th>\n", | |
" <th>Date-Time</th>\n", | |
" <th>Type</th>\n", | |
" <th>Last</th>\n", | |
" <th>Volume</th>\n", | |
" <th>returns</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-03-14T15:37:00.000000000Z</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>299.5</td>\n", | |
" <td>25000.0</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-03-20T14:51:00.000000000Z</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>20.0</td>\n", | |
" <td>100000.0</td>\n", | |
" <td>-0.933222</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-05-14T13:49:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>30.0</td>\n", | |
" <td>6000.0</td>\n", | |
" <td>0.500000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-10-16T16:04:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>25.0</td>\n", | |
" <td>50000.0</td>\n", | |
" <td>-0.166667</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>AAF.L</td>\n", | |
" <td>Market Price</td>\n", | |
" <td>1996-10-23T14:59:00.000000000+01</td>\n", | |
" <td>Intraday 1Min</td>\n", | |
" <td>30.0</td>\n", | |
" <td>23000.0</td>\n", | |
" <td>0.200000</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" #RIC Domain Date-Time Type \\\n", | |
"0 AAF.L Market Price 1996-03-14T15:37:00.000000000Z Intraday 1Min \n", | |
"1 AAF.L Market Price 1996-03-20T14:51:00.000000000Z Intraday 1Min \n", | |
"2 AAF.L Market Price 1996-05-14T13:49:00.000000000+01 Intraday 1Min \n", | |
"3 AAF.L Market Price 1996-10-16T16:04:00.000000000+01 Intraday 1Min \n", | |
"4 AAF.L Market Price 1996-10-23T14:59:00.000000000+01 Intraday 1Min \n", | |
"\n", | |
" Last Volume returns \n", | |
"0 299.5 25000.0 NaN \n", | |
"1 20.0 100000.0 -0.933222 \n", | |
"2 30.0 6000.0 0.500000 \n", | |
"3 25.0 50000.0 -0.166667 \n", | |
"4 30.0 23000.0 0.200000 " | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"train.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"id": "68e70619", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Create the OLS predictors matrix for the training data\n", | |
"ols_predictors = []\n", | |
"for i in range(lags):\n", | |
" ols_predictors.append(train['returns'].shift(i+1))\n", | |
"ols_predictors = pd.concat(ols_predictors, axis=1)\n", | |
"ols_predictors = ols_predictors.dropna()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 55, | |
"id": "df7f15f4", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>returns</th>\n", | |
" <th>returns</th>\n", | |
" <th>returns</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>-0.166667</td>\n", | |
" <td>0.500000</td>\n", | |
" <td>-0.933222</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>0.200000</td>\n", | |
" <td>-0.166667</td>\n", | |
" <td>0.500000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>-0.100000</td>\n", | |
" <td>0.200000</td>\n", | |
" <td>-0.166667</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>0.018519</td>\n", | |
" <td>-0.100000</td>\n", | |
" <td>0.200000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>-0.054545</td>\n", | |
" <td>0.018519</td>\n", | |
" <td>-0.100000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>786426</th>\n", | |
" <td>0.002088</td>\n", | |
" <td>-0.001584</td>\n", | |
" <td>0.000334</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>786427</th>\n", | |
" <td>-0.000417</td>\n", | |
" <td>0.002088</td>\n", | |
" <td>-0.001584</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>786428</th>\n", | |
" <td>-0.000417</td>\n", | |
" <td>-0.000417</td>\n", | |
" <td>0.002088</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>786429</th>\n", | |
" <td>-0.000417</td>\n", | |
" <td>-0.000417</td>\n", | |
" <td>-0.000417</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>786430</th>\n", | |
" <td>-0.002503</td>\n", | |
" <td>-0.000417</td>\n", | |
" <td>-0.000417</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>786427 rows × 3 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" returns returns returns\n", | |
"4 -0.166667 0.500000 -0.933222\n", | |
"5 0.200000 -0.166667 0.500000\n", | |
"6 -0.100000 0.200000 -0.166667\n", | |
"7 0.018519 -0.100000 0.200000\n", | |
"8 -0.054545 0.018519 -0.100000\n", | |
"... ... ... ...\n", | |
"786426 0.002088 -0.001584 0.000334\n", | |
"786427 -0.000417 0.002088 -0.001584\n", | |
"786428 -0.000417 -0.000417 0.002088\n", | |
"786429 -0.000417 -0.000417 -0.000417\n", | |
"786430 -0.002503 -0.000417 -0.000417\n", | |
"\n", | |
"[786427 rows x 3 columns]" | |
] | |
}, | |
"execution_count": 55, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ols_predictors" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"id": "dce3d141", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Create the OLS response vector for the training data\n", | |
"ols_response = train['returns'].shift(-4)\n", | |
"ols_response = ols_response.dropna()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 57, | |
"id": "e05d801f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0 0.200000\n", | |
"1 -0.100000\n", | |
"2 0.018519\n", | |
"3 -0.054545\n", | |
"4 -0.038462\n", | |
" ... \n", | |
"786422 -0.000417\n", | |
"786423 -0.000417\n", | |
"786424 -0.000417\n", | |
"786425 -0.002503\n", | |
"786426 0.000000\n", | |
"Name: returns, Length: 786427, dtype: float64" | |
] | |
}, | |
"execution_count": 57, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ols_response" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"id": "84aaa7fb", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"LinearRegression()" | |
] | |
}, | |
"execution_count": 27, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Fit the OLS regression model\n", | |
"ols_model = LinearRegression()\n", | |
"ols_model.fit(ols_predictors, ols_response)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"id": "122616f9", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Create the LASSO predictors matrix for the prediction data\n", | |
"lasso_predictors = []\n", | |
"for i in range(lags):\n", | |
" lasso_predictors.append(test['returns'].shift(i+1))\n", | |
"lasso_predictors = pd.concat(lasso_predictors, axis=1)\n", | |
"lasso_predictors = lasso_predictors.dropna()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"id": "ea630011", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Apply the OLS coefficients to the LASSO predictors matrix\n", | |
"lasso_predictors = lasso_predictors.mul(ols_model.coef_, axis=1).sum(axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"id": "0dd5dee7", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Normalize the LASSO predictors to have mean zero and unit variance\n", | |
"lasso_predictors = (lasso_predictors - lasso_predictors.mean()) / lasso_predictors.std()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"id": "38d1cd79", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Create the LASSO response vector for the prediction data\n", | |
"lasso_response = test['returns'].shift(-3)\n", | |
"lasso_response = lasso_response.dropna()\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"id": "e89937d7", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"LassoCV(cv=5)" | |
] | |
}, | |
"execution_count": 37, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Fit the LASSO regression model\n", | |
"# NOTE(review): lasso_predictors and lasso_response are both built from `test`,\n", | |
"# so this model is fitted on the hold-out rows themselves - confirm this is intended.\n", | |
"# The predictor is a single column, hence the reshape to (n_rows, 1).\n", | |
"lasso_model = LassoCV(cv=5)\n", | |
"lasso_model.fit(lasso_predictors.values.reshape(-1, 1), lasso_response)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 43, | |
"id": "ee2f77e1", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Make the 1-minute-ahead return forecast for each stock\n", | |
"# NOTE(review): nothing is stored per iteration - train_data and pred_data_point\n", | |
"# are overwritten on every pass and forecasts stays an empty dict. For i < lags\n", | |
"# the slice start i - lags is negative, so it does not select the preceding rows.\n", | |
"forecasts = {}\n", | |
"for i in range(len(test)):\n", | |
" # Extract the relevant data for the current prediction\n", | |
" train_data = test.iloc[i-lags:i]\n", | |
" pred_data_point = test.iloc[i][['returns']]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"id": "285de7ac", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Make the 1-minute-ahead return forecast for each stock\n", | |
"forecasts = {}\n", | |
"for i in range(len(test)):\n", | |
" lasso_predictor = test['returns'].iloc[-lags:].mul(ols_model.coef_).sum()\n", | |
" lasso_predictor = (lasso_predictor - lasso_predictors.mean()) / lasso_predictors.std()\n", | |
" forecast = ols_model.intercept_ + lasso_model.predict([[lasso_predictor]])\n", | |
" forecasts = forecast[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 53, | |
"id": "604615ed", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"ename": "ValueError", | |
"evalue": "Found input variables with inconsistent numbers of samples: [262142, 1]", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m/tmp/ipykernel_2443267/759002749.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0my_true\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'returns'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshift\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdropna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0my_pred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mforecasts\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mr2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mr2_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_true\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Out-of-sample R-squared: {r2:.4f}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m/opt/ohpc/pub/apps/anaconda/lib/python3.9/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36minner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mall_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;31m# extra_args > 0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m/opt/ohpc/pub/apps/anaconda/lib/python3.9/site-packages/sklearn/metrics/_regression.py\u001b[0m in \u001b[0;36mr2_score\u001b[0;34m(y_true, y_pred, sample_weight, multioutput)\u001b[0m\n\u001b[1;32m 674\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m3.0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 675\u001b[0m \"\"\"\n\u001b[0;32m--> 676\u001b[0;31m y_type, y_true, y_pred, multioutput = _check_reg_targets(\n\u001b[0m\u001b[1;32m 677\u001b[0m y_true, y_pred, multioutput)\n\u001b[1;32m 678\u001b[0m \u001b[0mcheck_consistent_length\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_true\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample_weight\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m/opt/ohpc/pub/apps/anaconda/lib/python3.9/site-packages/sklearn/metrics/_regression.py\u001b[0m in \u001b[0;36m_check_reg_targets\u001b[0;34m(y_true, y_pred, multioutput, dtype)\u001b[0m\n\u001b[1;32m 86\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mdtype\u001b[0m \u001b[0margument\u001b[0m \u001b[0mpassed\u001b[0m \u001b[0mto\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 87\u001b[0m \"\"\"\n\u001b[0;32m---> 88\u001b[0;31m \u001b[0mcheck_consistent_length\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_true\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 89\u001b[0m \u001b[0my_true\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_true\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mensure_2d\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[0my_pred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_pred\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mensure_2d\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m/opt/ohpc/pub/apps/anaconda/lib/python3.9/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_consistent_length\u001b[0;34m(*arrays)\u001b[0m\n\u001b[1;32m 317\u001b[0m \u001b[0muniques\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munique\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlengths\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 318\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0muniques\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 319\u001b[0;31m raise ValueError(\"Found input variables with inconsistent numbers of\"\n\u001b[0m\u001b[1;32m 320\u001b[0m \" samples: %r\" % [int(l) for l in lengths])\n\u001b[1;32m 321\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;31mValueError\u001b[0m: Found input variables with inconsistent numbers of samples: [262142, 1]" | |
] | |
} | |
], | |
"source": [ | |
"# Evaluate the out-of-sample performance\n", | |
"y_true = test['returns'].shift(-2).dropna()\n", | |
"y_pred = pd.Series(forecasts)\n", | |
"r2 = r2_score(y_true, y_pred)\n", | |
"print(f\"Out-of-sample R-squared: {r2:.4f}\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 58, | |
"id": "f5ce5f69", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0 6.410831\n", | |
"dtype: float64" | |
] | |
}, | |
"execution_count": 58, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Display the forecast Series built from `forecasts` (values, index and dtype).\n", | |
"y_pred" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "2677ec0d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.7" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment