Skip to content

Instantly share code, notes, and snippets.

@riqbal-k
Created April 19, 2023 13:08
Show Gist options
  • Save riqbal-k/f6cb4c53425e4b0a97369503fe893738 to your computer and use it in GitHub Desktop.
Save riqbal-k/f6cb4c53425e4b0a97369503fe893738 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "f4ed2ffd",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_2472429/1036767244.py:10: FutureWarning: The pandas.datetime class is deprecated and will be removed from pandas in a future version. Import from datetime module instead.\n",
" from pandas import datetime\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import datetime as dt\n",
"import alpha_vantage as av\n",
"import statsmodels\n",
"from pytrends.request import TrendReq\n",
"from datetime import datetime\n",
"from alpha_vantage.timeseries import TimeSeries\n",
"from pandas import datetime\n",
"import math, time\n",
"import itertools\n",
"import datetime\n",
"from operator import itemgetter\n",
"from sklearn.metrics import mean_squared_error\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"from math import sqrt\n",
"import torch\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8f2bca13",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import cross_val_score\n",
"from sklearn.model_selection import RepeatedKFold\n",
"from sklearn.linear_model import Lasso\n",
"from sklearn.preprocessing import StandardScaler\n",
"import statsmodels.api as sm\n",
"from statsmodels.api import OLS\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.linear_model import LassoCV\n",
"from sklearn.metrics import r2_score"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "7a285b32",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'/home/mnf13'"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.getcwd()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "d42bc0a4",
"metadata": {},
"outputs": [],
"source": [
"# Importing the dataset\n",
"data = pd.read_csv('ftse100.csv')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "f5a7265a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>#RIC</th>\n",
" <th>Domain</th>\n",
" <th>Date-Time</th>\n",
" <th>Type</th>\n",
" <th>Last</th>\n",
" <th>Volume</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-03-14T15:37:00.000000000Z</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>299.5</td>\n",
" <td>25000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-03-20T14:51:00.000000000Z</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>20.0</td>\n",
" <td>100000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-05-14T13:49:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>30.0</td>\n",
" <td>6000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-10-16T16:04:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>25.0</td>\n",
" <td>50000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-10-23T14:59:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>30.0</td>\n",
" <td>23000.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" #RIC Domain Date-Time Type \\\n",
"0 AAF.L Market Price 1996-03-14T15:37:00.000000000Z Intraday 1Min \n",
"1 AAF.L Market Price 1996-03-20T14:51:00.000000000Z Intraday 1Min \n",
"2 AAF.L Market Price 1996-05-14T13:49:00.000000000+01 Intraday 1Min \n",
"3 AAF.L Market Price 1996-10-16T16:04:00.000000000+01 Intraday 1Min \n",
"4 AAF.L Market Price 1996-10-23T14:59:00.000000000+01 Intraday 1Min \n",
"\n",
" Last Volume \n",
"0 299.5 25000.0 \n",
"1 20.0 100000.0 \n",
"2 30.0 6000.0 \n",
"3 25.0 50000.0 \n",
"4 30.0 23000.0 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "56bf44be",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>#RIC</th>\n",
" <th>Domain</th>\n",
" <th>Date-Time</th>\n",
" <th>Type</th>\n",
" <th>Last</th>\n",
" <th>Volume</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1048570</th>\n",
" <td>AAL.L</td>\n",
" <td>Market Price</td>\n",
" <td>2009-04-06T14:21:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>1332.0</td>\n",
" <td>16166.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1048571</th>\n",
" <td>AAL.L</td>\n",
" <td>Market Price</td>\n",
" <td>2009-04-06T14:22:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>1330.0</td>\n",
" <td>19021.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1048572</th>\n",
" <td>AAL.L</td>\n",
" <td>Market Price</td>\n",
" <td>2009-04-06T14:23:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>1332.0</td>\n",
" <td>45759.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1048573</th>\n",
" <td>AAL.L</td>\n",
" <td>Market Price</td>\n",
" <td>2009-04-06T14:24:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>1331.0</td>\n",
" <td>13481.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1048574</th>\n",
" <td>AAL.L</td>\n",
" <td>Market Price</td>\n",
" <td>2009-04-06T14:25:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>1333.0</td>\n",
" <td>16301.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" #RIC Domain Date-Time Type \\\n",
"1048570 AAL.L Market Price 2009-04-06T14:21:00.000000000+01 Intraday 1Min \n",
"1048571 AAL.L Market Price 2009-04-06T14:22:00.000000000+01 Intraday 1Min \n",
"1048572 AAL.L Market Price 2009-04-06T14:23:00.000000000+01 Intraday 1Min \n",
"1048573 AAL.L Market Price 2009-04-06T14:24:00.000000000+01 Intraday 1Min \n",
"1048574 AAL.L Market Price 2009-04-06T14:25:00.000000000+01 Intraday 1Min \n",
"\n",
" Last Volume \n",
"1048570 1332.0 16166.0 \n",
"1048571 1330.0 19021.0 \n",
"1048572 1332.0 45759.0 \n",
"1048573 1331.0 13481.0 \n",
"1048574 1333.0 16301.0 "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.tail()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "f510c6fc",
"metadata": {},
"outputs": [],
"source": [
"# Calculate the daily returns\n",
"data['cobit_returns'] = data['Last'].pct_change()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "78e55916",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>#RIC</th>\n",
" <th>Domain</th>\n",
" <th>Date-Time</th>\n",
" <th>Type</th>\n",
" <th>Last</th>\n",
" <th>Volume</th>\n",
" <th>cobit_returns</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-03-14T15:37:00.000000000Z</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>299.5</td>\n",
" <td>25000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-03-20T14:51:00.000000000Z</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>20.0</td>\n",
" <td>100000.0</td>\n",
" <td>-0.933222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-05-14T13:49:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>30.0</td>\n",
" <td>6000.0</td>\n",
" <td>0.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-10-16T16:04:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>25.0</td>\n",
" <td>50000.0</td>\n",
" <td>-0.166667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-10-23T14:59:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>30.0</td>\n",
" <td>23000.0</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" #RIC Domain Date-Time Type \\\n",
"0 AAF.L Market Price 1996-03-14T15:37:00.000000000Z Intraday 1Min \n",
"1 AAF.L Market Price 1996-03-20T14:51:00.000000000Z Intraday 1Min \n",
"2 AAF.L Market Price 1996-05-14T13:49:00.000000000+01 Intraday 1Min \n",
"3 AAF.L Market Price 1996-10-16T16:04:00.000000000+01 Intraday 1Min \n",
"4 AAF.L Market Price 1996-10-23T14:59:00.000000000+01 Intraday 1Min \n",
"\n",
" Last Volume cobit_returns \n",
"0 299.5 25000.0 NaN \n",
"1 20.0 100000.0 -0.933222 \n",
"2 30.0 6000.0 0.500000 \n",
"3 25.0 50000.0 -0.166667 \n",
"4 30.0 23000.0 0.200000 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "3fa2604f",
"metadata": {},
"outputs": [],
"source": [
"data['cobit_lag1'] = data['cobit_returns'].shift(1)\n",
"data['cobit_lag2'] = data['cobit_returns'].shift(2)\n",
"data['cobit_lag3'] = data['cobit_returns'].shift(3)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "d8662ebd",
"metadata": {},
"outputs": [],
"source": [
"data = data[[\"cobit_returns\",\"cobit_lag1\",\"cobit_lag2\",\"cobit_lag3\"]]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "66825e77",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"cobit_returns 1\n",
"cobit_lag1 2\n",
"cobit_lag2 3\n",
"cobit_lag3 4\n",
"dtype: int64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check for invalid values\n",
"data.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "fd77cda7",
"metadata": {},
"outputs": [],
"source": [
"data = data.fillna(method='ffill').fillna(method='bfill')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "08398678",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cobit_returns 0\n",
"cobit_lag1 0\n",
"cobit_lag2 0\n",
"cobit_lag3 0\n",
"dtype: int64\n"
]
}
],
"source": [
"# Check for invalid values\n",
"print(np.isnan(data).sum())"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "189136b6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"786431"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"##Split data into train and test\n",
"# Total dataset length\n",
"dataset_length = data.shape[0]\n",
"# Training dataset length\n",
"split = int(dataset_length * 0.75)\n",
"split"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "f380a52f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(786431, 4)\n",
"(262144, 4)\n"
]
}
],
"source": [
"# Splitiing the X and y into train and test datasets\n",
"train = data[:split]\n",
"test = data[split:]\n",
"# Print the size of the train and test dataset\n",
"print(train.shape)\n",
"print(test.shape)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "22d95e15",
"metadata": {},
"outputs": [],
"source": [
"#create x and y variables for both test and train data\n",
"x_train = train.drop('cobit_returns', axis=1)\n",
"y_train = train['cobit_returns']\n",
"x_test = test.drop('cobit_returns', axis=1)\n",
"y_test = test['cobit_returns']"
]
},
{
"cell_type": "markdown",
"id": "b509bc72",
"metadata": {},
"source": [
"# Lasso-Lasso-Out of smapkle predictions"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "dac9974d",
"metadata": {},
"outputs": [],
"source": [
"#Lasso on train data\n",
"alpha = 0.1\n",
"lasso = Lasso(alpha=alpha)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "44305077",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.6847539398318078\n",
"0.007022121544042248\n"
]
}
],
"source": [
"#prediction on test data\n",
"#check r2 on train data\n",
"model_lasso = Lasso(alpha=0.01)\n",
"\n",
"model_lasso.fit(x_test, y_test) \n",
"\n",
"pred_test_lasso= model_lasso.predict(x_test)\n",
"\n",
"print(np.sqrt(mean_squared_error(y_test,pred_test_lasso)))\n",
"\n",
"print(r2_score(y_test, pred_test_lasso))\n"
]
},
{
"cell_type": "markdown",
"id": "c61c1b32",
"metadata": {},
"source": [
"# Benchmark-Lasso-Out of smaple predictions"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "6a9cb29c",
"metadata": {},
"outputs": [],
"source": [
"# Define the time window and lag parameters\n",
"est_window = 30 # minutes\n",
"pred_window = 3 # minutes\n",
"lags = 3 # number of lags to include in OLS regression"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "b59937a5",
"metadata": {},
"outputs": [],
"source": [
"# Create the OLS predictors matrix for the training data\n",
"ols_predictors = []\n",
"for i in range(lags):\n",
" ols_predictors.append(train['cobit_returns'].shift(i+1))\n",
"ols_predictors = pd.concat(ols_predictors, axis=1)\n",
"ols_predictors = ols_predictors.dropna()\n"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "0d6867b4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>cobit_returns</th>\n",
" <th>cobit_returns</th>\n",
" <th>cobit_returns</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.500000</td>\n",
" <td>-0.933222</td>\n",
" <td>-0.933222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>-0.166667</td>\n",
" <td>0.500000</td>\n",
" <td>-0.933222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.200000</td>\n",
" <td>-0.166667</td>\n",
" <td>0.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>-0.100000</td>\n",
" <td>0.200000</td>\n",
" <td>-0.166667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>0.018519</td>\n",
" <td>-0.100000</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>786426</th>\n",
" <td>0.002088</td>\n",
" <td>-0.001584</td>\n",
" <td>0.000334</td>\n",
" </tr>\n",
" <tr>\n",
" <th>786427</th>\n",
" <td>-0.000417</td>\n",
" <td>0.002088</td>\n",
" <td>-0.001584</td>\n",
" </tr>\n",
" <tr>\n",
" <th>786428</th>\n",
" <td>-0.000417</td>\n",
" <td>-0.000417</td>\n",
" <td>0.002088</td>\n",
" </tr>\n",
" <tr>\n",
" <th>786429</th>\n",
" <td>-0.000417</td>\n",
" <td>-0.000417</td>\n",
" <td>-0.000417</td>\n",
" </tr>\n",
" <tr>\n",
" <th>786430</th>\n",
" <td>-0.002503</td>\n",
" <td>-0.000417</td>\n",
" <td>-0.000417</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>786428 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" cobit_returns cobit_returns cobit_returns\n",
"3 0.500000 -0.933222 -0.933222\n",
"4 -0.166667 0.500000 -0.933222\n",
"5 0.200000 -0.166667 0.500000\n",
"6 -0.100000 0.200000 -0.166667\n",
"7 0.018519 -0.100000 0.200000\n",
"... ... ... ...\n",
"786426 0.002088 -0.001584 0.000334\n",
"786427 -0.000417 0.002088 -0.001584\n",
"786428 -0.000417 -0.000417 0.002088\n",
"786429 -0.000417 -0.000417 -0.000417\n",
"786430 -0.002503 -0.000417 -0.000417\n",
"\n",
"[786428 rows x 3 columns]"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ols_predictors"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "71dfdbe5",
"metadata": {},
"outputs": [],
"source": [
"# Create the OLS response vector for the training data\n",
"ols_response = train['cobit_returns'].shift(-3)\n",
"ols_response = ols_response.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "5e7c471b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LinearRegression()"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit the OLS regression model\n",
"ols_model = LinearRegression()\n",
"ols_model.fit(ols_predictors, ols_response)"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "5f62ed0c",
"metadata": {},
"outputs": [],
"source": [
"# Create the LASSO predictors matrix for the prediction data\n",
"lasso_predictors = []\n",
"for i in range(lags):\n",
" lasso_predictors.append(test['cobit_returns'].shift(i+1))\n",
"lasso_predictors = pd.concat(lasso_predictors, axis=1)\n",
"lasso_predictors = lasso_predictors.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "6e076fce",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>cobit_returns</th>\n",
" <th>cobit_returns</th>\n",
" <th>cobit_returns</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>786434</th>\n",
" <td>-0.000418</td>\n",
" <td>0.000000</td>\n",
" <td>0.000836</td>\n",
" </tr>\n",
" <tr>\n",
" <th>786435</th>\n",
" <td>0.000000</td>\n",
" <td>-0.000418</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>786436</th>\n",
" <td>0.000318</td>\n",
" <td>0.000000</td>\n",
" <td>-0.000418</td>\n",
" </tr>\n",
" <tr>\n",
" <th>786437</th>\n",
" <td>0.000936</td>\n",
" <td>0.000318</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>786438</th>\n",
" <td>-0.001253</td>\n",
" <td>0.000936</td>\n",
" <td>0.000318</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1048570</th>\n",
" <td>-0.004478</td>\n",
" <td>-0.002976</td>\n",
" <td>0.005988</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1048571</th>\n",
" <td>-0.001499</td>\n",
" <td>-0.004478</td>\n",
" <td>-0.002976</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1048572</th>\n",
" <td>-0.001502</td>\n",
" <td>-0.001499</td>\n",
" <td>-0.004478</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1048573</th>\n",
" <td>0.001504</td>\n",
" <td>-0.001502</td>\n",
" <td>-0.001499</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1048574</th>\n",
" <td>-0.000751</td>\n",
" <td>0.001504</td>\n",
" <td>-0.001502</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>262141 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" cobit_returns cobit_returns cobit_returns\n",
"786434 -0.000418 0.000000 0.000836\n",
"786435 0.000000 -0.000418 0.000000\n",
"786436 0.000318 0.000000 -0.000418\n",
"786437 0.000936 0.000318 0.000000\n",
"786438 -0.001253 0.000936 0.000318\n",
"... ... ... ...\n",
"1048570 -0.004478 -0.002976 0.005988\n",
"1048571 -0.001499 -0.004478 -0.002976\n",
"1048572 -0.001502 -0.001499 -0.004478\n",
"1048573 0.001504 -0.001502 -0.001499\n",
"1048574 -0.000751 0.001504 -0.001502\n",
"\n",
"[262141 rows x 3 columns]"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lasso_predictors"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "5ba67670",
"metadata": {},
"outputs": [],
"source": [
"# Apply the OLS coefficients to the LASSO predictors matrix\n",
"lasso_predictors = lasso_predictors.mul(ols_model.coef_, axis=1).sum(axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "2121007f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"786434 -1.460073e-09\n",
"786435 1.728927e-09\n",
"786436 2.079490e-10\n",
"786437 -5.795036e-09\n",
"786438 8.075894e-10\n",
" ... \n",
"1048570 8.973827e-09\n",
"1048571 3.801197e-08\n",
"1048572 3.191100e-08\n",
"1048573 5.216834e-09\n",
"1048574 3.582629e-09\n",
"Length: 262141, dtype: float64"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lasso_predictors"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "8b33b0cf",
"metadata": {},
"outputs": [],
"source": [
"# Normalize the LASSO predictors to have mean zero and unit variance\n",
"lasso_predictors = (lasso_predictors - lasso_predictors.mean()) / lasso_predictors.std()"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "5eeb2c90",
"metadata": {},
"outputs": [],
"source": [
"# Create the LASSO response vector for the prediction data\n",
"lasso_response = test['cobit_returns'].shift(-3)\n",
"lasso_response = lasso_response.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "f69cf6de",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"786431 0.000000\n",
"786432 -0.000418\n",
"786433 0.000000\n",
"786434 0.000318\n",
"786435 0.000936\n",
" ... \n",
"1048569 -0.001499\n",
"1048570 -0.001502\n",
"1048571 0.001504\n",
"1048572 -0.000751\n",
"1048573 0.001503\n",
"Name: cobit_returns, Length: 262143, dtype: float64"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lasso_response"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "2f1db3b1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LassoCV(cv=5)"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit the LASSO regression model\n",
"lasso_model = LassoCV(cv=5)\n",
"lasso_model.fit(lasso_predictors.values.reshape(-1, 1), lasso_response)"
]
},
{
"cell_type": "code",
"execution_count": 74,
"id": "0729020d",
"metadata": {},
"outputs": [],
"source": [
"# Make the 1-minute-ahead return forecast for each stock\n",
"forecasts = []\n",
"for i in range(lags, len(test)):\n",
" lasso_predictor = test.iloc[i-lags:i]['cobit_returns'].mul(ols_model.coef_).sum()\n",
" lasso_predictor = (lasso_predictor - lasso_predictors.mean()) / lasso_predictors.std()\n",
" forecast = ols_model.intercept_ + lasso_model.predict([[lasso_predictor]])\n",
" forecasts.append(forecast[0])"
]
},
{
"cell_type": "code",
"execution_count": 76,
"id": "043df139",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Out-of-sample R-squared: -86.9036\n"
]
}
],
"source": [
"# Evaluate the out-of-sample performance\n",
"y_true = test['cobit_returns'].shift(-3).dropna()\n",
"y_pred = pd.Series(forecasts)\n",
"r2 = r2_score(y_true, y_pred)\n",
"print(f\"Out-of-sample R-squared: {r2:.4f}\")"
]
},
{
"cell_type": "markdown",
"id": "997c70ca",
"metadata": {},
"source": [
"# Autoregressive Models"
]
},
{
"cell_type": "code",
"execution_count": 87,
"id": "43638853",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import statsmodels.api as sm\n",
"from sklearn.metrics import r2_score, mean_absolute_error"
]
},
{
"cell_type": "code",
"execution_count": 88,
"id": "74d6056e",
"metadata": {},
"outputs": [],
"source": [
"# Importing the dataset\n",
"data = pd.read_csv('ftse100.csv')"
]
},
{
"cell_type": "code",
"execution_count": 89,
"id": "49ad1cde",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>#RIC</th>\n",
" <th>Domain</th>\n",
" <th>Date-Time</th>\n",
" <th>Type</th>\n",
" <th>Last</th>\n",
" <th>Volume</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-03-14T15:37:00.000000000Z</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>299.5</td>\n",
" <td>25000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-03-20T14:51:00.000000000Z</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>20.0</td>\n",
" <td>100000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-05-14T13:49:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>30.0</td>\n",
" <td>6000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-10-16T16:04:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>25.0</td>\n",
" <td>50000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-10-23T14:59:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>30.0</td>\n",
" <td>23000.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" #RIC Domain Date-Time Type \\\n",
"0 AAF.L Market Price 1996-03-14T15:37:00.000000000Z Intraday 1Min \n",
"1 AAF.L Market Price 1996-03-20T14:51:00.000000000Z Intraday 1Min \n",
"2 AAF.L Market Price 1996-05-14T13:49:00.000000000+01 Intraday 1Min \n",
"3 AAF.L Market Price 1996-10-16T16:04:00.000000000+01 Intraday 1Min \n",
"4 AAF.L Market Price 1996-10-23T14:59:00.000000000+01 Intraday 1Min \n",
"\n",
" Last Volume \n",
"0 299.5 25000.0 \n",
"1 20.0 100000.0 \n",
"2 30.0 6000.0 \n",
"3 25.0 50000.0 \n",
"4 30.0 23000.0 "
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 91,
"id": "a4a8b2ff",
"metadata": {},
"outputs": [],
"source": [
"# Calculate the cobit daily returns\n",
"data['returns'] = data['Last'].pct_change()"
]
},
{
"cell_type": "code",
"execution_count": 92,
"id": "bdc4e75e",
"metadata": {},
"outputs": [],
"source": [
"# create lagged return variables\n",
"data['lag_1'] = data['returns'].shift(1)\n",
"data['lag_2'] = data['returns'].shift(2)\n",
"data['lag_3'] = data['returns'].shift(3)"
]
},
{
"cell_type": "code",
"execution_count": 93,
"id": "9bbcc4c0",
"metadata": {},
"outputs": [],
"source": [
"#fill in missing values\n",
"data = data.fillna(method='ffill').fillna(method='bfill')"
]
},
{
"cell_type": "code",
"execution_count": 95,
"id": "e6652999",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(786431, 10)\n",
"(262144, 10)\n"
]
}
],
"source": [
"# Splitiing the X and y into train and test datasets\n",
"train = data[:split]\n",
"test = data[split:]\n",
"# Print the size of the train and test dataset\n",
"print(train.shape)\n",
"print(test.shape)"
]
},
{
"cell_type": "code",
"execution_count": 96,
"id": "39231b16",
"metadata": {},
"outputs": [],
"source": [
"# fit autoregressive model to training data\n",
"model = sm.OLS(train['returns'], train[['lag_1', 'lag_2', 'lag_3']])\n",
"results = model.fit()"
]
},
{
"cell_type": "code",
"execution_count": 97,
"id": "86af4e2d",
"metadata": {},
"outputs": [],
"source": [
"# make predictions on test data\n",
"predictions = results.predict(test_data[['lag_1', 'lag_2', 'lag_3']])"
]
},
{
"cell_type": "code",
"execution_count": 98,
"id": "cbd014f4",
"metadata": {},
"outputs": [],
"source": [
"# calculate R-squared and MAE\n",
"r2 = r2_score(test_data['returns'], predictions)\n",
"mae = mean_absolute_error(test_data['returns'], predictions)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"id": "dc4edc09",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"R-squared: -0.00981647420755305\n",
"MAE: 0.0014406552381127644\n"
]
}
],
"source": [
"print(\"R-squared:\", r2)\n",
"print(\"MAE:\", mae)"
]
},
{
"cell_type": "markdown",
"id": "18ee7074",
"metadata": {},
"source": [
"# AR(1)"
]
},
{
"cell_type": "code",
"execution_count": 101,
"id": "07958e49",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"R-squared: -0.00981647414726261\n",
"MAE: 0.001440655238121796\n"
]
}
],
"source": [
"# fit autoregressive model to training data\n",
"model = sm.OLS(train['returns'], train[['lag_1']])\n",
"results = model.fit()\n",
"# make predictions on test data\n",
"predictions = results.predict(test_data[['lag_1']])\n",
"# calculate R-squared and MAE\n",
"r2 = r2_score(test_data['returns'], predictions)\n",
"mae = mean_absolute_error(test_data['returns'], predictions)\n",
"print(\"R-squared:\", r2)\n",
"print(\"MAE:\", mae)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7e3e9e00",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment