Skip to content

Instantly share code, notes, and snippets.

@riqbal-k
Created April 19, 2023 12:45
Show Gist options
  • Save riqbal-k/59f32a8482e1b2739ba479dc1e051ff6 to your computer and use it in GitHub Desktop.
Save riqbal-k/59f32a8482e1b2739ba479dc1e051ff6 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 142,
"id": "1f0641fc",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_2403777/1036767244.py:10: FutureWarning: The pandas.datetime class is deprecated and will be removed from pandas in a future version. Import from datetime module instead.\n",
" from pandas import datetime\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import datetime as dt\n",
"import alpha_vantage as av\n",
"import statsmodels\n",
"from pytrends.request import TrendReq\n",
"from datetime import datetime\n",
"from alpha_vantage.timeseries import TimeSeries\n",
"from pandas import datetime\n",
"import math, time\n",
"import itertools\n",
"import datetime\n",
"from operator import itemgetter\n",
"from sklearn.metrics import mean_squared_error\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"from math import sqrt\n",
"import torch\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "2fb69b17",
"metadata": {},
"outputs": [],
"source": [
"# Import the os module\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "3a886f2b",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import cross_val_score\n",
"from sklearn.model_selection import RepeatedKFold\n",
"from sklearn.linear_model import Lasso"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ececf4bc",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import StandardScaler"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "0c585520",
"metadata": {},
"outputs": [],
"source": [
"import statsmodels.api as sm"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "6218f844",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'/home/mnf13'"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.getcwd()"
]
},
{
"cell_type": "code",
"execution_count": 159,
"id": "f6043974",
"metadata": {},
"outputs": [],
"source": [
"# Importing the dataset\n",
"data = pd.read_csv('ftse100.csv')"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "936c05ae",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>#RIC</th>\n",
" <th>Domain</th>\n",
" <th>Date-Time</th>\n",
" <th>Type</th>\n",
" <th>Last</th>\n",
" <th>Volume</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-03-14T15:37:00.000000000Z</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>299.5</td>\n",
" <td>25000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-03-20T14:51:00.000000000Z</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>20.0</td>\n",
" <td>100000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-05-14T13:49:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>30.0</td>\n",
" <td>6000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-10-16T16:04:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>25.0</td>\n",
" <td>50000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-10-23T14:59:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>30.0</td>\n",
" <td>23000.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" #RIC Domain Date-Time Type \\\n",
"0 AAF.L Market Price 1996-03-14T15:37:00.000000000Z Intraday 1Min \n",
"1 AAF.L Market Price 1996-03-20T14:51:00.000000000Z Intraday 1Min \n",
"2 AAF.L Market Price 1996-05-14T13:49:00.000000000+01 Intraday 1Min \n",
"3 AAF.L Market Price 1996-10-16T16:04:00.000000000+01 Intraday 1Min \n",
"4 AAF.L Market Price 1996-10-23T14:59:00.000000000+01 Intraday 1Min \n",
"\n",
" Last Volume \n",
"0 299.5 25000.0 \n",
"1 20.0 100000.0 \n",
"2 30.0 6000.0 \n",
"3 25.0 50000.0 \n",
"4 30.0 23000.0 "
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 164,
"id": "19db7407",
"metadata": {},
"outputs": [],
"source": [
"# convert Date-Time to datetime and set as index\n",
"data['Date-Time'] = pd.to_datetime(data['Date-Time'], format='%Y-%m-%d %H:%M:%S')\n",
"data.set_index('Date-Time', inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 160,
"id": "83726341",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>#RIC</th>\n",
" <th>Domain</th>\n",
" <th>Date-Time</th>\n",
" <th>Type</th>\n",
" <th>Last</th>\n",
" <th>Volume</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-03-14T15:37:00.000000000Z</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>299.5</td>\n",
" <td>25000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-03-20T14:51:00.000000000Z</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>20.0</td>\n",
" <td>100000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-05-14T13:49:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>30.0</td>\n",
" <td>6000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-10-16T16:04:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>25.0</td>\n",
" <td>50000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-10-23T14:59:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>30.0</td>\n",
" <td>23000.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" #RIC Domain Date-Time Type \\\n",
"0 AAF.L Market Price 1996-03-14T15:37:00.000000000Z Intraday 1Min \n",
"1 AAF.L Market Price 1996-03-20T14:51:00.000000000Z Intraday 1Min \n",
"2 AAF.L Market Price 1996-05-14T13:49:00.000000000+01 Intraday 1Min \n",
"3 AAF.L Market Price 1996-10-16T16:04:00.000000000+01 Intraday 1Min \n",
"4 AAF.L Market Price 1996-10-23T14:59:00.000000000+01 Intraday 1Min \n",
"\n",
" Last Volume \n",
"0 299.5 25000.0 \n",
"1 20.0 100000.0 \n",
"2 30.0 6000.0 \n",
"3 25.0 50000.0 \n",
"4 30.0 23000.0 "
]
},
"execution_count": 160,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 161,
"id": "54370b6d",
"metadata": {},
"outputs": [],
"source": [
"# Calculate the daily returns\n",
"data['cobit_returns'] = data['Last'].pct_change()"
]
},
{
"cell_type": "code",
"execution_count": 162,
"id": "25c13cc3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>#RIC</th>\n",
" <th>Domain</th>\n",
" <th>Date-Time</th>\n",
" <th>Type</th>\n",
" <th>Last</th>\n",
" <th>Volume</th>\n",
" <th>cobit_returns</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-03-14T15:37:00.000000000Z</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>299.5</td>\n",
" <td>25000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-03-20T14:51:00.000000000Z</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>20.0</td>\n",
" <td>100000.0</td>\n",
" <td>-0.933222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-05-14T13:49:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>30.0</td>\n",
" <td>6000.0</td>\n",
" <td>0.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-10-16T16:04:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>25.0</td>\n",
" <td>50000.0</td>\n",
" <td>-0.166667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-10-23T14:59:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>30.0</td>\n",
" <td>23000.0</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1048570</th>\n",
" <td>AAL.L</td>\n",
" <td>Market Price</td>\n",
" <td>2009-04-06T14:21:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>1332.0</td>\n",
" <td>16166.0</td>\n",
" <td>-0.001499</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1048571</th>\n",
" <td>AAL.L</td>\n",
" <td>Market Price</td>\n",
" <td>2009-04-06T14:22:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>1330.0</td>\n",
" <td>19021.0</td>\n",
" <td>-0.001502</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1048572</th>\n",
" <td>AAL.L</td>\n",
" <td>Market Price</td>\n",
" <td>2009-04-06T14:23:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>1332.0</td>\n",
" <td>45759.0</td>\n",
" <td>0.001504</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1048573</th>\n",
" <td>AAL.L</td>\n",
" <td>Market Price</td>\n",
" <td>2009-04-06T14:24:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>1331.0</td>\n",
" <td>13481.0</td>\n",
" <td>-0.000751</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1048574</th>\n",
" <td>AAL.L</td>\n",
" <td>Market Price</td>\n",
" <td>2009-04-06T14:25:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>1333.0</td>\n",
" <td>16301.0</td>\n",
" <td>0.001503</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1048575 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" #RIC Domain Date-Time Type \\\n",
"0 AAF.L Market Price 1996-03-14T15:37:00.000000000Z Intraday 1Min \n",
"1 AAF.L Market Price 1996-03-20T14:51:00.000000000Z Intraday 1Min \n",
"2 AAF.L Market Price 1996-05-14T13:49:00.000000000+01 Intraday 1Min \n",
"3 AAF.L Market Price 1996-10-16T16:04:00.000000000+01 Intraday 1Min \n",
"4 AAF.L Market Price 1996-10-23T14:59:00.000000000+01 Intraday 1Min \n",
"... ... ... ... ... \n",
"1048570 AAL.L Market Price 2009-04-06T14:21:00.000000000+01 Intraday 1Min \n",
"1048571 AAL.L Market Price 2009-04-06T14:22:00.000000000+01 Intraday 1Min \n",
"1048572 AAL.L Market Price 2009-04-06T14:23:00.000000000+01 Intraday 1Min \n",
"1048573 AAL.L Market Price 2009-04-06T14:24:00.000000000+01 Intraday 1Min \n",
"1048574 AAL.L Market Price 2009-04-06T14:25:00.000000000+01 Intraday 1Min \n",
"\n",
" Last Volume cobit_returns \n",
"0 299.5 25000.0 NaN \n",
"1 20.0 100000.0 -0.933222 \n",
"2 30.0 6000.0 0.500000 \n",
"3 25.0 50000.0 -0.166667 \n",
"4 30.0 23000.0 0.200000 \n",
"... ... ... ... \n",
"1048570 1332.0 16166.0 -0.001499 \n",
"1048571 1330.0 19021.0 -0.001502 \n",
"1048572 1332.0 45759.0 0.001504 \n",
"1048573 1331.0 13481.0 -0.000751 \n",
"1048574 1333.0 16301.0 0.001503 \n",
"\n",
"[1048575 rows x 7 columns]"
]
},
"execution_count": 162,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data"
]
},
{
"cell_type": "markdown",
"id": "bb16d1be",
"metadata": {},
"source": [
"# Linear Regression for one mintue a head predictions"
]
},
{
"cell_type": "code",
"execution_count": 166,
"id": "b7c4b06f",
"metadata": {},
"outputs": [],
"source": [
"# Create lagged versions of the COBIT returns\n",
"data['cobit_lag1'] = data['cobit_returns'].shift(1)\n",
"data['cobit_lag2'] = data['cobit_returns'].shift(2)\n",
"data['cobit_lag3'] = data['cobit_returns'].shift(3)"
]
},
{
"cell_type": "code",
"execution_count": 167,
"id": "d5882034",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>#RIC</th>\n",
" <th>Domain</th>\n",
" <th>Type</th>\n",
" <th>Last</th>\n",
" <th>Volume</th>\n",
" <th>cobit_returns</th>\n",
" <th>cobit_lag1</th>\n",
" <th>cobit_lag2</th>\n",
" <th>cobit_lag3</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Date-Time</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1996-03-14 15:37:00+00:00</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>299.5</td>\n",
" <td>25000.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1996-03-20 14:51:00+00:00</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>20.0</td>\n",
" <td>100000.0</td>\n",
" <td>-0.933222</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1996-05-14 13:49:00+01:00</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>30.0</td>\n",
" <td>6000.0</td>\n",
" <td>0.500000</td>\n",
" <td>-0.933222</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1996-10-16 16:04:00+01:00</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>25.0</td>\n",
" <td>50000.0</td>\n",
" <td>-0.166667</td>\n",
" <td>0.500000</td>\n",
" <td>-0.933222</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1996-10-23 14:59:00+01:00</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>30.0</td>\n",
" <td>23000.0</td>\n",
" <td>0.200000</td>\n",
" <td>-0.166667</td>\n",
" <td>0.500000</td>\n",
" <td>-0.933222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-04-06 14:21:00+01:00</th>\n",
" <td>AAL.L</td>\n",
" <td>Market Price</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>1332.0</td>\n",
" <td>16166.0</td>\n",
" <td>-0.001499</td>\n",
" <td>-0.004478</td>\n",
" <td>-0.002976</td>\n",
" <td>0.005988</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-04-06 14:22:00+01:00</th>\n",
" <td>AAL.L</td>\n",
" <td>Market Price</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>1330.0</td>\n",
" <td>19021.0</td>\n",
" <td>-0.001502</td>\n",
" <td>-0.001499</td>\n",
" <td>-0.004478</td>\n",
" <td>-0.002976</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-04-06 14:23:00+01:00</th>\n",
" <td>AAL.L</td>\n",
" <td>Market Price</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>1332.0</td>\n",
" <td>45759.0</td>\n",
" <td>0.001504</td>\n",
" <td>-0.001502</td>\n",
" <td>-0.001499</td>\n",
" <td>-0.004478</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-04-06 14:24:00+01:00</th>\n",
" <td>AAL.L</td>\n",
" <td>Market Price</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>1331.0</td>\n",
" <td>13481.0</td>\n",
" <td>-0.000751</td>\n",
" <td>0.001504</td>\n",
" <td>-0.001502</td>\n",
" <td>-0.001499</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-04-06 14:25:00+01:00</th>\n",
" <td>AAL.L</td>\n",
" <td>Market Price</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>1333.0</td>\n",
" <td>16301.0</td>\n",
" <td>0.001503</td>\n",
" <td>-0.000751</td>\n",
" <td>0.001504</td>\n",
" <td>-0.001502</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1048575 rows × 9 columns</p>\n",
"</div>"
],
"text/plain": [
" #RIC Domain Type Last \\\n",
"Date-Time \n",
"1996-03-14 15:37:00+00:00 AAF.L Market Price Intraday 1Min 299.5 \n",
"1996-03-20 14:51:00+00:00 AAF.L Market Price Intraday 1Min 20.0 \n",
"1996-05-14 13:49:00+01:00 AAF.L Market Price Intraday 1Min 30.0 \n",
"1996-10-16 16:04:00+01:00 AAF.L Market Price Intraday 1Min 25.0 \n",
"1996-10-23 14:59:00+01:00 AAF.L Market Price Intraday 1Min 30.0 \n",
"... ... ... ... ... \n",
"2009-04-06 14:21:00+01:00 AAL.L Market Price Intraday 1Min 1332.0 \n",
"2009-04-06 14:22:00+01:00 AAL.L Market Price Intraday 1Min 1330.0 \n",
"2009-04-06 14:23:00+01:00 AAL.L Market Price Intraday 1Min 1332.0 \n",
"2009-04-06 14:24:00+01:00 AAL.L Market Price Intraday 1Min 1331.0 \n",
"2009-04-06 14:25:00+01:00 AAL.L Market Price Intraday 1Min 1333.0 \n",
"\n",
" Volume cobit_returns cobit_lag1 cobit_lag2 \\\n",
"Date-Time \n",
"1996-03-14 15:37:00+00:00 25000.0 NaN NaN NaN \n",
"1996-03-20 14:51:00+00:00 100000.0 -0.933222 NaN NaN \n",
"1996-05-14 13:49:00+01:00 6000.0 0.500000 -0.933222 NaN \n",
"1996-10-16 16:04:00+01:00 50000.0 -0.166667 0.500000 -0.933222 \n",
"1996-10-23 14:59:00+01:00 23000.0 0.200000 -0.166667 0.500000 \n",
"... ... ... ... ... \n",
"2009-04-06 14:21:00+01:00 16166.0 -0.001499 -0.004478 -0.002976 \n",
"2009-04-06 14:22:00+01:00 19021.0 -0.001502 -0.001499 -0.004478 \n",
"2009-04-06 14:23:00+01:00 45759.0 0.001504 -0.001502 -0.001499 \n",
"2009-04-06 14:24:00+01:00 13481.0 -0.000751 0.001504 -0.001502 \n",
"2009-04-06 14:25:00+01:00 16301.0 0.001503 -0.000751 0.001504 \n",
"\n",
" cobit_lag3 \n",
"Date-Time \n",
"1996-03-14 15:37:00+00:00 NaN \n",
"1996-03-20 14:51:00+00:00 NaN \n",
"1996-05-14 13:49:00+01:00 NaN \n",
"1996-10-16 16:04:00+01:00 NaN \n",
"1996-10-23 14:59:00+01:00 -0.933222 \n",
"... ... \n",
"2009-04-06 14:21:00+01:00 0.005988 \n",
"2009-04-06 14:22:00+01:00 -0.002976 \n",
"2009-04-06 14:23:00+01:00 -0.004478 \n",
"2009-04-06 14:24:00+01:00 -0.001499 \n",
"2009-04-06 14:25:00+01:00 -0.001502 \n",
"\n",
"[1048575 rows x 9 columns]"
]
},
"execution_count": 167,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data"
]
},
{
"cell_type": "code",
"execution_count": 99,
"id": "dcc3863c",
"metadata": {},
"outputs": [],
"source": [
"data = data[[\"cobit_returns\",\"cobit_lag1\",\"cobit_lag2\",\"cobit_lag3\"]]"
]
},
{
"cell_type": "code",
"execution_count": 108,
"id": "b1f44743",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>cobit_returns</th>\n",
" <th>cobit_lag1</th>\n",
" <th>cobit_lag2</th>\n",
" <th>cobit_lag3</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Date-Time</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1996-03-14 15:37:00+00:00</th>\n",
" <td>-0.933222</td>\n",
" <td>-0.933222</td>\n",
" <td>-0.933222</td>\n",
" <td>-0.933222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1996-03-20 14:51:00+00:00</th>\n",
" <td>-0.933222</td>\n",
" <td>-0.933222</td>\n",
" <td>-0.933222</td>\n",
" <td>-0.933222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1996-05-14 13:49:00+01:00</th>\n",
" <td>0.500000</td>\n",
" <td>-0.933222</td>\n",
" <td>-0.933222</td>\n",
" <td>-0.933222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1996-10-16 16:04:00+01:00</th>\n",
" <td>-0.166667</td>\n",
" <td>0.500000</td>\n",
" <td>-0.933222</td>\n",
" <td>-0.933222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1996-10-23 14:59:00+01:00</th>\n",
" <td>0.200000</td>\n",
" <td>-0.166667</td>\n",
" <td>0.500000</td>\n",
" <td>-0.933222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-04-06 14:21:00+01:00</th>\n",
" <td>-0.001499</td>\n",
" <td>-0.004478</td>\n",
" <td>-0.002976</td>\n",
" <td>0.005988</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-04-06 14:22:00+01:00</th>\n",
" <td>-0.001502</td>\n",
" <td>-0.001499</td>\n",
" <td>-0.004478</td>\n",
" <td>-0.002976</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-04-06 14:23:00+01:00</th>\n",
" <td>0.001504</td>\n",
" <td>-0.001502</td>\n",
" <td>-0.001499</td>\n",
" <td>-0.004478</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-04-06 14:24:00+01:00</th>\n",
" <td>-0.000751</td>\n",
" <td>0.001504</td>\n",
" <td>-0.001502</td>\n",
" <td>-0.001499</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-04-06 14:25:00+01:00</th>\n",
" <td>0.001503</td>\n",
" <td>-0.000751</td>\n",
" <td>0.001504</td>\n",
" <td>-0.001502</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1048575 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" cobit_returns cobit_lag1 cobit_lag2 cobit_lag3\n",
"Date-Time \n",
"1996-03-14 15:37:00+00:00 -0.933222 -0.933222 -0.933222 -0.933222\n",
"1996-03-20 14:51:00+00:00 -0.933222 -0.933222 -0.933222 -0.933222\n",
"1996-05-14 13:49:00+01:00 0.500000 -0.933222 -0.933222 -0.933222\n",
"1996-10-16 16:04:00+01:00 -0.166667 0.500000 -0.933222 -0.933222\n",
"1996-10-23 14:59:00+01:00 0.200000 -0.166667 0.500000 -0.933222\n",
"... ... ... ... ...\n",
"2009-04-06 14:21:00+01:00 -0.001499 -0.004478 -0.002976 0.005988\n",
"2009-04-06 14:22:00+01:00 -0.001502 -0.001499 -0.004478 -0.002976\n",
"2009-04-06 14:23:00+01:00 0.001504 -0.001502 -0.001499 -0.004478\n",
"2009-04-06 14:24:00+01:00 -0.000751 0.001504 -0.001502 -0.001499\n",
"2009-04-06 14:25:00+01:00 0.001503 -0.000751 0.001504 -0.001502\n",
"\n",
"[1048575 rows x 4 columns]"
]
},
"execution_count": 108,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data"
]
},
{
"cell_type": "code",
"execution_count": 109,
"id": "934a2acd",
"metadata": {},
"outputs": [],
"source": [
"data = pd.DataFrame(data)"
]
},
{
"cell_type": "code",
"execution_count": 110,
"id": "3f06a894",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>cobit_returns</th>\n",
" <th>cobit_lag1</th>\n",
" <th>cobit_lag2</th>\n",
" <th>cobit_lag3</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Date-Time</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1996-03-14 15:37:00+00:00</th>\n",
" <td>-0.933222</td>\n",
" <td>-0.933222</td>\n",
" <td>-0.933222</td>\n",
" <td>-0.933222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1996-03-20 14:51:00+00:00</th>\n",
" <td>-0.933222</td>\n",
" <td>-0.933222</td>\n",
" <td>-0.933222</td>\n",
" <td>-0.933222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1996-05-14 13:49:00+01:00</th>\n",
" <td>0.500000</td>\n",
" <td>-0.933222</td>\n",
" <td>-0.933222</td>\n",
" <td>-0.933222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1996-10-16 16:04:00+01:00</th>\n",
" <td>-0.166667</td>\n",
" <td>0.500000</td>\n",
" <td>-0.933222</td>\n",
" <td>-0.933222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1996-10-23 14:59:00+01:00</th>\n",
" <td>0.200000</td>\n",
" <td>-0.166667</td>\n",
" <td>0.500000</td>\n",
" <td>-0.933222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-04-06 14:21:00+01:00</th>\n",
" <td>-0.001499</td>\n",
" <td>-0.004478</td>\n",
" <td>-0.002976</td>\n",
" <td>0.005988</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-04-06 14:22:00+01:00</th>\n",
" <td>-0.001502</td>\n",
" <td>-0.001499</td>\n",
" <td>-0.004478</td>\n",
" <td>-0.002976</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-04-06 14:23:00+01:00</th>\n",
" <td>0.001504</td>\n",
" <td>-0.001502</td>\n",
" <td>-0.001499</td>\n",
" <td>-0.004478</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-04-06 14:24:00+01:00</th>\n",
" <td>-0.000751</td>\n",
" <td>0.001504</td>\n",
" <td>-0.001502</td>\n",
" <td>-0.001499</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-04-06 14:25:00+01:00</th>\n",
" <td>0.001503</td>\n",
" <td>-0.000751</td>\n",
" <td>0.001504</td>\n",
" <td>-0.001502</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1048575 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" cobit_returns cobit_lag1 cobit_lag2 cobit_lag3\n",
"Date-Time \n",
"1996-03-14 15:37:00+00:00 -0.933222 -0.933222 -0.933222 -0.933222\n",
"1996-03-20 14:51:00+00:00 -0.933222 -0.933222 -0.933222 -0.933222\n",
"1996-05-14 13:49:00+01:00 0.500000 -0.933222 -0.933222 -0.933222\n",
"1996-10-16 16:04:00+01:00 -0.166667 0.500000 -0.933222 -0.933222\n",
"1996-10-23 14:59:00+01:00 0.200000 -0.166667 0.500000 -0.933222\n",
"... ... ... ... ...\n",
"2009-04-06 14:21:00+01:00 -0.001499 -0.004478 -0.002976 0.005988\n",
"2009-04-06 14:22:00+01:00 -0.001502 -0.001499 -0.004478 -0.002976\n",
"2009-04-06 14:23:00+01:00 0.001504 -0.001502 -0.001499 -0.004478\n",
"2009-04-06 14:24:00+01:00 -0.000751 0.001504 -0.001502 -0.001499\n",
"2009-04-06 14:25:00+01:00 0.001503 -0.000751 0.001504 -0.001502\n",
"\n",
"[1048575 rows x 4 columns]"
]
},
"execution_count": 110,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data"
]
},
{
"cell_type": "code",
"execution_count": 101,
"id": "024fd39a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"cobit_returns 1\n",
"cobit_lag1 2\n",
"cobit_lag2 3\n",
"cobit_lag3 4\n",
"dtype: int64"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check for invalid values\n",
"data.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 102,
"id": "2a50cd93",
"metadata": {},
"outputs": [],
"source": [
"data = data.fillna(method='ffill').fillna(method='bfill')\n"
]
},
{
"cell_type": "code",
"execution_count": 103,
"id": "ab0f28c5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cobit_returns 0\n",
"cobit_lag1 0\n",
"cobit_lag2 0\n",
"cobit_lag3 0\n",
"dtype: int64\n"
]
}
],
"source": [
"# Check for invalid values\n",
"print(np.isnan(data).sum())"
]
},
{
"cell_type": "code",
"execution_count": 104,
"id": "3514c850",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"786431"
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"##Split data into train and test\n",
"# Total dataset length\n",
"dataset_length = data.shape[0]\n",
"# Training dataset length\n",
"split = int(dataset_length * 0.75)\n",
"split\n"
]
},
{
"cell_type": "code",
"execution_count": 106,
"id": "dc5e63d9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(786431, 4)\n",
"(262144, 4)\n"
]
}
],
"source": [
"# Splitiing the X and y into train and test datasets\n",
"train = data[:split]\n",
"test = data[split:]\n",
"# Print the size of the train and test dataset\n",
"print(train.shape)\n",
"print(test.shape)"
]
},
{
"cell_type": "code",
"execution_count": 111,
"id": "a9f6c983",
"metadata": {},
"outputs": [],
"source": [
"#create x and y variables for both test and train data\n",
"x_train = train.drop('cobit_returns', axis=1)\n",
"y_train = train['cobit_returns']\n",
"x_valid = test.drop('cobit_returns', axis=1)\n",
"y_valid = test['cobit_returns']"
]
},
{
"cell_type": "code",
"execution_count": 113,
"id": "a6f4cce0",
"metadata": {},
"outputs": [],
"source": [
"# Fit the OLS model on the training data\n",
"model = OLS(y_train, x_train).fit()"
]
},
{
"cell_type": "code",
"execution_count": 114,
"id": "851f4b1d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" OLS Regression Results \n",
"=======================================================================================\n",
"Dep. Variable: cobit_returns R-squared (uncentered): 0.000\n",
"Model: OLS Adj. R-squared (uncentered): -0.000\n",
"Method: Least Squares F-statistic: 1.103e-07\n",
"Date: Thu, 13 Apr 2023 Prob (F-statistic): 1.00\n",
"Time: 23:10:19 Log-Likelihood: -7.4506e+06\n",
"No. Observations: 786431 AIC: 1.490e+07\n",
"Df Residuals: 786428 BIC: 1.490e+07\n",
"Df Model: 3 \n",
"Covariance Type: nonrobust \n",
"==============================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"cobit_lag1 -6.485e-07 0.001 -0.001 1.000 -0.002 0.002\n",
"cobit_lag2 2.135e-10 0.001 1.89e-07 1.000 -0.002 0.002\n",
"cobit_lag3 9.858e-10 0.001 8.74e-07 1.000 -0.002 0.002\n",
"================================================================================\n",
"Omnibus: 5291579.778 Durbin-Watson: 2.000\n",
"Prob(Omnibus): 0.000 Jarque-Bera (JB): 7898864025013409.000\n",
"Skew: 666.114 Prob(JB): 0.00\n",
"Kurtosis: 490974.213 Cond. No. 1.00\n",
"================================================================================\n",
"\n",
"Notes:\n",
"[1] R² is computed without centering (uncentered) since the model does not contain a constant.\n",
"[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
]
}
],
"source": [
"# Print a summary of the model\n",
"print(model.summary())"
]
},
{
"cell_type": "markdown",
"id": "376b0f72",
"metadata": {},
"source": [
"# LASSO Model for predictions"
]
},
{
"cell_type": "code",
"execution_count": 143,
"id": "38341102",
"metadata": {},
"outputs": [],
"source": [
"# Importing the dataset\n",
"data = pd.read_csv('ftse100.csv')"
]
},
{
"cell_type": "code",
"execution_count": 144,
"id": "5be2e40c",
"metadata": {},
"outputs": [],
"source": [
"# calculate the returns\n",
"data['return'] = data['Last'].pct_change()"
]
},
{
"cell_type": "code",
"execution_count": 145,
"id": "57a55326",
"metadata": {},
"outputs": [],
"source": [
"data.dropna(inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 146,
"id": "6e3e4804",
"metadata": {},
"outputs": [],
"source": [
"# split data into train and test sets\n",
"train_size = int(len(data) * 0.8)\n",
"train = data.iloc[:train_size]\n",
"test = data.iloc[train_size:]"
]
},
{
"cell_type": "code",
"execution_count": 147,
"id": "931378ec",
"metadata": {},
"outputs": [],
"source": [
"# create rolling window features for train set\n",
"window_size = 30 # 30 minutes\n",
"X_train = []\n",
"y_train = []\n",
"for i in range(window_size, len(train)):\n",
" X_train.append(train.iloc[i-window_size:i]['return'].values)\n",
" y_train.append(train.iloc[i]['Last'])\n",
"X_train, y_train = np.array(X_train), np.array(y_train)"
]
},
{
"cell_type": "code",
"execution_count": 149,
"id": "56ad86c4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Lasso(alpha=0.1)"
]
},
"execution_count": 149,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Create the Lasso model\n",
"lasso = Lasso(alpha=0.1)\n",
"\n",
"# Train the model\n",
"lasso.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 150,
"id": "89bd1ab1",
"metadata": {},
"outputs": [],
"source": [
"# create rolling window features for test set\n",
"window_size = 30 # 3 minutes\n",
"X_test = []\n",
"y_test = []\n",
"for i in range(window_size, len(test)):\n",
" X_test.append(test.iloc[i-window_size:i]['return'].values)\n",
" y_test.append(test.iloc[i]['Last'])\n",
"X_test, y_test = np.array(X_test), np.array(y_test)"
]
},
{
"cell_type": "code",
"execution_count": 153,
"id": "305a576c",
"metadata": {},
"outputs": [],
"source": [
"# make predictions for test set\n",
"y_pred = lasso.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 152,
"id": "c95a0323",
"metadata": {},
"outputs": [],
"source": [
"# calculate the mean absolute error\n",
"mae = np.mean(np.abs(y_test - y_pred))"
]
},
{
"cell_type": "code",
"execution_count": 131,
"id": "20ff5901",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1132.848751805076"
]
},
"execution_count": 131,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mae"
]
},
{
"cell_type": "code",
"execution_count": 160,
"id": "e615c7e1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mean absolute error: 1132.87\n"
]
}
],
"source": [
"print(f'Mean absolute error: {mae:.2f}')"
]
},
{
"cell_type": "code",
"execution_count": 156,
"id": "d63a9b72",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import r2_score"
]
},
{
"cell_type": "code",
"execution_count": 157,
"id": "20e82abc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Lasso(alpha=0.1)\n",
"r^2 on test data : -0.410549\n"
]
}
],
"source": [
"r2_score_lasso = r2_score(y_test,y_pred)\n",
"print(lasso)\n",
"print(\"r^2 on test data : %f\" % r2_score_lasso)"
]
},
{
"cell_type": "markdown",
"id": "3e4314d4",
"metadata": {},
"source": [
"## another way of doing LESSO"
]
},
{
"cell_type": "code",
"execution_count": 171,
"id": "2e877a6c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Lasso(alpha=0.1)"
]
},
"execution_count": 171,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Train the model\n",
"lasso.fit(X_train, y_train)\n"
]
},
{
"cell_type": "code",
"execution_count": 174,
"id": "b688bc93",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Lasso(alpha=0.1)"
]
},
"execution_count": 174,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit the Lasso model\n",
"lasso_model = Lasso(alpha=0.1)\n",
"lasso_model.fit(X_train, y_train)\n"
]
},
{
"cell_type": "code",
"execution_count": 175,
"id": "285f6dfa",
"metadata": {},
"outputs": [],
"source": [
"# Get the estimated coefficients\n",
"coefficients = lasso_model.coef_"
]
},
{
"cell_type": "code",
"execution_count": 177,
"id": "b3709b52",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0.00000000e+00, 5.27432406e-05, 5.21421806e-05, 5.37356777e-05])"
]
},
"execution_count": 177,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"coefficients"
]
},
{
"cell_type": "code",
"execution_count": 178,
"id": "caa16594",
"metadata": {},
"outputs": [],
"source": [
"# Get the feature names from the original dataframe\n",
"feature_names = df.columns[:-1]\n",
"\n",
"# Pair feature names with their coefficients\n",
"coef_pairs = zip(feature_names, lasso_model.coef_)"
]
},
{
"cell_type": "code",
"execution_count": 179,
"id": "20cd034b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"#RIC: 0.0\n",
"Domain: 5.2743240560278315e-05\n",
"Date-Time: 5.2142180572714973e-05\n",
"Type: 5.3735677737564e-05\n"
]
}
],
"source": [
"# Print the coefficients\n",
"for feature, coef in coef_pairs:\n",
" print(f\"{feature}: {coef}\")"
]
},
{
"cell_type": "code",
"execution_count": 180,
"id": "3bd0c430",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0.00000000e+00 5.27432406e-05 5.21421806e-05 5.37356777e-05]\n"
]
}
],
"source": [
"# Get estimated coefficients\n",
"coef = lasso.coef_\n",
"print(coef)"
]
},
{
"cell_type": "markdown",
"id": "18ecdd68",
"metadata": {},
"source": [
"# Out of sample forecast-Benchmark-Lasso"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "a624d03d",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.linear_model import LinearRegression, LassoCV\n",
"from sklearn.metrics import r2_score"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "5a2f2436",
"metadata": {},
"outputs": [],
"source": [
"# Importing the dataset\n",
"data = pd.read_csv('ftse100.csv')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "e04e4311",
"metadata": {},
"outputs": [],
"source": [
"# Define the time window and lag parameters\n",
"est_window = 30 # minutes\n",
"pred_window = 3 # minutes\n",
"lags = 3 # number of lags to include in OLS regression"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "47620047",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>#RIC</th>\n",
" <th>Domain</th>\n",
" <th>Date-Time</th>\n",
" <th>Type</th>\n",
" <th>Last</th>\n",
" <th>Volume</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-03-14T15:37:00.000000000Z</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>299.5</td>\n",
" <td>25000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-03-20T14:51:00.000000000Z</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>20.0</td>\n",
" <td>100000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-05-14T13:49:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>30.0</td>\n",
" <td>6000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-10-16T16:04:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>25.0</td>\n",
" <td>50000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-10-23T14:59:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>30.0</td>\n",
" <td>23000.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" #RIC Domain Date-Time Type \\\n",
"0 AAF.L Market Price 1996-03-14T15:37:00.000000000Z Intraday 1Min \n",
"1 AAF.L Market Price 1996-03-20T14:51:00.000000000Z Intraday 1Min \n",
"2 AAF.L Market Price 1996-05-14T13:49:00.000000000+01 Intraday 1Min \n",
"3 AAF.L Market Price 1996-10-16T16:04:00.000000000+01 Intraday 1Min \n",
"4 AAF.L Market Price 1996-10-23T14:59:00.000000000+01 Intraday 1Min \n",
"\n",
" Last Volume \n",
"0 299.5 25000.0 \n",
"1 20.0 100000.0 \n",
"2 30.0 6000.0 \n",
"3 25.0 50000.0 \n",
"4 30.0 23000.0 "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "85b54311",
"metadata": {},
"outputs": [],
"source": [
"# Calculate the daily returns\n",
"data['returns'] = data['Last'].pct_change()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "951f1cf0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>#RIC</th>\n",
" <th>Domain</th>\n",
" <th>Date-Time</th>\n",
" <th>Type</th>\n",
" <th>Last</th>\n",
" <th>Volume</th>\n",
" <th>returns</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-03-14T15:37:00.000000000Z</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>299.5</td>\n",
" <td>25000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-03-20T14:51:00.000000000Z</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>20.0</td>\n",
" <td>100000.0</td>\n",
" <td>-0.933222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-05-14T13:49:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>30.0</td>\n",
" <td>6000.0</td>\n",
" <td>0.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-10-16T16:04:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>25.0</td>\n",
" <td>50000.0</td>\n",
" <td>-0.166667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-10-23T14:59:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>30.0</td>\n",
" <td>23000.0</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" #RIC Domain Date-Time Type \\\n",
"0 AAF.L Market Price 1996-03-14T15:37:00.000000000Z Intraday 1Min \n",
"1 AAF.L Market Price 1996-03-20T14:51:00.000000000Z Intraday 1Min \n",
"2 AAF.L Market Price 1996-05-14T13:49:00.000000000+01 Intraday 1Min \n",
"3 AAF.L Market Price 1996-10-16T16:04:00.000000000+01 Intraday 1Min \n",
"4 AAF.L Market Price 1996-10-23T14:59:00.000000000+01 Intraday 1Min \n",
"\n",
" Last Volume returns \n",
"0 299.5 25000.0 NaN \n",
"1 20.0 100000.0 -0.933222 \n",
"2 30.0 6000.0 0.500000 \n",
"3 25.0 50000.0 -0.166667 \n",
"4 30.0 23000.0 0.200000 "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "6af6ebda",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"786431"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"##Split data into train and test\n",
"# Total dataset length\n",
"dataset_length = data.shape[0]\n",
"# Training dataset length\n",
"split = int(dataset_length * 0.75)\n",
"split"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "1bf2a7c0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(786431, 7)\n",
"(262144, 7)\n"
]
}
],
"source": [
"# Splitiing the X and y into train and test datasets\n",
"train = data[:split]\n",
"test = data[split:]\n",
"# Print the size of the train and test dataset\n",
"print(train.shape)\n",
"print(test.shape)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "fdcbfe4e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>#RIC</th>\n",
" <th>Domain</th>\n",
" <th>Date-Time</th>\n",
" <th>Type</th>\n",
" <th>Last</th>\n",
" <th>Volume</th>\n",
" <th>returns</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-03-14T15:37:00.000000000Z</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>299.5</td>\n",
" <td>25000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-03-20T14:51:00.000000000Z</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>20.0</td>\n",
" <td>100000.0</td>\n",
" <td>-0.933222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-05-14T13:49:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>30.0</td>\n",
" <td>6000.0</td>\n",
" <td>0.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-10-16T16:04:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>25.0</td>\n",
" <td>50000.0</td>\n",
" <td>-0.166667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AAF.L</td>\n",
" <td>Market Price</td>\n",
" <td>1996-10-23T14:59:00.000000000+01</td>\n",
" <td>Intraday 1Min</td>\n",
" <td>30.0</td>\n",
" <td>23000.0</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" #RIC Domain Date-Time Type \\\n",
"0 AAF.L Market Price 1996-03-14T15:37:00.000000000Z Intraday 1Min \n",
"1 AAF.L Market Price 1996-03-20T14:51:00.000000000Z Intraday 1Min \n",
"2 AAF.L Market Price 1996-05-14T13:49:00.000000000+01 Intraday 1Min \n",
"3 AAF.L Market Price 1996-10-16T16:04:00.000000000+01 Intraday 1Min \n",
"4 AAF.L Market Price 1996-10-23T14:59:00.000000000+01 Intraday 1Min \n",
"\n",
" Last Volume returns \n",
"0 299.5 25000.0 NaN \n",
"1 20.0 100000.0 -0.933222 \n",
"2 30.0 6000.0 0.500000 \n",
"3 25.0 50000.0 -0.166667 \n",
"4 30.0 23000.0 0.200000 "
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.head()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "68e70619",
"metadata": {},
"outputs": [],
"source": [
"# Create the OLS predictors matrix for the training data\n",
"ols_predictors = []\n",
"for i in range(lags):\n",
" ols_predictors.append(train['returns'].shift(i+1))\n",
"ols_predictors = pd.concat(ols_predictors, axis=1)\n",
"ols_predictors = ols_predictors.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "df7f15f4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>returns</th>\n",
" <th>returns</th>\n",
" <th>returns</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>-0.166667</td>\n",
" <td>0.500000</td>\n",
" <td>-0.933222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.200000</td>\n",
" <td>-0.166667</td>\n",
" <td>0.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>-0.100000</td>\n",
" <td>0.200000</td>\n",
" <td>-0.166667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>0.018519</td>\n",
" <td>-0.100000</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>-0.054545</td>\n",
" <td>0.018519</td>\n",
" <td>-0.100000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>786426</th>\n",
" <td>0.002088</td>\n",
" <td>-0.001584</td>\n",
" <td>0.000334</td>\n",
" </tr>\n",
" <tr>\n",
" <th>786427</th>\n",
" <td>-0.000417</td>\n",
" <td>0.002088</td>\n",
" <td>-0.001584</td>\n",
" </tr>\n",
" <tr>\n",
" <th>786428</th>\n",
" <td>-0.000417</td>\n",
" <td>-0.000417</td>\n",
" <td>0.002088</td>\n",
" </tr>\n",
" <tr>\n",
" <th>786429</th>\n",
" <td>-0.000417</td>\n",
" <td>-0.000417</td>\n",
" <td>-0.000417</td>\n",
" </tr>\n",
" <tr>\n",
" <th>786430</th>\n",
" <td>-0.002503</td>\n",
" <td>-0.000417</td>\n",
" <td>-0.000417</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>786427 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" returns returns returns\n",
"4 -0.166667 0.500000 -0.933222\n",
"5 0.200000 -0.166667 0.500000\n",
"6 -0.100000 0.200000 -0.166667\n",
"7 0.018519 -0.100000 0.200000\n",
"8 -0.054545 0.018519 -0.100000\n",
"... ... ... ...\n",
"786426 0.002088 -0.001584 0.000334\n",
"786427 -0.000417 0.002088 -0.001584\n",
"786428 -0.000417 -0.000417 0.002088\n",
"786429 -0.000417 -0.000417 -0.000417\n",
"786430 -0.002503 -0.000417 -0.000417\n",
"\n",
"[786427 rows x 3 columns]"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ols_predictors"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "dce3d141",
"metadata": {},
"outputs": [],
"source": [
"# Create the OLS response vector for the training data\n",
"ols_response = train['returns'].shift(-4)\n",
"ols_response = ols_response.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "e05d801f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 0.200000\n",
"1 -0.100000\n",
"2 0.018519\n",
"3 -0.054545\n",
"4 -0.038462\n",
" ... \n",
"786422 -0.000417\n",
"786423 -0.000417\n",
"786424 -0.000417\n",
"786425 -0.002503\n",
"786426 0.000000\n",
"Name: returns, Length: 786427, dtype: float64"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ols_response"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "84aaa7fb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LinearRegression()"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit the OLS regression model\n",
"ols_model = LinearRegression()\n",
"ols_model.fit(ols_predictors, ols_response)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "122616f9",
"metadata": {},
"outputs": [],
"source": [
"# Create the LASSO predictors matrix for the prediction data\n",
"lasso_predictors = []\n",
"for i in range(lags):\n",
" lasso_predictors.append(test['returns'].shift(i+1))\n",
"lasso_predictors = pd.concat(lasso_predictors, axis=1)\n",
"lasso_predictors = lasso_predictors.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "ea630011",
"metadata": {},
"outputs": [],
"source": [
"# Apply the OLS coefficients to the LASSO predictors matrix\n",
"lasso_predictors = lasso_predictors.mul(ols_model.coef_, axis=1).sum(axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "0dd5dee7",
"metadata": {},
"outputs": [],
"source": [
"#Normalize the LASSO predictors to have mean zero and unit variance\n",
"lasso_predictors = (lasso_predictors - lasso_predictors.mean()) / lasso_predictors.std()"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "38d1cd79",
"metadata": {},
"outputs": [],
"source": [
"# Create the LASSO response vector for the prediction data\n",
"lasso_response = test['returns'].shift(-3)\n",
"lasso_response = lasso_response.dropna()\n"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "e89937d7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LassoCV(cv=5)"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit the LASSO regression model\n",
"lasso_model = LassoCV(cv=5)\n",
"lasso_model.fit(lasso_predictors.values.reshape(-1, 1), lasso_response)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "ee2f77e1",
"metadata": {},
"outputs": [],
"source": [
"# Make the 1-minute-ahead return forecast for each stock\n",
"forecasts = {}\n",
"for i in range(len(test)):\n",
" # Extract the relevant data for the current prediction\n",
" train_data = test.iloc[i-lags:i]\n",
" pred_data_point = test.iloc[i][['returns']]"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "285de7ac",
"metadata": {},
"outputs": [],
"source": [
"# Make the 1-minute-ahead return forecast for each stock\n",
"forecasts = {}\n",
"for i in range(len(test)):\n",
" lasso_predictor = test['returns'].iloc[-lags:].mul(ols_model.coef_).sum()\n",
" lasso_predictor = (lasso_predictor - lasso_predictors.mean()) / lasso_predictors.std()\n",
" forecast = ols_model.intercept_ + lasso_model.predict([[lasso_predictor]])\n",
" forecasts = forecast[0]"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "604615ed",
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "Found input variables with inconsistent numbers of samples: [262142, 1]",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_2443267/759002749.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0my_true\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'returns'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshift\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdropna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0my_pred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mforecasts\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mr2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mr2_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_true\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Out-of-sample R-squared: {r2:.4f}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/ohpc/pub/apps/anaconda/lib/python3.9/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36minner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mall_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;31m# extra_args > 0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/ohpc/pub/apps/anaconda/lib/python3.9/site-packages/sklearn/metrics/_regression.py\u001b[0m in \u001b[0;36mr2_score\u001b[0;34m(y_true, y_pred, sample_weight, multioutput)\u001b[0m\n\u001b[1;32m 674\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m3.0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 675\u001b[0m \"\"\"\n\u001b[0;32m--> 676\u001b[0;31m y_type, y_true, y_pred, multioutput = _check_reg_targets(\n\u001b[0m\u001b[1;32m 677\u001b[0m y_true, y_pred, multioutput)\n\u001b[1;32m 678\u001b[0m \u001b[0mcheck_consistent_length\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_true\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample_weight\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/ohpc/pub/apps/anaconda/lib/python3.9/site-packages/sklearn/metrics/_regression.py\u001b[0m in \u001b[0;36m_check_reg_targets\u001b[0;34m(y_true, y_pred, multioutput, dtype)\u001b[0m\n\u001b[1;32m 86\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mdtype\u001b[0m \u001b[0margument\u001b[0m \u001b[0mpassed\u001b[0m \u001b[0mto\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 87\u001b[0m \"\"\"\n\u001b[0;32m---> 88\u001b[0;31m \u001b[0mcheck_consistent_length\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_true\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 89\u001b[0m \u001b[0my_true\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_true\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mensure_2d\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[0my_pred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_pred\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mensure_2d\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/ohpc/pub/apps/anaconda/lib/python3.9/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_consistent_length\u001b[0;34m(*arrays)\u001b[0m\n\u001b[1;32m 317\u001b[0m \u001b[0muniques\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munique\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlengths\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 318\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0muniques\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 319\u001b[0;31m raise ValueError(\"Found input variables with inconsistent numbers of\"\n\u001b[0m\u001b[1;32m 320\u001b[0m \" samples: %r\" % [int(l) for l in lengths])\n\u001b[1;32m 321\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: Found input variables with inconsistent numbers of samples: [262142, 1]"
]
}
],
"source": [
"# Evaluate the out-of-sample performance\n",
"y_true = test['returns'].shift(-2).dropna()\n",
"y_pred = pd.Series(forecasts)\n",
"r2 = r2_score(y_true, y_pred)\n",
"print(f\"Out-of-sample R-squared: {r2:.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "f5ce5f69",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 6.410831\n",
"dtype: float64"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_pred"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2677ec0d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment