Last active
October 4, 2024 05:52
-
-
Save databento-bot/7c29d292bd19a280b222b1319bc75ddc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Building high-frequency trading signals in Python with Databento and sklearn
#
# This is a simple example that demonstrates how to build high-frequency trading signals in Python,
# using order book and market depth data from [Databento](https://databento.com) together with
# machine learning models from [sklearn](https://scikit-learn.org/).
import databento as db | |
import numpy as np | |
import pandas as pd | |
import plotly | |
import plotly.express as px | |
import plotly.io as pio | |
from sklearn.linear_model import LinearRegression | |
# Historical API client; replace the placeholder with a real API key.
client = db.Historical('YOUR_API_KEY')

# Get 10 levels of ES lead month: six hours of 'mbp-10' records on 2023-12-06,
# addressed through the continuous-contract symbol.
range_request = dict(
    dataset='GLBX.MDP3',
    schema='mbp-10',
    start="2023-12-06T14:30",
    end="2023-12-06T20:30",
    symbols=['ES.n.0'],
    stype_in='continuous',
)
data = client.timeseries.get_range(**range_request)

# Convert the response to a DataFrame for the analysis below.
df = data.to_df()
# Constructing our target vector

# Keep trade events only. The explicit copy makes the filtered result an
# independent frame, so the column assignments below do not write into a
# slice of the original data (which triggers pandas' SettingWithCopyWarning).
df = df[df.action == 'T'].copy()

# Midprice from the top-of-book bid and ask prices.
df['mid'] = (df['bid_px_00'] + df['ask_px_00'])/2

# Get midprice returns with a forward markout of 500 trades:
# the midprice 500 rows ahead minus the current midprice.
df['ret_500t'] = df['mid'].shift(-500) - df['mid']

# The final 500 rows have no forward midprice, so their markout is NaN;
# drop them, along with any other row containing a missing value.
df = df.dropna()
# Constructing our features

# Per-side order counts summed across every column matching the level pattern.
bid_orders = df.filter(regex='bid_ct_0[0-9]').sum(axis=1)
ask_orders = df.filter(regex='ask_ct_0[0-9]').sum(axis=1)

# assign() builds a new frame instead of writing columns into `df` in place,
# so this is safe even when `df` is itself a filtered slice.
df = df.assign(
    # Depth imbalance on top level ('skew'): log ratio of bid to ask size.
    skew=np.log(df.bid_sz_00) - np.log(df.ask_sz_00),
    # Order imbalance on top ten levels ('imbalance'): log ratio of order counts.
    imbalance=np.log(bid_orders) - np.log(ask_orders),
)

# A zero size or count makes log() return -inf (or NaN when both sides are
# zero). Regression fitting and prediction reject non-finite inputs, so drop
# those rows here rather than failing later.
df = df[np.isfinite(df[['skew', 'imbalance']]).all(axis=1)]
# Splitting in-sample and out-of-sample

# Cut point at 66% of the rows, rounded down to a multiple of 100.
split = int(0.66 * len(df)) // 100 * 100

# Rows before the cut are in-sample; the remainder is out-of-sample.
df_in, df_out = df.iloc[:split], df.iloc[split:]
# Constructing our signal

# Print the upper triangle of the in-sample correlation matrix between the
# two features and the target (the lower triangle is masked out).
corr = df_in[['skew', 'imbalance', 'ret_500t']].corr()
upper_triangle = np.triu(np.ones(corr.shape)).astype(bool)
print(corr.where(upper_triangle))

# A single estimator is refit for each feature set: no intercept, and
# coefficients constrained to be non-negative.
reg = LinearRegression(fit_intercept=False, positive=True)
target_in = df_in['ret_500t']

# fit() returns the fitted estimator, so each fit/predict pair is chained.
# The combined model is fit last, leaving `reg` fitted on both features.
pred_skew = reg.fit(df_in[['skew']], target_in).predict(df_out[['skew']])
pred_imbalance = reg.fit(df_in[['imbalance']], target_in).predict(df_out[['imbalance']])
pred_combined = reg.fit(df_in[['skew', 'imbalance']], target_in).predict(df_out[['skew', 'imbalance']])

# Uncomment these lines to use gradient-boosted trees
# from sklearn.ensemble import HistGradientBoostingRegressor
# reg = HistGradientBoostingRegressor()
# reg.fit(df_in[['skew', 'imbalance']], df_in['ret_500t'])
# pred_combined = reg.predict(df_out[['skew', 'imbalance']])
# Results
# pio.renderers.default = 'notebook'
# pio.renderers.default = 'iframe'

# Plot x-axis: one value per out-of-sample row, evenly spaced over [0, 100).
# linspace guarantees exactly len(df_out) points. arange with a fractional
# step can return one extra element because of floating-point rounding, which
# would make the columns of the results frame unequal in length, and it
# divides by zero when df_out is empty.
pct = np.linspace(0, 100, num=len(df_out), endpoint=False)
def get_cumulative_markout_pnl(pred):
    """Return the cumulative markout PnL of a signal, ordered by prediction.

    Each out-of-sample return (read from the module-level ``df_out``) is
    sign-flipped where the matching prediction is negative. The rows are then
    sorted by ascending prediction and the returns are cumulatively summed.

    Args:
        pred: Predictions aligned positionally with the rows of ``df_out``.

    Returns:
        A NumPy array of cumulative sums, one entry per prediction.
    """
    frame = pd.DataFrame({'pred': pred, 'ret_500t': df_out['ret_500t'].values})
    # Negative prediction: take the opposite side, so negate the return.
    is_negative = frame['pred'] < 0
    frame['ret_500t'] = frame['ret_500t'].where(~is_negative, -frame['ret_500t'])
    ordered = frame.sort_values(by='pred')
    return ordered['ret_500t'].cumsum().values
# Cumulative markout PnL curve for each signal, keyed by its column name.
signal_preds = {
    'skew': pred_skew,
    'imbalance': pred_imbalance,
    'combined': pred_combined,
}
results = pd.DataFrame({
    'pct': pct,
    **{name: get_cumulative_markout_pnl(pred) for name, pred in signal_preds.items()},
})

# One line per signal, plotted against the percentile axis.
fig = px.line(
    results,
    x='pct',
    y=list(signal_preds),
    title='Forecasting with book skew vs. imbalance',
    labels={'pct': 'Predictor value (percentile)'},
)
fig.update_yaxes(title_text='Cumulative return')

# Horizontal legend anchored just above the plot area, right-aligned.
legend_layout = {
    'orientation': "h",
    'yanchor': "bottom",
    'y': 1.02,
    'xanchor': "right",
    'x': 1,
}
fig.update_layout(legend=legend_layout)
fig.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment