Skip to content

Instantly share code, notes, and snippets.

@databento-bot
Last active October 4, 2024 05:52
Show Gist options
  • Save databento-bot/7c29d292bd19a280b222b1319bc75ddc to your computer and use it in GitHub Desktop.
Save databento-bot/7c29d292bd19a280b222b1319bc75ddc to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# Building high-frequency trading signals in Python with Databento and sklearn
#
# This is a simple example that demonstrates how to build high-frequency trading signals in Python,
# using order book and market depth data from [Databento](https://databento.com) together with
# machine learning models from [sklearn](https://scikit-learn.org/).
import databento as db
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.io as pio
from sklearn.linear_model import LinearRegression
# Historical API client; replace the placeholder with your own API key
client = db.Historical('YOUR_API_KEY')

# Get 10 levels of ES lead month
# The request is spelled out as a mapping so each parameter is easy to tweak.
query = {
    'dataset': 'GLBX.MDP3',
    'schema': 'mbp-10',
    'start': "2023-12-06T14:30",
    'end': "2023-12-06T20:30",
    'symbols': ['ES.n.0'],
    'stype_in': 'continuous',
}
data = client.timeseries.get_range(**query)

# Convert the response into a pandas DataFrame for the steps below
df = data.to_df()
# Constructing our target vector
# Keep only the trade events (action == 'T'). An explicit copy is taken so the
# column assignments below write to an independent frame instead of a slice
# of the original, which would raise pandas' SettingWithCopyWarning.
df = df[df.action == 'T'].copy()
# Midprice from the top-level bid and ask prices
df['mid'] = (df['bid_px_00'] + df['ask_px_00']) / 2
# Midprice change (a price difference, not a ratio) with a forward markout of
# 500 trades: the midprice 500 rows ahead minus the current midprice
df['ret_500t'] = df['mid'].shift(-500) - df['mid']
# Drop every row containing a NaN; this includes the final 500 rows, which
# have no forward midprice to mark out against
df = df.dropna()
# Constructing our features
# Depth imbalance on top level ('skew'): log bid size minus log ask size
df['skew'] = np.log(df['bid_sz_00']) - np.log(df['ask_sz_00'])

# Order imbalance on top ten levels ('imbalance'): log of the summed bid
# order counts minus log of the summed ask order counts, per row
bid_order_count = df.filter(regex='bid_ct_0[0-9]').sum(axis=1)
ask_order_count = df.filter(regex='ask_ct_0[0-9]').sum(axis=1)
df['imbalance'] = np.log(bid_order_count) - np.log(ask_order_count)
# Splitting in-sample and out-of-sample
# The boundary sits at 66% of the rows, rounded down to a multiple of 100;
# rows before it are in-sample, the remainder out-of-sample.
split = (int(0.66 * len(df)) // 100) * 100
df_in, df_out = df.iloc[:split], df.iloc[split:]
# Constructing our signal
# In-sample correlation matrix of the two features and the target; only the
# upper triangle (diagonal included) is printed, the rest is masked as NaN.
corr = df_in[['skew', 'imbalance', 'ret_500t']].corr()
upper_triangle = np.triu(np.ones(corr.shape)).astype(bool)
print(corr.where(upper_triangle))


def _fit_predict(model, features):
    """Fit `model` on the in-sample rows and return out-of-sample predictions.

    `features` is the list of feature column names; the target is always
    the 'ret_500t' column. The model is refit in place on every call.
    """
    model.fit(df_in[features], df_in['ret_500t'])
    return model.predict(df_out[features])


# One estimator is reused for all three fits; each prediction is taken
# before the next refit, so the results do not interfere with each other.
reg = LinearRegression(fit_intercept=False, positive=True)
pred_skew = _fit_predict(reg, ['skew'])
pred_imbalance = _fit_predict(reg, ['imbalance'])
pred_combined = _fit_predict(reg, ['skew', 'imbalance'])
# Uncomment these lines to use gradient-boosted trees
# from sklearn.ensemble import HistGradientBoostingRegressor
# reg = HistGradientBoostingRegressor()
# reg.fit(df_in[['skew', 'imbalance']], df_in['ret_500t'])
# pred_combined = reg.predict(df_out[['skew', 'imbalance']])
# Results
# pio.renderers.default = 'notebook'
# pio.renderers.default = 'iframe'
# x-axis values for the plot: one evenly spaced point in [0, 100) per
# out-of-sample row. linspace pins the number of points to len(df_out)
# exactly, whereas arange with a fractional step can return one element too
# many because of floating-point rounding, which would make the DataFrame
# built from `pct` fail on mismatched column lengths. It also yields an
# empty array, rather than dividing by zero, when df_out is empty.
pct = np.linspace(0, 100, num=len(df_out), endpoint=False)
def get_cumulative_markout_pnl(pred, ret_500t=None):
    """Return the cumulative markout PnL ordered by predictor value.

    Each observation's return is sign-flipped when its prediction is
    negative, and left unchanged otherwise. The observations are then
    sorted by ascending prediction and the signed returns are accumulated
    in that order.

    Args:
        pred: One-dimensional array-like of predictions, one per observation.
        ret_500t: Array-like of realized returns aligned with `pred`. When
            omitted, the 'ret_500t' column of the module-level `df_out`
            frame is used, which is the original behaviour.

    Returns:
        A NumPy array, the same length as `pred`, holding the running sum of
        signed returns in ascending-prediction order.
    """
    if ret_500t is None:
        # Default to the out-of-sample returns prepared by the script
        ret_500t = df_out['ret_500t'].values
    df_pnl = pd.DataFrame({'pred': pred, 'ret_500t': np.asarray(ret_500t)})
    # Flip the sign of the return wherever the prediction is negative
    df_pnl.loc[df_pnl['pred'] < 0, 'ret_500t'] *= -1
    # Order by prediction so the cumulative sum traces PnL across the
    # predictor's sorted values
    df_pnl = df_pnl.sort_values(by='pred')
    return df_pnl['ret_500t'].cumsum().values
# Cumulative markout PnL curve for each signal, keyed by its plot label
signal_predictions = {
    'skew': pred_skew,
    'imbalance': pred_imbalance,
    'combined': pred_combined,
}
results = pd.DataFrame({
    'pct': pct,
    **{
        signal: get_cumulative_markout_pnl(prediction)
        for signal, prediction in signal_predictions.items()
    },
})

# One line per signal, plotted against the 'pct' column
fig = px.line(
    results,
    x='pct',
    y=list(signal_predictions),
    title='Forecasting with book skew vs. imbalance',
    labels={'pct': 'Predictor value (percentile)'},
)
fig.update_yaxes(title_text='Cumulative return')

# Horizontal legend anchored by its bottom-right corner just above the plot area
legend_layout = {
    'orientation': "h",
    'yanchor': "bottom",
    'y': 1.02,
    'xanchor': "right",
    'x': 1,
}
fig.update_layout(legend=legend_layout)
fig.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment