databento-bot/databento-sklearn-ml-example.py

## databento-sklearn-ml-example.py
#!/usr/bin/env python

# Building high-frequency trading signals in Python with Databento and sklearn
#
# This is a simple example that demonstrates how to build high-frequency trading signals in Python,
# using order book and market depth data from [Databento](https://databento.com) together with
# machine learning models from [sklearn](https://scikit-learn.org/).

import databento as db
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.io as pio
from sklearn.linear_model import LinearRegression


# Historical API client; 'YOUR_API_KEY' is a placeholder to be replaced
client = db.Historical("YOUR_API_KEY")

# Get 10 levels of ES lead month: request the 'mbp-10' schema for the
# continuous symbol 'ES.n.0' over the 14:30-20:30 window on 2023-12-06
data = client.timeseries.get_range(
    dataset="GLBX.MDP3",
    symbols=["ES.n.0"],
    stype_in="continuous",
    schema="mbp-10",
    start="2023-12-06T14:30",
    end="2023-12-06T20:30",
)
# Convert the downloaded records into a DataFrame for the steps below
df = data.to_df()


# Constructing our target vector
# Keep only the rows whose action code is 'T' (trades)
df = df.loc[df['action'] == 'T']

# Midprice from the level-00 bid and ask prices recorded on each row
df['mid'] = df['bid_px_00'].add(df['ask_px_00']).div(2)
# Forward markout: midprice 500 rows (trades) ahead minus the current midprice.
# Note this is a price difference, not a percentage return.
df['ret_500t'] = df['mid'].shift(-500).sub(df['mid'])

# Remove every row that contains a missing value; this includes the final
# 500 rows, for which no forward midprice exists
df = df.dropna(axis=0, how='any')

# Constructing our features
# Depth imbalance on top level ('skew'): log of bid size minus log of ask size
df['skew'] = np.log(df['bid_sz_00']) - np.log(df['ask_sz_00'])

# Order imbalance on top ten levels ('imbalance'): log of the summed bid order
# counts minus log of the summed ask order counts. DataFrame.filter already
# returns the matching columns, so no second column lookup is needed.
df['imbalance'] = (
    np.log(df.filter(regex='bid_ct_0[0-9]').sum(axis=1))
    - np.log(df.filter(regex='ask_ct_0[0-9]').sum(axis=1))
)

# A zero size or zero order count makes log() return -inf (and inf - inf is
# NaN). The regression fitted later rejects non-finite inputs, so drop those
# rows here. When every feature value is finite this is a no-op.
df = df[np.isfinite(df[['skew', 'imbalance']]).all(axis=1)]

# Splitting in-sample and out-of-sample
# Cut point at 66% of the rows, rounded down to a multiple of 100
split = int(len(df) * 0.66)
split = (split // 100) * 100
# Rows before the cut are in-sample, the remainder is out-of-sample
df_in, df_out = df.iloc[:split], df.iloc[split:]

# Constructing our signal
# Pairwise correlations of both features and the target on the in-sample rows;
# only the upper triangle (diagonal included) is printed, the rest shows as NaN
corr = df_in[['skew', 'imbalance', 'ret_500t']].corr()
print(corr.where(np.triu(np.ones(corr.shape, dtype=bool))))

# Least squares without an intercept and with non-negative coefficients
reg = LinearRegression(positive=True, fit_intercept=False)

# fit() returns the estimator, so each in-sample fit is chained straight into
# its out-of-sample prediction. The same estimator object is refitted for
# every feature set, so after this block `reg` holds the combined model.
pred_skew = reg.fit(
    df_in[['skew']], df_in['ret_500t']
).predict(df_out[['skew']])

pred_imbalance = reg.fit(
    df_in[['imbalance']], df_in['ret_500t']
).predict(df_out[['imbalance']])

pred_combined = reg.fit(
    df_in[['skew', 'imbalance']], df_in['ret_500t']
).predict(df_out[['skew', 'imbalance']])

# Uncomment these lines to use gradient-boosted trees
# from sklearn.ensemble import HistGradientBoostingRegressor
# reg = HistGradientBoostingRegressor()
# reg.fit(df_in[['skew', 'imbalance']], df_in['ret_500t'])
# pred_combined = reg.predict(df_out[['skew', 'imbalance']])

# Results
# pio.renderers.default = 'notebook'
# pio.renderers.default = 'iframe'

# Percentile axis: one evenly spaced point per out-of-sample row, in [0, 100).
# linspace fixes the number of points at exactly len(df_out). arange with a
# fractional step can return one element too many because of floating-point
# rounding, and the results DataFrame needs all columns to be the same length.
pct = np.linspace(0, 100, num=len(df_out), endpoint=False)

def get_cumulative_markout_pnl(pred, returns=None):
    """Return the cumulative markout PnL of a signal, ordered by prediction.

    Each realized markout is negated where its prediction is below zero and
    left unchanged otherwise. The rows are then sorted by ascending
    prediction and the running sum of the signed markouts is returned.

    Args:
        pred: One predicted value per observation.
        returns: Realized markouts, aligned position-by-position with
            ``pred``. Defaults to the module-level ``df_out['ret_500t']``
            so existing one-argument calls behave exactly as before.

    Returns:
        NumPy array holding the cumulative sum, one entry per observation.
    """
    if returns is None:
        # Backward-compatible default: the script's out-of-sample markouts
        returns = df_out['ret_500t'].values
    # np.asarray drops any index so the pairing with `pred` is positional
    df_pnl = pd.DataFrame({'pred': pred, 'ret_500t': np.asarray(returns)})
    # Flip the markout wherever the prediction is negative
    df_pnl.loc[df_pnl['pred'] < 0, 'ret_500t'] *= -1
    df_pnl = df_pnl.sort_values(by='pred')
    return df_pnl['ret_500t'].cumsum().values

# One row per out-of-sample observation: the percentile axis followed by the
# cumulative markout curve of each signal, in the order skew, imbalance,
# combined
results = pd.DataFrame({
    'pct': pct,
    **{
        label: get_cumulative_markout_pnl(prediction)
        for label, prediction in (
            ('skew', pred_skew),
            ('imbalance', pred_imbalance),
            ('combined', pred_combined),
        )
    },
})

# Plot the three cumulative-markout curves against the percentile axis
fig = px.line(
    results,
    x='pct',
    y=['skew', 'imbalance', 'combined'],
    labels={'pct': 'Predictor value (percentile)'},
    title='Forecasting with book skew vs. imbalance',
)

fig.update_yaxes(title_text='Cumulative return')

# Horizontal legend, anchored by its bottom-right corner at x=1, y=1.02
fig.update_layout(legend={
    'orientation': 'h',
    'yanchor': 'bottom',
    'y': 1.02,
    'xanchor': 'right',
    'x': 1,
})

fig.show()
	#!/usr/bin/env python

	# Building high-frequency trading signals in Python with Databento and sklearn
	#
	# This is a simple example that demonstrates how to build high-frequency trading signals in Python,
	# using order book and market depth data from [Databento](https://databento.com) together with
	# machine learning models from [sklearn](https://scikit-learn.org/).

	import databento as db
	import numpy as np
	import pandas as pd
	import plotly
	import plotly.express as px
	import plotly.io as pio
	from sklearn.linear_model import LinearRegression


	# Historical API client; 'YOUR_API_KEY' is a placeholder to be replaced
	client = db.Historical("YOUR_API_KEY")

	# Get 10 levels of ES lead month: request the 'mbp-10' schema for the
	# continuous symbol 'ES.n.0' over the 14:30-20:30 window on 2023-12-06
	data = client.timeseries.get_range(
	    dataset="GLBX.MDP3",
	    symbols=["ES.n.0"],
	    stype_in="continuous",
	    schema="mbp-10",
	    start="2023-12-06T14:30",
	    end="2023-12-06T20:30",
	)
	# Convert the downloaded records into a DataFrame for the steps below
	df = data.to_df()


	# Constructing our target vector
	# Keep only the rows whose action code is 'T' (trades)
	df = df.loc[df['action'] == 'T']

	# Midprice from the level-00 bid and ask prices recorded on each row
	df['mid'] = df['bid_px_00'].add(df['ask_px_00']).div(2)
	# Forward markout: midprice 500 rows (trades) ahead minus the current midprice.
	# Note this is a price difference, not a percentage return.
	df['ret_500t'] = df['mid'].shift(-500).sub(df['mid'])

	# Remove every row that contains a missing value; this includes the final
	# 500 rows, for which no forward midprice exists
	df = df.dropna(axis=0, how='any')

	# Constructing our features
	# Depth imbalance on top level ('skew'): log of bid size minus log of ask size
	df['skew'] = np.log(df['bid_sz_00']) - np.log(df['ask_sz_00'])

	# Order imbalance on top ten levels ('imbalance'): log of the summed bid order
	# counts minus log of the summed ask order counts. DataFrame.filter already
	# returns the matching columns, so no second column lookup is needed.
	df['imbalance'] = (
	    np.log(df.filter(regex='bid_ct_0[0-9]').sum(axis=1))
	    - np.log(df.filter(regex='ask_ct_0[0-9]').sum(axis=1))
	)

	# A zero size or zero order count makes log() return -inf (and inf - inf is
	# NaN). The regression fitted later rejects non-finite inputs, so drop those
	# rows here. When every feature value is finite this is a no-op.
	df = df[np.isfinite(df[['skew', 'imbalance']]).all(axis=1)]

	# Splitting in-sample and out-of-sample
	# Cut point at 66% of the rows, rounded down to a multiple of 100
	split = int(len(df) * 0.66)
	split = (split // 100) * 100
	# Rows before the cut are in-sample, the remainder is out-of-sample
	df_in, df_out = df.iloc[:split], df.iloc[split:]

	# Constructing our signal
	# Pairwise correlations of both features and the target on the in-sample rows;
	# only the upper triangle (diagonal included) is printed, the rest shows as NaN
	corr = df_in[['skew', 'imbalance', 'ret_500t']].corr()
	print(corr.where(np.triu(np.ones(corr.shape, dtype=bool))))

	# Least squares without an intercept and with non-negative coefficients
	reg = LinearRegression(positive=True, fit_intercept=False)

	# fit() returns the estimator, so each in-sample fit is chained straight into
	# its out-of-sample prediction. The same estimator object is refitted for
	# every feature set, so after this block `reg` holds the combined model.
	pred_skew = reg.fit(
	    df_in[['skew']], df_in['ret_500t']
	).predict(df_out[['skew']])

	pred_imbalance = reg.fit(
	    df_in[['imbalance']], df_in['ret_500t']
	).predict(df_out[['imbalance']])

	pred_combined = reg.fit(
	    df_in[['skew', 'imbalance']], df_in['ret_500t']
	).predict(df_out[['skew', 'imbalance']])

	# Uncomment these lines to use gradient-boosted trees
	# from sklearn.ensemble import HistGradientBoostingRegressor
	# reg = HistGradientBoostingRegressor()
	# reg.fit(df_in[['skew', 'imbalance']], df_in['ret_500t'])
	# pred_combined = reg.predict(df_out[['skew', 'imbalance']])

	# Results
	# pio.renderers.default = 'notebook'
	# pio.renderers.default = 'iframe'

	# Percentile axis: one evenly spaced point per out-of-sample row, in [0, 100).
	# linspace fixes the number of points at exactly len(df_out). arange with a
	# fractional step can return one element too many because of floating-point
	# rounding, and the results DataFrame needs all columns to be the same length.
	pct = np.linspace(0, 100, num=len(df_out), endpoint=False)

	def get_cumulative_markout_pnl(pred, returns=None):
	    """Return the cumulative markout PnL of a signal, ordered by prediction.

	    Each realized markout is negated where its prediction is below zero and
	    left unchanged otherwise. The rows are then sorted by ascending
	    prediction and the running sum of the signed markouts is returned.

	    Args:
	        pred: One predicted value per observation.
	        returns: Realized markouts, aligned position-by-position with
	            ``pred``. Defaults to the module-level ``df_out['ret_500t']``
	            so existing one-argument calls behave exactly as before.

	    Returns:
	        NumPy array holding the cumulative sum, one entry per observation.
	    """
	    if returns is None:
	        # Backward-compatible default: the script's out-of-sample markouts
	        returns = df_out['ret_500t'].values
	    # np.asarray drops any index so the pairing with `pred` is positional
	    df_pnl = pd.DataFrame({'pred': pred, 'ret_500t': np.asarray(returns)})
	    # Flip the markout wherever the prediction is negative
	    df_pnl.loc[df_pnl['pred'] < 0, 'ret_500t'] *= -1
	    df_pnl = df_pnl.sort_values(by='pred')
	    return df_pnl['ret_500t'].cumsum().values

	# One row per out-of-sample observation: the percentile axis followed by the
	# cumulative markout curve of each signal, in the order skew, imbalance,
	# combined
	results = pd.DataFrame({
	    'pct': pct,
	    **{
	        label: get_cumulative_markout_pnl(prediction)
	        for label, prediction in (
	            ('skew', pred_skew),
	            ('imbalance', pred_imbalance),
	            ('combined', pred_combined),
	        )
	    },
	})

	# Plot the three cumulative-markout curves against the percentile axis
	fig = px.line(
	    results,
	    x='pct',
	    y=['skew', 'imbalance', 'combined'],
	    labels={'pct': 'Predictor value (percentile)'},
	    title='Forecasting with book skew vs. imbalance',
	)

	fig.update_yaxes(title_text='Cumulative return')

	# Horizontal legend, anchored by its bottom-right corner at x=1, y=1.02
	fig.update_layout(legend={
	    'orientation': 'h',
	    'yanchor': 'bottom',
	    'y': 1.02,
	    'xanchor': 'right',
	    'x': 1,
	})

	fig.show()