Working toy model of a simple signal + background model with pseudodata generated from the background model
import matplotlib.pyplot as plt
import numpy as np
import pyhf


def generate_background(n_events, tau=50.0):
    """
    Sample events from an exponential distribution

    Args:
        n_events: The number of events to sample
        tau: The rate parameter that defines the exponential

    Returns:
        Numpy array
    """
    return np.random.exponential(tau, n_events)


def generate_signal(n_events, mean=96.0, std=2.0):
    """
    Sample events from a normal (Gaussian) distribution

    Args:
        n_events: The number of events to sample
        mean: The mean of the normal distribution
        std: The standard deviation of the normal distribution

    Returns:
        Numpy array
    """
    return np.random.normal(mean, std, n_events)


def generate_pseudodata(n_sig, n_bkg):
    signal_data = generate_signal(n_sig)
    background_data = generate_background(n_bkg)
    return np.concatenate((background_data, signal_data))


def get_mc_counts(pdf, pars):
    # Use pyhf internals to get the expected per-bin counts for each sample:
    # apply the additive modifiers to the nominal rates first, then the
    # multiplicative ones
    deltas, factors = pdf._modifications(pars)
    allsum = pyhf.tensorlib.concatenate(deltas + [pyhf.tensorlib.astensor(pdf.thenom)])
    nom_plus_delta = pyhf.tensorlib.sum(allsum, axis=0)
    nom_plus_delta = pyhf.tensorlib.reshape(
        nom_plus_delta, (1,) + pyhf.tensorlib.shape(nom_plus_delta)
    )
    allfac = pyhf.tensorlib.concatenate(factors + [nom_plus_delta])
    return pyhf.tensorlib.product(allfac, axis=0)


# N.B.: order is the way that it is as there are only 2 samples
# Here be hacky hacky crap, turn and weep at your sins
def plot(
    obs_data, pdf, par_name_dict, ax=None, order=[1, 0], labels=None, **par_settings
):
    """
    Stacked histograms are hard, so cheat and use barplots instead
    """
    if ax is None:
        ax = plt.gca()
    if labels is None:
        labels = [None] * len(order)
    pars = pyhf.tensorlib.astensor(pdf.config.suggested_init())
    for k, v in par_settings.items():
        pars[par_name_dict[k]] = v
    mc_counts = get_mc_counts(pdf, pars)
    bottom = None
    # nb: bar_data[0] because evaluating only one parset
    bin_width = 2
    for i, sample_index in enumerate(order):
        data = mc_counts[sample_index][0]
        x = np.arange(0, bin_width * len(data), bin_width)
        ax.bar(
            x,
            data,
            bin_width,
            bottom=bottom,
            align='edge',
            alpha=1.0,
            label=labels[sample_index],
        )
        bottom = data if i == 0 else bottom + data
    # print(f'x: {x}\nlength: {len(x)}\n')
    # print(f'obs_data: {obs_data}\nlength: {len(obs_data)}\n')
    ax.scatter(
        x + bin_width / 2, obs_data, c='k', alpha=1.0, label=labels[-1], zorder=99
    )


def fit_example(n_bkg=10000, n_sig=25, frac_signal_embedded=0):
    np.random.seed(0)
    # Think about the counting experiment you want to do
    # (a sketch of running all four example scenarios follows the script below)
    # Example 1:
    # n_bkg = 10000
    # n_sig = 25  # Hard to tease out of the background
    # frac_signal_embedded = 0  # Data is background only
    # The signal can't be resolved given the uncertainties, so CLs should be
    # consistent with background only
    # The best fit should be uninteresting as well
    # Example 2:
    # n_bkg = 10000
    # n_sig = 100  # Obvious bump
    # frac_signal_embedded = 0  # Data is background only
    # The S+B model poorly models the data
    # As the data was generated from n_expected_events the background model
    # does poorly also
    # The results tell us that the modeling is just bad
    # Example 3:
    # n_bkg = 10000
    # n_sig = 100  # Obvious bump
    # frac_signal_embedded = 1  # Data includes signal
    # The S+B model describes the data well (break out the champagne!)
    # The fit nicely pulls out the signal component
    # Example 4:
    # n_bkg = 10000
    # n_sig = 100  # Obvious bump
    # frac_signal_embedded = 1.2  # Signal enhancement (some new physics)
    # The fit is inconsistent with mu=1
    n_expected_events = int((1.0 * n_bkg) + (1.0 * n_sig))
    # Set binning
    bin_width = 2  # GeV
    mass_range = [0, 150]  # GeV
    # numpy/matplotlib expect a right edge too
    bin_edges = np.linspace(
        mass_range[0],
        mass_range[1] + bin_width,
        int((mass_range[1] - mass_range[0]) / bin_width) + 1,
        endpoint=False,
    )
    # Build models
    bkg_data = generate_background(n_bkg)
    sig_data = generate_signal(n_sig)
    bkg_model = np.histogram(bkg_data, bins=bin_edges)[0].tolist()
    bkg_uncerts = np.sqrt(bkg_model).tolist()
    sig_model = np.histogram(sig_data, bins=bin_edges)[0].tolist()
    pdf = pyhf.simplemodels.hepdata_like(
        signal_data=sig_model, bkg_data=bkg_model, bkg_uncerts=bkg_uncerts
    )
    # ...and simulate Nature with those models
    n_signal_embedded = int(frac_signal_embedded * n_sig)
    n_pseudodata = int(
        np.random.normal(n_expected_events, np.sqrt(n_expected_events), 1)[0]
    )
    pseudodata = generate_pseudodata(
        n_signal_embedded, n_pseudodata - n_signal_embedded
    )
    pseudodata_hist = np.histogram(pseudodata, bins=bin_edges)[0].tolist()
    # Visualize
    # Plot models
    plt.figure(figsize=(10, 7))
    plt.hist(bkg_data, bins=bin_edges, label='Background model')
    plt.hist(sig_data, bins=bin_edges, label='Signal model')
    plt.xlim(mass_range[0], mass_range[1])
    plt.xlabel('Invariant Mass [GeV]', fontsize=16)
    plt.ylabel(f'Number of Candidates [Count / {bin_width} GeV]', fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.legend()
    plt.savefig('bkg_plus_signal.pdf')
    # Plot data with model
    plt.hist(pseudodata, bins=bin_edges, alpha=0.6, label='Data')
    plt.legend()
    plt.savefig('data_overlaid.pdf')
    # Ask what is the CLs value to have observed the data given the
    # presence of the signal model with signal strength of mu=1 (CLs_obs)
    # ...and the CLs value expected under the background-only hypothesis
    # (mu=0) (CLs_exp)
    CLs_obs, CLs_exp = pyhf.utils.hypotest(
        poi_test=1.0,
        data=pseudodata_hist + pdf.config.auxdata,
        pdf=pdf,
        return_expected=True,
    )
    print(f'Observed: {CLs_obs}, Expected: {CLs_exp}')
    # Given the observed data, what model parameters maximize the likelihood?
    best_fit = pyhf.optimizer.unconstrained_bestfit(
        pyhf.utils.loglambdav,
        pseudodata_hist + pdf.config.auxdata,
        pdf,
        pdf.config.suggested_init(),
        pdf.config.suggested_bounds(),
    )
    par_name_dict = {k: v['slice'].start for k, v in pdf.config.par_map.items()}
    best_fit_values = {k: best_fit[v] for k, v in par_name_dict.items()}
    fig, ax = plt.subplots(1, 1, figsize=(10, 7))
    plot(
        pseudodata_hist,
        pdf,
        par_name_dict,
        ax=ax,
        labels=[
            'Signal model (post fit)',
            'Background model (post fit)',
            'Observed data',
        ],
        **best_fit_values,
    )
    ax.plot(
        [], [], ' ', label=r'Signal strength $\mu$:' + f" {best_fit_values['mu']:.3f}"
    )
    plt.xlim(mass_range[0], mass_range[1])
    plt.xlabel('Invariant Mass [GeV]', fontsize=16)
    plt.ylabel(f'Number of Candidates [Count / {bin_width} GeV]', fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    # manipulate the ordering as I forget how to do this properly
    handles, labels = plt.gca().get_legend_handles_labels()
    bump_head_to_tail = lambda lst: [*lst[1:], lst[0]]
    plt.legend(bump_head_to_tail(handles), bump_head_to_tail(labels))
    # plt.show()
    fig.savefig('best_fit.pdf')


if __name__ == '__main__':
    fit_example()
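
For reference, the four example scenarios described in the comments of fit_example map onto calls like the following. This is a minimal sketch using only names from the script above; note that each call reuses the same random seed and overwrites the PDFs saved by the previous call, so in practice you would run one scenario at a time or rename the outputs.

# Example 1: small signal, background-only data
fit_example(n_bkg=10000, n_sig=25, frac_signal_embedded=0)
# Example 2: obvious bump in the model, background-only data
fit_example(n_bkg=10000, n_sig=100, frac_signal_embedded=0)
# Example 3: obvious bump, signal embedded in the data
fit_example(n_bkg=10000, n_sig=100, frac_signal_embedded=1)
# Example 4: signal enhancement beyond mu=1 (some new physics)
fit_example(n_bkg=10000, n_sig=100, frac_signal_embedded=1.2)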
matthewfeickert commented Mar 3, 2019

@stephensekula, if you find that NumPy is running into issues and hitting iteration limits, then I would suggest switching over to one of the tensor libraries that actually has autograd in it. For example, to do this with TensorFlow you'd just have to switch out the backend (after first installing the required additional libraries with pip install pyhf[tensorflow])

import tensorflow as tf

sess = tf.Session()
pyhf.set_backend(pyhf.tensor.tensorflow_backend(session=sess))

and then remember to evaluate the tensors in the session so that the values get printed properly

print(f'Observed: {sess.run(CLs_obs)}, Expected: {sess.run(CLs_exp)}')

The code stays the same; only the engine doing the calculations changes. 👍
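
To flip back to the default engine afterwards, the same mechanism works in reverse. A minimal sketch, assuming the 2019-era pyhf backend API used above:

pyhf.set_backend(pyhf.tensor.numpy_backend())  # restore the default NumPy backend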
