Working toy model of a simple signal + background model with pseudodata generated from the background model
import matplotlib.pyplot as plt
import numpy as np
import pyhf


def generate_background(n_events, tau=50.0):
    """
    Sample events from an exponential distribution

    Args:
        n_events: The number of events to sample
        tau: The rate parameter that defines the exponential

    Returns:
        Numpy array
    """
    return np.random.exponential(tau, n_events)


def generate_signal(n_events, mean=96.0, std=2.0):
    """
    Sample events from a normal (Gaussian) distribution

    Args:
        n_events: The number of events to sample
        mean: The mean of the normal distribution
        std: The standard deviation of the normal distribution

    Returns:
        Numpy array
    """
    return np.random.normal(mean, std, n_events)


def generate_pseudodata(n_sig, n_bkg):
    signal_data = generate_signal(n_sig)
    background_data = generate_background(n_bkg)
    return np.concatenate((background_data, signal_data))


def get_mc_counts(pdf, pars):
    # Use pyhf internals to get the expected per-bin counts for each sample:
    # apply the additive modifiers to the nominal rates first, then the
    # multiplicative ones
    deltas, factors = pdf._modifications(pars)
    allsum = pyhf.tensorlib.concatenate(deltas + [pyhf.tensorlib.astensor(pdf.thenom)])
    nom_plus_delta = pyhf.tensorlib.sum(allsum, axis=0)
    nom_plus_delta = pyhf.tensorlib.reshape(
        nom_plus_delta, (1,) + pyhf.tensorlib.shape(nom_plus_delta)
    )
    allfac = pyhf.tensorlib.concatenate(factors + [nom_plus_delta])
    return pyhf.tensorlib.product(allfac, axis=0)


# N.B.: order is the way that it is as there are only 2 samples
# Here be hacky hacky crap, turn and weep at your sins
def plot(
    obs_data, pdf, par_name_dict, ax=None, order=[1, 0], labels=None, **par_settings
):
    """
    Stacked histograms are hard, so cheat and use barplots instead
    """
    if ax is None:
        ax = plt.gca()
    if labels is None:
        labels = [None] * len(order)
    pars = pyhf.tensorlib.astensor(pdf.config.suggested_init())
    for k, v in par_settings.items():
        pars[par_name_dict[k]] = v
    mc_counts = get_mc_counts(pdf, pars)
    bottom = None
    # nb: bar_data[0] because evaluating only one parset
    bin_width = 2
    for i, sample_index in enumerate(order):
        data = mc_counts[sample_index][0]
        x = np.arange(0, bin_width * len(data), bin_width)
        ax.bar(
            x,
            data,
            bin_width,
            bottom=bottom,
            align='edge',
            alpha=1.0,
            label=labels[sample_index],
        )
        bottom = data if i == 0 else bottom + data
    # print(f'x: {x}\nlength: {len(x)}\n')
    # print(f'obs_data: {obs_data}\nlength: {len(obs_data)}\n')
    ax.scatter(
        x + bin_width / 2, obs_data, c='k', alpha=1.0, label=labels[-1], zorder=99
    )


def fit_example(n_bkg=10000, n_sig=25, frac_signal_embedded=0):
    np.random.seed(0)
    # Think about the counting experiment you want to do
    # (a sketch of running all four example scenarios follows the script below)
    # Example 1:
    # n_bkg = 10000
    # n_sig = 25  # Hard to tease out of the background
    # frac_signal_embedded = 0  # Data is background only
    # The signal can't be resolved given the uncertainties, so CLs should be
    # consistent with background only
    # The best fit should be uninteresting as well
    # Example 2:
    # n_bkg = 10000
    # n_sig = 100  # Obvious bump
    # frac_signal_embedded = 0  # Data is background only
    # The S+B model poorly models the data
    # As the data was generated from n_expected_events the background model
    # does poorly also
    # The results tell us that the modeling is just bad
    # Example 3:
    # n_bkg = 10000
    # n_sig = 100  # Obvious bump
    # frac_signal_embedded = 1  # Data includes signal
    # The S+B model describes the data well (break out the champagne!)
    # The fit nicely pulls out the signal component
    # Example 4:
    # n_bkg = 10000
    # n_sig = 100  # Obvious bump
    # frac_signal_embedded = 1.2  # Signal enhancement (some new physics)
    # The fit is inconsistent with mu=1
    n_expected_events = int((1.0 * n_bkg) + (1.0 * n_sig))
    # Set binning
    bin_width = 2  # GeV
    mass_range = [0, 150]  # GeV
    # numpy/matplotlib expect a right edge too
    bin_edges = np.linspace(
        mass_range[0],
        mass_range[1] + bin_width,
        int((mass_range[1] - mass_range[0]) / bin_width) + 1,
        endpoint=False,
    )
    # Build models
    bkg_data = generate_background(n_bkg)
    sig_data = generate_signal(n_sig)
    bkg_model = np.histogram(bkg_data, bins=bin_edges)[0].tolist()
    bkg_uncerts = np.sqrt(bkg_model).tolist()
    sig_model = np.histogram(sig_data, bins=bin_edges)[0].tolist()
    pdf = pyhf.simplemodels.hepdata_like(
        signal_data=sig_model, bkg_data=bkg_model, bkg_uncerts=bkg_uncerts
    )
    # ...and simulate Nature with those models
    n_signal_embedded = int(frac_signal_embedded * n_sig)
    n_pseudodata = int(
        np.random.normal(n_expected_events, np.sqrt(n_expected_events), 1)[0]
    )
    pseudodata = generate_pseudodata(
        n_signal_embedded, n_pseudodata - n_signal_embedded
    )
    pseudodata_hist = np.histogram(pseudodata, bins=bin_edges)[0].tolist()
    # Visualize
    # Plot models
    plt.figure(figsize=(10, 7))
    plt.hist(bkg_data, bins=bin_edges, label='Background model')
    plt.hist(sig_data, bins=bin_edges, label='Signal model')
    plt.xlim(mass_range[0], mass_range[1])
    plt.xlabel('Invariant Mass [GeV]', fontsize=16)
    plt.ylabel(f'Number of Candidates [Count / {bin_width} GeV]', fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.legend()
    plt.savefig('bkg_plus_signal.pdf')
    # Plot data with model
    plt.hist(pseudodata, bins=bin_edges, alpha=0.6, label='Data')
    plt.legend()
    plt.savefig('data_overlaid.pdf')
    # Ask what is the CLs value to have observed the data given the
    # presence of the signal model with signal strength of mu=1 (CLs_obs)
    # ...and the CLs value expected under the background-only hypothesis
    # (mu=0) (CLs_exp)
    CLs_obs, CLs_exp = pyhf.utils.hypotest(
        poi_test=1.0,
        data=pseudodata_hist + pdf.config.auxdata,
        pdf=pdf,
        return_expected=True,
    )
    print(f'Observed: {CLs_obs}, Expected: {CLs_exp}')
    # Given the observed data, what model parameters maximize the likelihood?
    best_fit = pyhf.optimizer.unconstrained_bestfit(
        pyhf.utils.loglambdav,
        pseudodata_hist + pdf.config.auxdata,
        pdf,
        pdf.config.suggested_init(),
        pdf.config.suggested_bounds(),
    )
    par_name_dict = {k: v['slice'].start for k, v in pdf.config.par_map.items()}
    best_fit_values = {k: best_fit[v] for k, v in par_name_dict.items()}
    fig, ax = plt.subplots(1, 1, figsize=(10, 7))
    plot(
        pseudodata_hist,
        pdf,
        par_name_dict,
        ax=ax,
        labels=[
            'Signal model (post fit)',
            'Background model (post fit)',
            'Observed data',
        ],
        **best_fit_values,
    )
    ax.plot(
        [], [], ' ', label=r'Signal strength $\mu$:' + f" {best_fit_values['mu']:.3f}"
    )
    plt.xlim(mass_range[0], mass_range[1])
    plt.xlabel('Invariant Mass [GeV]', fontsize=16)
    plt.ylabel(f'Number of Candidates [Count / {bin_width} GeV]', fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    # manipulate the ordering as I forget how to do this properly
    handles, labels = plt.gca().get_legend_handles_labels()
    bump_head_to_tail = lambda lst: [*lst[1:], lst[0]]
    plt.legend(bump_head_to_tail(handles), bump_head_to_tail(labels))
    # plt.show()
    fig.savefig('best_fit.pdf')


if __name__ == '__main__':
    fit_example()
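
For reference, the four example scenarios described in the comments of fit_example map onto calls like the following. This is a minimal sketch using only names from the script above; note that each call reuses the same random seed and overwrites the PDFs saved by the previous call, so in practice you would run one scenario at a time or rename the outputs.

# Example 1: small signal, background-only data
fit_example(n_bkg=10000, n_sig=25, frac_signal_embedded=0)
# Example 2: obvious bump in the model, background-only data
fit_example(n_bkg=10000, n_sig=100, frac_signal_embedded=0)
# Example 3: obvious bump, signal embedded in the data
fit_example(n_bkg=10000, n_sig=100, frac_signal_embedded=1)
# Example 4: signal enhancement beyond mu=1 (some new physics)
fit_example(n_bkg=10000, n_sig=100, frac_signal_embedded=1.2)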
matthewfeickert commented Mar 3, 2019

@stephensekula, if you find that NumPy is running into issues and hitting iteration limits, then I would suggest switching over to one of the tensor libraries that actually has autograd in it. For example, to do this with TensorFlow you'd just have to switch out the backend (after first installing the required additional libraries with pip install pyhf[tensorflow])

import tensorflow as tf

sess = tf.Session()
pyhf.set_backend(pyhf.tensor.tensorflow_backend(session=sess))

and then remember to evaluate the tensors in the session so that the values get printed properly

print(f'Observed: {sess.run(CLs_obs)}, Expected: {sess.run(CLs_exp)}')

The code stays the same; only the engine doing the calculations changes. 👍
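
To flip back to the default engine afterwards, the same mechanism works in reverse. A minimal sketch, assuming the 2019-era pyhf backend API used above:

pyhf.set_backend(pyhf.tensor.numpy_backend())  # restore the default NumPy backend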
