@matthewfeickert
Last active December 22, 2020 06:59
Multi parameter example

Question (from Matt Bellis, via email)
Suppose I have multiple background contributions and I don't have a good idea of their normalizations. In fact, my uncertainty on their normalizations is greater than the Poisson uncertainty on each individual bin. What I'd ideally like to do then is generate a whole bunch of MC so I get a good description of the shape when I histogram it. Then, I'd like the fit to have the freedom to float both the normalization on any given background and float the individual bins, taking into account that the Poisson uncertainties will change as the normalization changes.
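The key statistical point in the question — that the per-bin Poisson uncertainty changes with the normalization — can be sketched with a small numpy-only example (the template size and scale factors here are illustrative, not from the original discussion):

```python
import numpy as np

# The relative Poisson uncertainty per bin is 1/sqrt(N), so scaling a
# template's normalization down makes the relative uncertainty grow.
np.random.seed(0)
template = np.histogram(10 * np.random.random(100_000), bins=20, range=(0, 10))[0]

for scale in (1.0, 0.01):
    expected = scale * template
    rel_uncert = 1 / np.sqrt(expected)  # Poisson: sigma/N = 1/sqrt(N)
    print(f"scale={scale}: mean relative per-bin uncertainty = {rel_uncert.mean():.3f}")
```

This is why a fit that floats the normalization must also let the per-bin uncertainties respond to it.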

Discussion Summary

There is a Monte Carlo template with a large enough number of entries to give confidence in the shape of the background distribution. The goal is to use these samples in a search with a much lower number of entries.

Once this has been done, the best fit values of the signal and background model normalizations would be obtained from a fit of the model to data. The best fit background normalization would then be used to construct a new model in which the background sample is scaled by that value, and hypothesis testing would be performed for different signal model strengths.

This is basically the second part of this Stack Overflow question.
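The second step described above — folding a best-fit background normalization into a new model spec with a pre-scaled background sample — can be sketched as plain dictionary manipulation. The `bestfit_bkg_norm` value and the bin contents here are illustrative assumptions:

```python
import numpy as np

# Hypothetical best-fit value from the first fit
bestfit_bkg_norm = 1.5

background = np.array([10.0, 20.0, 5.0])
spec = {
    "channels": [
        {
            "name": "single_channel",
            "samples": [
                {
                    "name": "background",
                    "data": background.tolist(),
                    "modifiers": [],
                }
            ],
        }
    ]
}

# Rescale the background sample in place, so follow-up hypothesis tests
# only need the signal strength as a free parameter
sample = spec["channels"][0]["samples"][0]
sample["data"] = (bestfit_bkg_norm * np.asarray(sample["data"])).tolist()
print(sample["data"])  # [15.0, 30.0, 7.5]
```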

Proposed Solution

For the Monte Carlo templates you want to construct a model that is representative of what you think you're going to see, so you should construct samples with your hypothesis of the number of background events you expect. This can be done by just scaling the sample by the appropriate amount (expected yield/current yield) so that the integrated count matches the expectation. This retains the shape information while giving you the correct per-bin statistical uncertainty. Expecting the fit to drive down the sample normalizations by perhaps several orders of magnitude makes things more complicated than they need to be for the fit.
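A minimal numpy sketch of the rescaling described above. The expected yield of 500 is a hypothetical number, not from the original discussion:

```python
import numpy as np

np.random.seed(0)
mc_template, _ = np.histogram(10 * np.random.random(20_000), bins=20, range=(0, 10))

expected_yield = 500  # assumed expectation for illustration
scale = expected_yield / mc_template.sum()

# Scale the counts so the integral matches the expectation; the shape
# (bin-to-bin ratios) is unchanged
scaled_template = scale * mc_template

# Per-bin statistical uncertainty at the expected yield
scaled_uncert = np.sqrt(scaled_template)

print(f"{scaled_template.sum():.1f}")  # 500.0
```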

It is not clear why you need to first fit for both the signal and background normalizations (a 2 POI fit, in some sense) if you are then just going to perform hypothesis testing for the signal model.

import json

import numpy as np
import pyhf


def generate_simulation():
    np.random.seed(0)
    observable_range = [0.0, 10.0]
    bin_width = 0.5
    _bins = np.arange(observable_range[0], observable_range[1] + bin_width, bin_width)
    # _bins holds the bin edges, so the number of bins is one less
    print(f"nbins = {len(_bins) - 1}")

    n_bkg = 20_000
    n_signal = 10_000

    # Generate simulation
    bkg_simulation = 10 * np.random.random(n_bkg)
    signal_simulation = np.random.normal(5, 1.0, n_signal)

    bkg_sample, _ = np.histogram(bkg_simulation, bins=_bins)
    signal_sample, _ = np.histogram(signal_simulation, bins=_bins)

    # Generate observations
    # signal_events = np.random.normal(5, 1.0, 1000)
    # bkg_events = 10 * np.random.random(200)
    signal_events = np.random.normal(5, 1.0, int(n_signal * 0.8))
    bkg_events = 10 * np.random.random(int(n_bkg * 1.4))

    observed_events = np.array(signal_events.tolist() + bkg_events.tolist())
    observed_sample, _ = np.histogram(observed_events, bins=_bins)

    return observed_sample, bkg_sample, signal_sample


def build_spec(signal, background, background_uncert):
    spec = {
        "channels": [
            {
                "name": "single_channel",
                "samples": [
                    {
                        "name": "signal",
                        "data": signal.tolist(),
                        "modifiers": [
                            {"name": "mu", "type": "normfactor", "data": None}
                        ],
                    },
                    {
                        "name": "background",
                        "data": background.tolist(),
                        "modifiers": [
                            {
                                "name": "uncorr_bkguncrt",
                                "type": "shapesys",
                                "data": background_uncert.tolist(),
                            },
                            {"name": "bkg_norm", "type": "normfactor", "data": None},
                            {
                                "name": "bkg_norm_sys",
                                "type": "normsys",
                                "data": {"hi": 1.1, "lo": 0.9},
                            },
                        ],
                    },
                ],
            }
        ]
    }
    return spec


def _get_par_slice(model, par_name):
    # Look up the slice for a named parameter in the fitted parameter vector
    return model.config.par_map[par_name]["slice"]


def main():
    observed_sample, bkg_sample, signal_sample = generate_simulation()

    # Poisson uncertainty on the background template
    bkg_uncert = np.sqrt(bkg_sample)

    spec = build_spec(signal_sample, bkg_sample, bkg_uncert)
    model = pyhf.Model(spec)

    data = pyhf.tensorlib.astensor(observed_sample.tolist() + model.config.auxdata)
    bestfit_pars = pyhf.infer.mle.fit(data, model)

    print()
    for name in model.config.parameters:
        print(f"best fit {name}: {bestfit_pars[_get_par_slice(model, name)]}")

    with open("workspace.json", "w") as serialization:
        json.dump(model.spec, serialization)


if __name__ == "__main__":
    main()