Skip to content

Instantly share code, notes, and snippets.

@alexpearce
Created December 21, 2016 13:42
Show Gist options
  • Save alexpearce/bc6d675c749333a7c7067a062e756cd6 to your computer and use it in GitHub Desktop.
Save alexpearce/bc6d675c749333a7c7067a062e756cd6 to your computer and use it in GitHub Desktop.
Ue RooFit PDFs with hep_ml.splot
def pdf_probabilities(xs, dependents, yields, pdfs):
"""Return an array of normalised PDF probabilities.
For a list of N yields, each y_i, and a list of N PDFs f_i, the probability
for the ith PDF at the (possibly vector) point x is defined as
p_{i} = \frac{N_{i}f_{i}}{\sum_{i}^{N} N_{i}f_{i}}
Keyword arguments:
xs -- List of values at which to evaluate the PDFs
dependent -- List of same length as xs, as the RooRealVar objects upon
which each PDF depends
yields -- List of RooRealVar objects which hold the fitted values of the
PDF yields
pdfs -- List of RooAbsPdf objects to evaluate the probabilities for
"""
xs = utilities.as_list(xs)
dependents = utilities.as_list(dependents)
depset = ROOT.RooArgSet(*dependents)
# Set the current value of the dependents
for x, dep in zip(xs, dependents):
dep.setVal(x)
ps = []
normalisation = 0
for y, pdf in zip(yields, pdfs):
a = y.getVal()*pdf.getVal(depset)
ps.append(a)
normalisation += a
return [p/normalisation for p in ps]
def add_sweights(data, w=None):
"""Add sWeights from the workspace to the data's DataFrame member.
This method will filter out events from data.df that do not have
corresponding sWeights, i.e. those that did not enter the fit.
If w is None, the default workspace for this dataset will be retrieved.
"""
if w is None:
fpath = workspace_path(data, ['unweighted', 'no-simultaneous'])
with utilities.open_root(fpath) as f:
w = f.Get('w')
pdf_sig = w.pdf('pdf_sig')
pdf_bkg = w.pdf('pdf_bkg')
yield_sig = w.var('yield_sig')
yield_bkg = w.var('yield_bkg')
pdfs = [pdf_sig, pdf_bkg]
yields = [yield_sig, yield_bkg]
dependents = w.var('Lb_DTF_Lc_M')
rdata = w.data(data_name(binned=False))
index, p_sig, p_bkg = [], [], []
for idx in range(rdata.numEntries()):
row = rdata.get(idx)
x = row.find('Lb_DTF_Lc_M').getVal()
ps, pb = pdf_probabilities(x, dependents, yields, pdfs)
p_sig.append(ps)
p_bkg.append(pb)
index.append(int(row.find('__index__').getVal()))
probabilities = pd.DataFrame(dict(sig_sw=p_sig, bkg_sw=p_bkg))
sweights = splot.compute_sweights(probabilities)
# By setting the index of the sWeight DF to the list of indices entering
# the fit, entries in data.df that didn't enter the fit will receive
# sWeight values of NaN, which we can filter out
sweights.index = index
sweights['sum_sw'] = sweights.sig_sw + sweights.bkg_sw
# Drop any existing sWeight columns
try:
data.df.drop(sweights.columns, axis=1, inplace=True)
except ValueError:
pass
data.df = pd.concat([data.df, sweights], axis=1)
data.df.dropna(subset=sweights.columns, inplace=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment