Skip to content

Instantly share code, notes, and snippets.

@hqucms
Created August 12, 2021 15:51
Show Gist options
  • Save hqucms/f71a0223e04452538ee2c8af7cfdf0a1 to your computer and use it in GitHub Desktop.
Save hqucms/f71a0223e04452538ee2c8af7cfdf0a1 to your computer and use it in GitHub Desktop.
import numpy as np
import scipy.stats
import uproot
import awkward as ak
import yaml
import json
import os
import argparse
def _get_variable_names(expr, exclude=['abs']):
import ast
root = ast.parse(expr)
return sorted({node.id for node in ast.walk(root) if isinstance(node, ast.Name)} - set(exclude))
def _pdfunc(arr):
if len(arr) == 33:
# PDF4LHC15_nnlo_30_pdfas
delta = arr - arr[0]
pdfunc = np.sqrt(np.sum(delta[1:31] ** 2))
asunc = (arr[32] - arr[31]) / 2
return np.sqrt(pdfunc**2 + asunc**2)
elif len(arr) == 103:
# NNPDF31_nnlo_hessian_pdfas
delta = arr - arr[0]
pdfunc = np.sqrt(np.sum(delta[1:101] ** 2))
asunc = (arr[102] - arr[101]) / 2
return np.sqrt(pdfunc**2 + asunc**2)
elif len(arr) == 100:
# NNPDF30_nlo_as_0118
lo, hi = np.percentile(arr, [16, 84])
return (hi - lo) / 2
elif len(arr) == 101:
# NNPDF30_lo_as_0118
lo, hi = np.percentile(arr[1:], [16, 84])
return (hi - lo) / 2
elif len(arr) == 102:
# NNPDF30_nlo_nf_5_pdfas
lo, hi = np.percentile(arr[:100], [16, 84])
pdfunc = (hi - lo) / 2
asunc = (arr[101] - arr[100]) / 2
return np.sqrt(pdfunc**2 + asunc**2)
else:
raise NotImplementedError
def load_cfg(opts):
with open(os.path.join(opts['dc_dir'], 'nominal/config.json')) as f:
cfg = yaml.safe_load(f)
# cfg: {samp -> {'file':..., 'weight':..., 'selections': {'$CAT_NAME': ...}}}
with open(opts['samp_config']) as f:
samp_cfg = yaml.safe_load(f)
for samp in cfg:
sname = os.path.basename(cfg[samp]['file']).replace('_tree.root', '')
if len(samp_cfg[sname]) == 1:
cfg[samp]['parts'] = [cfg[samp]['file']]
else:
cfg[samp]['parts'] = [os.path.join(
os.path.dirname(cfg[samp]['file']), 'parts',
(n[0] if isinstance(n, list) else n) + '_tree.root') for n in samp_cfg[sname]]
return cfg
def calc_unc(config, calc_pdf=True, calc_lep=True):
# cat_name -> yields
yields = {}
pdfunc = {}
lepunc = {}
branches = _get_variable_names(config['weight'])
for v in config['selections'].values():
branches.extend(_get_variable_names(v))
branches = set(branches)
# print(branches)
for p in config['parts']:
# print(p)
t = uproot.open(p)['Events']
df = t.arrays(branches & set(t.keys()), library='pd')
if len(df) == 0:
# print('skipping empty file %s' % p)
continue
wgt = df.eval(config['weight'], engine='python')
# ==================== nominal ====================
catsels = {}
for cat, cutstr in config['selections'].items():
sel = df.eval(cutstr, engine='python')
catsels[cat] = sel
y = (wgt * sel).sum()
if cat not in yields:
yields[cat] = float(y)
else:
yields[cat] += float(y)
# ==================== pdfunc ====================
if calc_pdf:
a = t.arrays(['LHEPdfWeight'])
pdfwgt = a['LHEPdfWeight']
num_pdfwgts = ak.to_numpy(ak.num(pdfwgt))
if any(num_pdfwgts != num_pdfwgts[0]) or num_pdfwgts[0] == 0:
# workaround for samples with different number of pdf wgts
mode = scipy.stats.mode(num_pdfwgts)[0][0]
print(p, 'num=', np.unique(num_pdfwgts), 'mode=', mode)
if mode == 0:
mode = 33
pdfwgt = ak.fill_none(ak.pad_none(pdfwgt, mode, clip=True), 1)
pdfwgt = ak.to_numpy(pdfwgt)
for cat, cutstr in config['selections'].items():
sel = catsels[cat]
w = (wgt * sel).values
ypdf = (w[:, None] * pdfwgt).sum(axis=0)
if cat not in pdfunc:
pdfunc[cat] = _pdfunc(ypdf)
else:
pdfunc[cat] += _pdfunc(ypdf)
# ==================== lepunc ====================
if calc_lep:
a = t.arrays(['muEffWeight_UP', 'muEffWeight_DOWN', 'elEffWeight_UP', 'elEffWeight_DOWN'], library='np')
for cat, cutstr in config['selections'].items():
sel = catsels[cat]
for utype in a.keys():
wgtstr = config['weight'].replace(utype.split('_')[0], '1.0')
y = (df.eval(wgtstr, engine='python') * a[utype] * sel).sum()
if utype not in lepunc:
lepunc[utype] = {}
if cat not in lepunc[utype]:
lepunc[utype][cat] = float(y)
else:
lepunc[utype][cat] += float(y)
if calc_pdf:
for k, v in pdfunc.items():
pdfunc[k] = float(1 + v / yields[k] if yields[k] > 0 else 1.001)
if calc_lep:
for utype in lepunc:
for k, v in lepunc[utype].items():
lepunc[utype][k] = float(v / yields[k] if yields[k] > 0 else 1.0)
return {
'yields': yields,
'pdfunc': pdfunc,
'lepunc': lepunc,
}
def run(opts, validate=True):
print('=' * 50)
print(opts)
results = {}
cfg = load_cfg(opts)
results = {k: calc_unc(v, calc_lep=opts['channel'] != '0L') for k, v in cfg.items()}
results['_metadata'] = opts
outpath = os.path.join(opts['dc_dir'], 'syst.json')
with open(outpath, 'w') as fout:
json.dump(results, fout, indent=2)
print('Written output to %s' % outpath)
if validate:
with open(outpath) as f:
rlts = yaml.safe_load(f)
for samp in rlts:
if 'yields' not in rlts[samp]:
continue
for cat, yields in rlts[samp]['yields'].items():
fn = os.path.join(opts['dc_dir'], 'nominal/vhcc_%s.root' % cat)
h = uproot.open(fn)[samp]
tot = np.sum(h.to_numpy()[0])
if abs(tot - yields) > 1e-4:
print(cat, yields, tot, 'diff=', tot - yields)
return results
if __name__ == '__main__':
parser = argparse.ArgumentParser('Calculate systematic uncertainties.')
parser.add_argument('basedir', help='Base diretory.')
parser.add_argument('-y', '--year', default='2016,2017,2018', help='Years. Default: %(default)s')
parser.add_argument('-c', '--channel', default='0L,1L,2L', help='Channel. Default: %(default)s')
parser.add_argument(
'--config-dir', default='/data/hqu/cmssw/nano-vh/CMSSW_11_1_0_pre5_PY3/src/PhysicsTools/NanoTrees/run/samples',
help='Directory containing sample configs used to produce the trees. Default: %(default)s')
args = parser.parse_args()
years = args.year.split(',')
channels = args.channel.split(',')
chn_dict = {'0L': 'ZnnH', '1L': 'WH', '2L': 'ZllH'}
for year in years:
for chn in channels:
opts = {
'year': year,
'channel': chn,
'dc_dir': '{basedir}/VH/Cards/{year}/{chnname}'.format(basedir=args.basedir, year=year, chnname=chn_dict[chn]),
'samp_config': os.path.join(args.config_dir, year, '%s_mc.yaml' % chn)
}
run(opts)
@hqucms
Copy link
Author

hqucms commented Aug 12, 2021

Screen Shot 2021-08-12 at 17 52 47

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment