Created
August 12, 2021 15:51
-
-
Save hqucms/f71a0223e04452538ee2c8af7cfdf0a1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import scipy.stats | |
import uproot | |
import awkward as ak | |
import yaml | |
import json | |
import os | |
import argparse | |
def _get_variable_names(expr, exclude=['abs']): | |
import ast | |
root = ast.parse(expr) | |
return sorted({node.id for node in ast.walk(root) if isinstance(node, ast.Name)} - set(exclude)) | |
def _pdfunc(arr): | |
if len(arr) == 33: | |
# PDF4LHC15_nnlo_30_pdfas | |
delta = arr - arr[0] | |
pdfunc = np.sqrt(np.sum(delta[1:31] ** 2)) | |
asunc = (arr[32] - arr[31]) / 2 | |
return np.sqrt(pdfunc**2 + asunc**2) | |
elif len(arr) == 103: | |
# NNPDF31_nnlo_hessian_pdfas | |
delta = arr - arr[0] | |
pdfunc = np.sqrt(np.sum(delta[1:101] ** 2)) | |
asunc = (arr[102] - arr[101]) / 2 | |
return np.sqrt(pdfunc**2 + asunc**2) | |
elif len(arr) == 100: | |
# NNPDF30_nlo_as_0118 | |
lo, hi = np.percentile(arr, [16, 84]) | |
return (hi - lo) / 2 | |
elif len(arr) == 101: | |
# NNPDF30_lo_as_0118 | |
lo, hi = np.percentile(arr[1:], [16, 84]) | |
return (hi - lo) / 2 | |
elif len(arr) == 102: | |
# NNPDF30_nlo_nf_5_pdfas | |
lo, hi = np.percentile(arr[:100], [16, 84]) | |
pdfunc = (hi - lo) / 2 | |
asunc = (arr[101] - arr[100]) / 2 | |
return np.sqrt(pdfunc**2 + asunc**2) | |
else: | |
raise NotImplementedError | |
def load_cfg(opts): | |
with open(os.path.join(opts['dc_dir'], 'nominal/config.json')) as f: | |
cfg = yaml.safe_load(f) | |
# cfg: {samp -> {'file':..., 'weight':..., 'selections': {'$CAT_NAME': ...}}} | |
with open(opts['samp_config']) as f: | |
samp_cfg = yaml.safe_load(f) | |
for samp in cfg: | |
sname = os.path.basename(cfg[samp]['file']).replace('_tree.root', '') | |
if len(samp_cfg[sname]) == 1: | |
cfg[samp]['parts'] = [cfg[samp]['file']] | |
else: | |
cfg[samp]['parts'] = [os.path.join( | |
os.path.dirname(cfg[samp]['file']), 'parts', | |
(n[0] if isinstance(n, list) else n) + '_tree.root') for n in samp_cfg[sname]] | |
return cfg | |
def calc_unc(config, calc_pdf=True, calc_lep=True): | |
# cat_name -> yields | |
yields = {} | |
pdfunc = {} | |
lepunc = {} | |
branches = _get_variable_names(config['weight']) | |
for v in config['selections'].values(): | |
branches.extend(_get_variable_names(v)) | |
branches = set(branches) | |
# print(branches) | |
for p in config['parts']: | |
# print(p) | |
t = uproot.open(p)['Events'] | |
df = t.arrays(branches & set(t.keys()), library='pd') | |
if len(df) == 0: | |
# print('skipping empty file %s' % p) | |
continue | |
wgt = df.eval(config['weight'], engine='python') | |
# ==================== nominal ==================== | |
catsels = {} | |
for cat, cutstr in config['selections'].items(): | |
sel = df.eval(cutstr, engine='python') | |
catsels[cat] = sel | |
y = (wgt * sel).sum() | |
if cat not in yields: | |
yields[cat] = float(y) | |
else: | |
yields[cat] += float(y) | |
# ==================== pdfunc ==================== | |
if calc_pdf: | |
a = t.arrays(['LHEPdfWeight']) | |
pdfwgt = a['LHEPdfWeight'] | |
num_pdfwgts = ak.to_numpy(ak.num(pdfwgt)) | |
if any(num_pdfwgts != num_pdfwgts[0]) or num_pdfwgts[0] == 0: | |
# workaround for samples with different number of pdf wgts | |
mode = scipy.stats.mode(num_pdfwgts)[0][0] | |
print(p, 'num=', np.unique(num_pdfwgts), 'mode=', mode) | |
if mode == 0: | |
mode = 33 | |
pdfwgt = ak.fill_none(ak.pad_none(pdfwgt, mode, clip=True), 1) | |
pdfwgt = ak.to_numpy(pdfwgt) | |
for cat, cutstr in config['selections'].items(): | |
sel = catsels[cat] | |
w = (wgt * sel).values | |
ypdf = (w[:, None] * pdfwgt).sum(axis=0) | |
if cat not in pdfunc: | |
pdfunc[cat] = _pdfunc(ypdf) | |
else: | |
pdfunc[cat] += _pdfunc(ypdf) | |
# ==================== lepunc ==================== | |
if calc_lep: | |
a = t.arrays(['muEffWeight_UP', 'muEffWeight_DOWN', 'elEffWeight_UP', 'elEffWeight_DOWN'], library='np') | |
for cat, cutstr in config['selections'].items(): | |
sel = catsels[cat] | |
for utype in a.keys(): | |
wgtstr = config['weight'].replace(utype.split('_')[0], '1.0') | |
y = (df.eval(wgtstr, engine='python') * a[utype] * sel).sum() | |
if utype not in lepunc: | |
lepunc[utype] = {} | |
if cat not in lepunc[utype]: | |
lepunc[utype][cat] = float(y) | |
else: | |
lepunc[utype][cat] += float(y) | |
if calc_pdf: | |
for k, v in pdfunc.items(): | |
pdfunc[k] = float(1 + v / yields[k] if yields[k] > 0 else 1.001) | |
if calc_lep: | |
for utype in lepunc: | |
for k, v in lepunc[utype].items(): | |
lepunc[utype][k] = float(v / yields[k] if yields[k] > 0 else 1.0) | |
return { | |
'yields': yields, | |
'pdfunc': pdfunc, | |
'lepunc': lepunc, | |
} | |
def run(opts, validate=True): | |
print('=' * 50) | |
print(opts) | |
results = {} | |
cfg = load_cfg(opts) | |
results = {k: calc_unc(v, calc_lep=opts['channel'] != '0L') for k, v in cfg.items()} | |
results['_metadata'] = opts | |
outpath = os.path.join(opts['dc_dir'], 'syst.json') | |
with open(outpath, 'w') as fout: | |
json.dump(results, fout, indent=2) | |
print('Written output to %s' % outpath) | |
if validate: | |
with open(outpath) as f: | |
rlts = yaml.safe_load(f) | |
for samp in rlts: | |
if 'yields' not in rlts[samp]: | |
continue | |
for cat, yields in rlts[samp]['yields'].items(): | |
fn = os.path.join(opts['dc_dir'], 'nominal/vhcc_%s.root' % cat) | |
h = uproot.open(fn)[samp] | |
tot = np.sum(h.to_numpy()[0]) | |
if abs(tot - yields) > 1e-4: | |
print(cat, yields, tot, 'diff=', tot - yields) | |
return results | |
if __name__ == '__main__': | |
parser = argparse.ArgumentParser('Calculate systematic uncertainties.') | |
parser.add_argument('basedir', help='Base diretory.') | |
parser.add_argument('-y', '--year', default='2016,2017,2018', help='Years. Default: %(default)s') | |
parser.add_argument('-c', '--channel', default='0L,1L,2L', help='Channel. Default: %(default)s') | |
parser.add_argument( | |
'--config-dir', default='/data/hqu/cmssw/nano-vh/CMSSW_11_1_0_pre5_PY3/src/PhysicsTools/NanoTrees/run/samples', | |
help='Directory containing sample configs used to produce the trees. Default: %(default)s') | |
args = parser.parse_args() | |
years = args.year.split(',') | |
channels = args.channel.split(',') | |
chn_dict = {'0L': 'ZnnH', '1L': 'WH', '2L': 'ZllH'} | |
for year in years: | |
for chn in channels: | |
opts = { | |
'year': year, | |
'channel': chn, | |
'dc_dir': '{basedir}/VH/Cards/{year}/{chnname}'.format(basedir=args.basedir, year=year, chnname=chn_dict[chn]), | |
'samp_config': os.path.join(args.config_dir, year, '%s_mc.yaml' % chn) | |
} | |
run(opts) |
Author
hqucms
commented
Aug 12, 2021
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment