hqucms/calcPDFSyst.py

## calcPDFSyst.py
import numpy as np
import scipy.stats
import uproot
import awkward as ak
import yaml
import json
import os
import argparse


def _get_variable_names(expr, exclude=['abs']):
    import ast
    root = ast.parse(expr)
    return sorted({node.id for node in ast.walk(root) if isinstance(node, ast.Name)} - set(exclude))


def _pdfunc(arr):
    if len(arr) == 33:
        # PDF4LHC15_nnlo_30_pdfas
        delta = arr - arr[0]
        pdfunc = np.sqrt(np.sum(delta[1:31] ** 2))
        asunc = (arr[32] - arr[31]) / 2
        return np.sqrt(pdfunc**2 + asunc**2)
    elif len(arr) == 103:
        # NNPDF31_nnlo_hessian_pdfas
        delta = arr - arr[0]
        pdfunc = np.sqrt(np.sum(delta[1:101] ** 2))
        asunc = (arr[102] - arr[101]) / 2
        return np.sqrt(pdfunc**2 + asunc**2)
    elif len(arr) == 100:
        # NNPDF30_nlo_as_0118
        lo, hi = np.percentile(arr, [16, 84])
        return (hi - lo) / 2
    elif len(arr) == 101:
        # NNPDF30_lo_as_0118
        lo, hi = np.percentile(arr[1:], [16, 84])
        return (hi - lo) / 2
    elif len(arr) == 102:
        # NNPDF30_nlo_nf_5_pdfas
        lo, hi = np.percentile(arr[:100], [16, 84])
        pdfunc = (hi - lo) / 2
        asunc = (arr[101] - arr[100]) / 2
        return np.sqrt(pdfunc**2 + asunc**2)
    else:
        raise NotImplementedError


def load_cfg(opts):
    with open(os.path.join(opts['dc_dir'], 'nominal/config.json')) as f:
        cfg = yaml.safe_load(f)
    # cfg: {samp -> {'file':..., 'weight':..., 'selections': {'$CAT_NAME': ...}}}

    with open(opts['samp_config']) as f:
        samp_cfg = yaml.safe_load(f)

    for samp in cfg:
        sname = os.path.basename(cfg[samp]['file']).replace('_tree.root', '')
        if len(samp_cfg[sname]) == 1:
            cfg[samp]['parts'] = [cfg[samp]['file']]
        else:
            cfg[samp]['parts'] = [os.path.join(
                os.path.dirname(cfg[samp]['file']), 'parts',
                (n[0] if isinstance(n, list) else n) + '_tree.root') for n in samp_cfg[sname]]

    return cfg


def calc_unc(config, calc_pdf=True, calc_lep=True):
    # cat_name -> yields
    yields = {}
    pdfunc = {}
    lepunc = {}

    branches = _get_variable_names(config['weight'])
    for v in config['selections'].values():
        branches.extend(_get_variable_names(v))
    branches = set(branches)
    # print(branches)

    for p in config['parts']:
        # print(p)
        t = uproot.open(p)['Events']
        df = t.arrays(branches & set(t.keys()), library='pd')
        if len(df) == 0:
            # print('skipping empty file %s' % p)
            continue

        wgt = df.eval(config['weight'], engine='python')

        # ==================== nominal ====================
        catsels = {}
        for cat, cutstr in config['selections'].items():
            sel = df.eval(cutstr, engine='python')
            catsels[cat] = sel
            y = (wgt * sel).sum()
            if cat not in yields:
                yields[cat] = float(y)
            else:
                yields[cat] += float(y)

        # ==================== pdfunc ====================
        if calc_pdf:
            a = t.arrays(['LHEPdfWeight'])
            pdfwgt = a['LHEPdfWeight']

            num_pdfwgts = ak.to_numpy(ak.num(pdfwgt))
            if any(num_pdfwgts != num_pdfwgts[0]) or num_pdfwgts[0] == 0:
                # workaround for samples with different number of pdf wgts
                mode = scipy.stats.mode(num_pdfwgts)[0][0]
                print(p, 'num=', np.unique(num_pdfwgts), 'mode=', mode)
                if mode == 0:
                    mode = 33
                pdfwgt = ak.fill_none(ak.pad_none(pdfwgt, mode, clip=True), 1)

            pdfwgt = ak.to_numpy(pdfwgt)
            for cat, cutstr in config['selections'].items():
                sel = catsels[cat]
                w = (wgt * sel).values
                ypdf = (w[:, None] * pdfwgt).sum(axis=0)
                if cat not in pdfunc:
                    pdfunc[cat] = _pdfunc(ypdf)
                else:
                    pdfunc[cat] += _pdfunc(ypdf)

        # ==================== lepunc ====================
        if calc_lep:
            a = t.arrays(['muEffWeight_UP', 'muEffWeight_DOWN', 'elEffWeight_UP', 'elEffWeight_DOWN'], library='np')
            for cat, cutstr in config['selections'].items():
                sel = catsels[cat]
                for utype in a.keys():
                    wgtstr = config['weight'].replace(utype.split('_')[0], '1.0')
                    y = (df.eval(wgtstr, engine='python') * a[utype] * sel).sum()

                    if utype not in lepunc:
                        lepunc[utype] = {}

                    if cat not in lepunc[utype]:
                        lepunc[utype][cat] = float(y)
                    else:
                        lepunc[utype][cat] += float(y)

    if calc_pdf:
        for k, v in pdfunc.items():
            pdfunc[k] = float(1 + v / yields[k] if yields[k] > 0 else 1.001)

    if calc_lep:
        for utype in lepunc:
            for k, v in lepunc[utype].items():
                lepunc[utype][k] = float(v / yields[k] if yields[k] > 0 else 1.0)

    return {
        'yields': yields,
        'pdfunc': pdfunc,
        'lepunc': lepunc,
    }


def run(opts, validate=True):
    print('=' * 50)
    print(opts)

    results = {}
    cfg = load_cfg(opts)
    results = {k: calc_unc(v, calc_lep=opts['channel'] != '0L') for k, v in cfg.items()}
    results['_metadata'] = opts

    outpath = os.path.join(opts['dc_dir'], 'syst.json')
    with open(outpath, 'w') as fout:
        json.dump(results, fout, indent=2)
    print('Written output to %s' % outpath)

    if validate:
        with open(outpath) as f:
            rlts = yaml.safe_load(f)

        for samp in rlts:
            if 'yields' not in rlts[samp]:
                continue
            for cat, yields in rlts[samp]['yields'].items():
                fn = os.path.join(opts['dc_dir'], 'nominal/vhcc_%s.root' % cat)
                h = uproot.open(fn)[samp]
                tot = np.sum(h.to_numpy()[0])
                if abs(tot - yields) > 1e-4:
                    print(cat, yields, tot, 'diff=', tot - yields)

    return results


if __name__ == '__main__':
    parser = argparse.ArgumentParser('Calculate systematic uncertainties.')
    parser.add_argument('basedir', help='Base diretory.')
    parser.add_argument('-y', '--year', default='2016,2017,2018', help='Years. Default: %(default)s')
    parser.add_argument('-c', '--channel', default='0L,1L,2L', help='Channel. Default: %(default)s')
    parser.add_argument(
        '--config-dir', default='/data/hqu/cmssw/nano-vh/CMSSW_11_1_0_pre5_PY3/src/PhysicsTools/NanoTrees/run/samples',
        help='Directory containing sample configs used to produce the trees. Default: %(default)s')
    args = parser.parse_args()

    years = args.year.split(',')
    channels = args.channel.split(',')
    chn_dict = {'0L': 'ZnnH', '1L': 'WH', '2L': 'ZllH'}
    for year in years:
        for chn in channels:
            opts = {
                'year': year,
                'channel': chn,
                'dc_dir': '{basedir}/VH/Cards/{year}/{chnname}'.format(basedir=args.basedir, year=year, chnname=chn_dict[chn]),
                'samp_config': os.path.join(args.config_dir, year, '%s_mc.yaml' % chn)
            }
            run(opts)
	import numpy as np
	import scipy.stats
	import uproot
	import awkward as ak
	import yaml
	import json
	import os
	import argparse


	def _get_variable_names(expr, exclude=['abs']):
	import ast
	root = ast.parse(expr)
	return sorted({node.id for node in ast.walk(root) if isinstance(node, ast.Name)} - set(exclude))


	def _pdfunc(arr):
	if len(arr) == 33:
	# PDF4LHC15_nnlo_30_pdfas
	delta = arr - arr[0]
	pdfunc = np.sqrt(np.sum(delta[1:31] ** 2))
	asunc = (arr[32] - arr[31]) / 2
	return np.sqrt(pdfunc2 + asunc2)
	elif len(arr) == 103:
	# NNPDF31_nnlo_hessian_pdfas
	delta = arr - arr[0]
	pdfunc = np.sqrt(np.sum(delta[1:101] ** 2))
	asunc = (arr[102] - arr[101]) / 2
	return np.sqrt(pdfunc2 + asunc2)
	elif len(arr) == 100:
	# NNPDF30_nlo_as_0118
	lo, hi = np.percentile(arr, [16, 84])
	return (hi - lo) / 2
	elif len(arr) == 101:
	# NNPDF30_lo_as_0118
	lo, hi = np.percentile(arr[1:], [16, 84])
	return (hi - lo) / 2
	elif len(arr) == 102:
	# NNPDF30_nlo_nf_5_pdfas
	lo, hi = np.percentile(arr[:100], [16, 84])
	pdfunc = (hi - lo) / 2
	asunc = (arr[101] - arr[100]) / 2
	return np.sqrt(pdfunc2 + asunc2)
	else:
	raise NotImplementedError


	def load_cfg(opts):
	with open(os.path.join(opts['dc_dir'], 'nominal/config.json')) as f:
	cfg = yaml.safe_load(f)
	# cfg: {samp -> {'file':..., 'weight':..., 'selections': {'$CAT_NAME': ...}}}

	with open(opts['samp_config']) as f:
	samp_cfg = yaml.safe_load(f)

	for samp in cfg:
	sname = os.path.basename(cfg[samp]['file']).replace('_tree.root', '')
	if len(samp_cfg[sname]) == 1:
	cfg[samp]['parts'] = [cfg[samp]['file']]
	else:
	cfg[samp]['parts'] = [os.path.join(
	os.path.dirname(cfg[samp]['file']), 'parts',
	(n[0] if isinstance(n, list) else n) + '_tree.root') for n in samp_cfg[sname]]

	return cfg


	def calc_unc(config, calc_pdf=True, calc_lep=True):
	# cat_name -> yields
	yields = {}
	pdfunc = {}
	lepunc = {}

	branches = _get_variable_names(config['weight'])
	for v in config['selections'].values():
	branches.extend(_get_variable_names(v))
	branches = set(branches)
	# print(branches)

	for p in config['parts']:
	# print(p)
	t = uproot.open(p)['Events']
	df = t.arrays(branches & set(t.keys()), library='pd')
	if len(df) == 0:
	# print('skipping empty file %s' % p)
	continue

	wgt = df.eval(config['weight'], engine='python')

	# ==================== nominal ====================
	catsels = {}
	for cat, cutstr in config['selections'].items():
	sel = df.eval(cutstr, engine='python')
	catsels[cat] = sel
	y = (wgt * sel).sum()
	if cat not in yields:
	yields[cat] = float(y)
	else:
	yields[cat] += float(y)

	# ==================== pdfunc ====================
	if calc_pdf:
	a = t.arrays(['LHEPdfWeight'])
	pdfwgt = a['LHEPdfWeight']

	num_pdfwgts = ak.to_numpy(ak.num(pdfwgt))
	if any(num_pdfwgts != num_pdfwgts[0]) or num_pdfwgts[0] == 0:
	# workaround for samples with different number of pdf wgts
	mode = scipy.stats.mode(num_pdfwgts)[0][0]
	print(p, 'num=', np.unique(num_pdfwgts), 'mode=', mode)
	if mode == 0:
	mode = 33
	pdfwgt = ak.fill_none(ak.pad_none(pdfwgt, mode, clip=True), 1)

	pdfwgt = ak.to_numpy(pdfwgt)
	for cat, cutstr in config['selections'].items():
	sel = catsels[cat]
	w = (wgt * sel).values
	ypdf = (w[:, None] * pdfwgt).sum(axis=0)
	if cat not in pdfunc:
	pdfunc[cat] = _pdfunc(ypdf)
	else:
	pdfunc[cat] += _pdfunc(ypdf)

	# ==================== lepunc ====================
	if calc_lep:
	a = t.arrays(['muEffWeight_UP', 'muEffWeight_DOWN', 'elEffWeight_UP', 'elEffWeight_DOWN'], library='np')
	for cat, cutstr in config['selections'].items():
	sel = catsels[cat]
	for utype in a.keys():
	wgtstr = config['weight'].replace(utype.split('_')[0], '1.0')
	y = (df.eval(wgtstr, engine='python') * a[utype] * sel).sum()

	if utype not in lepunc:
	lepunc[utype] = {}

	if cat not in lepunc[utype]:
	lepunc[utype][cat] = float(y)
	else:
	lepunc[utype][cat] += float(y)

	if calc_pdf:
	for k, v in pdfunc.items():
	pdfunc[k] = float(1 + v / yields[k] if yields[k] > 0 else 1.001)

	if calc_lep:
	for utype in lepunc:
	for k, v in lepunc[utype].items():
	lepunc[utype][k] = float(v / yields[k] if yields[k] > 0 else 1.0)

	return {
	'yields': yields,
	'pdfunc': pdfunc,
	'lepunc': lepunc,
	}


	def run(opts, validate=True):
	print('=' * 50)
	print(opts)

	results = {}
	cfg = load_cfg(opts)
	results = {k: calc_unc(v, calc_lep=opts['channel'] != '0L') for k, v in cfg.items()}
	results['_metadata'] = opts

	outpath = os.path.join(opts['dc_dir'], 'syst.json')
	with open(outpath, 'w') as fout:
	json.dump(results, fout, indent=2)
	print('Written output to %s' % outpath)

	if validate:
	with open(outpath) as f:
	rlts = yaml.safe_load(f)

	for samp in rlts:
	if 'yields' not in rlts[samp]:
	continue
	for cat, yields in rlts[samp]['yields'].items():
	fn = os.path.join(opts['dc_dir'], 'nominal/vhcc_%s.root' % cat)
	h = uproot.open(fn)[samp]
	tot = np.sum(h.to_numpy()[0])
	if abs(tot - yields) > 1e-4:
	print(cat, yields, tot, 'diff=', tot - yields)

	return results


	if __name__ == '__main__':
	parser = argparse.ArgumentParser('Calculate systematic uncertainties.')
	parser.add_argument('basedir', help='Base diretory.')
	parser.add_argument('-y', '--year', default='2016,2017,2018', help='Years. Default: %(default)s')
	parser.add_argument('-c', '--channel', default='0L,1L,2L', help='Channel. Default: %(default)s')
	parser.add_argument(
	'--config-dir', default='/data/hqu/cmssw/nano-vh/CMSSW_11_1_0_pre5_PY3/src/PhysicsTools/NanoTrees/run/samples',
	help='Directory containing sample configs used to produce the trees. Default: %(default)s')
	args = parser.parse_args()

	years = args.year.split(',')
	channels = args.channel.split(',')
	chn_dict = {'0L': 'ZnnH', '1L': 'WH', '2L': 'ZllH'}
	for year in years:
	for chn in channels:
	opts = {
	'year': year,
	'channel': chn,
	'dc_dir': '{basedir}/VH/Cards/{year}/{chnname}'.format(basedir=args.basedir, year=year, chnname=chn_dict[chn]),
	'samp_config': os.path.join(args.config_dir, year, '%s_mc.yaml' % chn)
	}
	run(opts)