Created
February 20, 2024 10:32
-
-
Save giorgiopizz/d10dd7f7ef722fa1cc24518ce5f15ea1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Main script for the creation of shapes, starting from a configuration folder. | |
It gives the option to compile the configuration folder and save it as both JSON and pickle file. | |
It also gives the option to run the analysis in batch mode, or to check for errors in the batch submission. | |
The analysis can be run in batch mode or locally. | |
If run in batch mode it gives the ability to merge the output root files. | |
""" | |
import sys | |
from pathlib import Path | |
import argparse | |
import os | |
import glob | |
import subprocess | |
import ROOT | |
ROOT.gROOT.SetBatch(True) | |
headersPath = os.path.dirname(os.path.dirname(__file__)) + "/include/headers.hh" | |
ROOT.gInterpreter.Declare(f'#include "{headersPath}"') | |
import numpy as np | |
# def hist2array(h, overflow=False, copy=True): | |
# shift = 0 | |
# if overflow: | |
# shift = 1 | |
# d = [] | |
# for i in range(1 - shift, h.GetNbinsX() + 1 + shift): | |
# d.append(h.GetBinContent(i)) | |
# d = np.array(d) | |
# return d | |
def hist2array(h, flow=False, copy=True, include_sumw2=False):
    """Convert a ROOT 1D histogram into numpy array(s) of bin contents.

    Parameters
    ----------
    h : ROOT.TH1D | ROOT.TH1F | ROOT.TH1I
        source histogram
    flow : bool, optional, default: False
        if True keep the underflow/overflow bins, otherwise strip them
    copy : bool, optional, default: True
        if True return copies detached from the histogram's internal buffers
    include_sumw2 : bool, optional, default: False
        if True also return the array of summed squared weights

    Returns
    -------
    numpy.ndarray or (numpy.ndarray, numpy.ndarray)
        bin contents (and sumw2 when requested)
    """
    nx = h.GetNbinsX() + 2  # includes underflow (bin 0) and overflow (nbins+1)
    # dtype must match the C type of the histogram's internal buffer
    # (np.float / np.int aliases were removed in NumPy 1.24; np.float was
    # float64 anyway, which would misread TH1F's Float_t buffer)
    if isinstance(h, ROOT.TH1D):
        dtype = np.double   # Double_t
    elif isinstance(h, ROOT.TH1F):
        dtype = np.float32  # Float_t
    elif isinstance(h, ROOT.TH1I):
        dtype = np.int32    # Int_t
    else:
        print('Histogram of type', h, 'is not supported', file=sys.stderr)
        sys.exit(1)
    vals = np.ndarray((nx, ), dtype=dtype, buffer=h.GetArray())
    # GetSumw2 is a TArrayD, i.e. always double regardless of the histogram type
    sumw2 = np.ndarray((nx, ), dtype=np.double, buffer=h.GetSumw2().GetArray())
    shift = 0 if flow else 1
    vals = vals[shift: nx-shift]
    sumw2 = sumw2[shift: nx-shift]  # was assigned to 'sum2', leaving sumw2 unsliced
    if copy:
        vals = vals.copy()
        sumw2 = sumw2.copy()
    if include_sumw2:
        return vals, sumw2
    return vals
def defaultParser():
    """Build the command-line argument parser for the shapes-making script.

    Returns
    -------
    argparse.ArgumentParser
        parser exposing compile mode, operation mode, batch flags, paths,
        event limit and resubmission options used by ``main``
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c",
        "--compile",
        type=int,
        choices=[0, 1],
        help="0 compile configuration and save JSON, 1 load compiled configuration",
        required=False,
        default=0,
    )
    parser.add_argument(
        "-o",
        "--operationMode",
        type=int,
        choices=[0, 1, 2],
        help="0 do analysis in batch, 1 check for errors in batch submission, "
        "2 hadd root files",
        required=False,
        default=-1,  # sentinel: no operation selected
    )
    parser.add_argument(
        "-b",
        "--doBatch",
        # was untyped (string values); int with choices validates the input
        # and is backward-compatible since main() applies int() anyway
        type=int,
        choices=[0, 1],
        help="0 (default) runs on local, 1 runs with condor",
        required=False,
        default=0,
    )
    parser.add_argument(
        "-dR",
        "--dryRun",
        choices=[0, 1],
        type=int,
        help="1 do not submit to condor",
        required=False,
        default=0,
    )
    parser.add_argument(
        "-f", "--folder", help="Path to folder", required=False, default="."
    )
    parser.add_argument(
        "-configs",
        "--configsFolder",
        help="Path to configurations folder",
        required=False,
        default="configs",
    )
    parser.add_argument(
        "-config",
        "--configFile",
        help="Path to configuration JSON file to load",
        required=False,
        default="latest",
    )
    parser.add_argument(
        "-l",
        "--limitEvents",
        type=int,
        help="Max number of events",
        required=False,
        default=-1,
    )
    parser.add_argument(
        "-r",
        "--resubmit",
        choices=[0, 1, 2],
        type=int,
        help="Resubmit jobs, 1 resubmit finished jobs with errors, 2 resubmit running jobs",
        required=False,
        default=0,  # was the string "0", which argparse had to re-parse
    )
    return parser
def main():
    """Entry point: parse CLI options, compile or load the configuration,
    then dispatch on ``operationMode``: 0 = run the analysis (condor or
    local), 1 = check batch jobs for errors / resubmit, 2 = hadd the
    per-job root files and post-process envelope/rms/square nuisances.

    NOTE(review): relies heavily on module-level globals injected by
    ConfigLib from the configuration (``samples``, ``nuisances``,
    ``variables``, ``cuts``, ``aliases``, ``lumi``, ``tag``,
    ``outputFile``, ``outputFolder``, ``batchFolder``, ``runnerFile``,
    ``batchVars``, ``filesToExec``, ``imports``, ``varsToKeep``).
    """
    parser = defaultParser()
    args = parser.parse_args()
    compileFolder = args.compile
    operationMode = args.operationMode
    doBatch = int(args.doBatch)
    dryRun = int(args.dryRun)
    # folder is promoted to a global so the executed configuration files can use it
    global folder
    folder = os.path.abspath(args.folder)
    configsFolder = os.path.abspath(args.folder + "/" + args.configsFolder)
    configFile = args.configFile
    resubmit = int(args.resubmit)
    # these are expected to be (re)defined by the loaded configuration
    global batchFolder
    global outputFolder
    if compileFolder == 1:
        # compile the configuration folder and dump it as pickle + JSON
        print(os.getcwd())
        print(os.path.abspath(f"{folder}/configuration.py"))
        if not os.path.exists(folder) or not os.path.exists(
            f"{folder}/configuration.py"
        ):
            print("Error, configuration folder does not exists!")
            sys.exit()
        from mkShapesRDF.shapeAnalysis.ConfigLib import ConfigLib
        # variables before execution of files
        # NOTE(review): configVars1/configVars2 are never used afterwards
        configVars1 = dict(list(globals().items()) + list(locals().items()))
        # execute configuration.py into globals(), then the files it lists
        ConfigLib.loadConfig(["configuration.py"], globals())
        ConfigLib.loadConfig(filesToExec, globals(), imports)
        globals()["varsToKeep"].insert(0, "folder")
        d = ConfigLib.createConfigDict(
            varsToKeep, dict(list(globals().items()) + list(locals().items()))
        )
        # variables after execution of files
        configVars2 = dict(list(globals().items()) + list(locals().items()))
        import datetime
        dt = datetime.datetime.now()
        Path(configsFolder).mkdir(parents=True, exist_ok=True)
        # timestamped pickle plus a fixed-name JSON copy
        fileName = configsFolder + "/config_" + dt.strftime("%y-%m-%d_%H_%M_%S")
        fileNameJson = configsFolder + "/config"
        ConfigLib.dumpConfigDict(d, fileName)
        ConfigLib.dumpConfigDict(d, fileNameJson, doJson=True)
        ConfigLib.loadDict(d, globals())
    else:
        # load a previously compiled configuration (pickle) into globals()
        from mkShapesRDF.shapeAnalysis.ConfigLib import ConfigLib
        if configFile != "latest":
            p = os.path.abspath(configFile)
            if not os.path.exists(p):
                print("Config file", configFile, "doest not exists at path", p)
                sys.exit()
            else:
                d = ConfigLib.loadPickle(p, globals())
        else:
            d = ConfigLib.loadLatestPickle(configsFolder, globals())
    print(samples.keys())
    print(d.keys())
    print("\n\n", batchVars, "\n\n")
    # rebind the global batchFolder to its absolute location under folder
    batchFolder = f"{folder}/{batchFolder}"
    Path(f"{folder}/{outputFolder}").mkdir(parents=True, exist_ok=True)
    # NOTE(review): the "output already exists" guard for merge mode is disabled
    if operationMode == 2 and os.path.exists(f"{folder}/{outputFolder}/{outputFile}"):
        pass
        # print("Can't merge files, output already exists")
        # print(f"You can run: rm {folder}/{outputFolder}/{outputFile}")
        # sys.exit()
    limit = int(args.limitEvents)
    # PROCESSING
    if runnerFile == "default":
        runnerPath = os.path.realpath(os.path.dirname(__file__)) + "/runner.py"
    else:
        runnerPath = f"{folder}/{runnerFile}"
    print("\n\nRunner path: ", runnerPath, "\n\n")
    if not os.path.exists(runnerPath):
        print("Runner file / path does not exist!")
        sys.exit()
    # NOTE(review): _results is never used
    _results = {}
    # make the runner importable regardless of where it lives
    sys.path.append(os.path.dirname(runnerPath))
    from runner import RunAnalysis
    if operationMode == 0:
        print("#" * 20, "\n\n", " Doing analysis", "\n\n", "#" * 20)
        if doBatch == 1:
            print("#" * 20, "\n\n", " Running on condor ", "\n\n", "#" * 20)
            _samples = RunAnalysis.splitSamples(samples)
            from mkShapesRDF.shapeAnalysis.BatchSubmission import BatchSubmission
            outputPath = os.path.abspath(outputFolder)
            batch = BatchSubmission(
                outputPath,
                batchFolder,
                headersPath,
                runnerPath,
                tag,
                _samples,
                d,
                batchVars,
            )
            batch.createBatches()
            batch.submit(dryRun)
        else:
            print("#" * 20, "\n\n", " Running on local machine ", "\n\n", "#" * 20)
            outputFileMap = f"{folder}/{outputFolder}/{outputFile}"
            # do not split by FilesPerJob when running locally
            _samples = RunAnalysis.splitSamples(samples, False)
            print(len(_samples))
            # declared global here, but the declaration applies to the whole
            # function: every use of ``cuts`` in main refers to the global
            global cuts
            runner = RunAnalysis(
                _samples,
                aliases,
                variables,
                cuts,
                nuisances,
                lumi,
                limit,
                outputFileMap,
            )
            runner.run()
    elif operationMode == 1:
        # a job is considered finished once its err.txt exists
        errs = glob.glob("{}/{}/*/err.txt".format(batchFolder, tag))
        files = glob.glob("{}/{}/*/script.py".format(batchFolder, tag))
        errsD = list(map(lambda k: "/".join(k.split("/")[:-1]), errs))
        filesD = list(map(lambda k: "/".join(k.split("/")[:-1]), files))
        # print(files)
        notFinished = list(set(filesD).difference(set(errsD)))
        print(notFinished)
        tabulated = []
        tabulated.append(["Total jobs", "Finished jobs", "Running jobs"])
        tabulated.append([len(files), len(errs), len(notFinished)])
        import tabulate
        print(tabulate.tabulate(tabulated))
        # print('queue 1 Folder in ' + ' '.join(list(map(lambda k: k.split('/')[-1], notFinished))))
        # known-benign messages that may appear in err.txt without meaning failure
        normalErrs = """Warning in <TClass::Init>: no dictionary for class edm::ProcessHistory is available
Warning in <TClass::Init>: no dictionary for class edm::ProcessConfiguration is available
Warning in <TClass::Init>: no dictionary for class edm::ParameterSetBlob is available
Warning in <TClass::Init>: no dictionary for class edm::Hash<1> is available
Warning in <TClass::Init>: no dictionary for class pair<edm::Hash<1>,edm::ParameterSetBlob> is available
Warning in <TInterpreter::ReadRootmapFile>: class podio::
TClass::Init:0: RuntimeWarning:
real
user
sys
run locally
No TTree was created for
Warning in <Snapshot>: A lazy Snapshot action was booked but never triggered.
cling::DynamicLibraryManager::loadLibrary(): libOpenGL.so.0: cannot open shared object file: No such file or directory
Error in <AutoloadLibraryMU>: Failed to load library /cvmfs/sft.cern.ch/lcg/releases/ROOT/6.28.00
cling::DynamicLibraryManager::loadLibrary(): libGLU.so.1: cannot open shared object file: No such file or directory
Error in <TNetXNGFile::Close>:
Warning in <TChain::AddFile>: Adding tree with no entries from file
Plugin No such file or directory loading sec.protocol libXrdSecztn.so
"""
        normalErrs = normalErrs.split("\n")
        normalErrs = list(map(lambda k: k.strip(" ").strip("\t"), normalErrs))
        normalErrs = list(filter(lambda k: k != "", normalErrs))
        toResubmit = []
        def normalErrsF(k):
            # True when line ``k`` matches one of the known benign messages
            for s in normalErrs:
                if s in k:
                    return True
                elif k.startswith(s):
                    return True
            return False
        for err in errs:
            try:
                with open(err) as file:
                    lines = file.read()
            except:
                # unreadable err.txt: treat the job as failed
                print('Could not open error file for', err, 'continue')
                toResubmit.append(err)
                continue
            txt = lines.split("\n")
            # txt = list(filter(lambda k: k not in normalErrs, txt))
            txt = list(filter(lambda k: not normalErrsF(k), txt))
            txt = list(filter(lambda k: k.strip() != "", txt))
            if len(txt) > 0:
                # anything left after filtering is an unexpected error
                print("Found unusual error in")
                print(err)
                print("\n")
                print("\n".join(txt[:5] + ['\n...\n'] + txt[-5:]))
                print("\n\n")
                toResubmit.append(err)
        # keep only the job folder name (second-to-last path component)
        toResubmit = list(map(lambda k: "".join(k.split("/")[-2]), toResubmit))
        print(toResubmit)
        if len(toResubmit) > 0:
            print("\n\nShould resubmit the following files\n")
            print(
                "queue 1 Folder in "
                + " ".join(list(map(lambda k: k.split("/")[-1], toResubmit)))
            )
            if resubmit == 1:
                from mkShapesRDF.shapeAnalysis.BatchSubmission import BatchSubmission
                BatchSubmission.resubmitJobs(batchFolder, tag, toResubmit, dryRun)
        if resubmit == 2:
            # resubmit all the jobs that are not finished
            toResubmit = list(map(lambda k: "".join(k.split("/")[-1]), notFinished))
            print(toResubmit)
            from mkShapesRDF.shapeAnalysis.BatchSubmission import BatchSubmission
            BatchSubmission.resubmitJobs(batchFolder, tag, toResubmit, dryRun)
    elif operationMode == 2:
        print(
            "",
            "".join(["#" for _ in range(20)]),
            "\n\n",
            "Merging root files",
            "\n\n",
            "".join(["#" for _ in range(20)]),
        )
        _samples = RunAnalysis.splitSamples(samples)
        print(_samples)
        # per-job output files are named <sampleName>_<chunkIndex>
        __samples = list(map(lambda k: k[0] + '_' + str(k[3]), _samples))
        #__samples = list(set(__samples).difference(['DY_23', 'DY_19']) )
        print(len(__samples))
        # output file name without its extension
        outputFileTrunc = ".".join(outputFile.split(".")[:-1])
        filesToMerge = list(
            map(
                #lambda k: f"{folder}/{outputFolder}/{outputFileTrunc}__ALL__{k[0]}_{str(k[3])}.root",
                lambda k: f"{folder}/{outputFolder}/{outputFileTrunc}__ALL__{k}.root",
                __samples,
            )
        )
        print("\n\nMerging files\n\n")
        print("\n\n", filesToMerge, "\n\n")
        print(f"Hadding files into {folder}/{outputFolder}/{outputFile}")
        ret = subprocess.run(
            f'hadd -j {folder}/{outputFolder}/{outputFile} {" ".join(filesToMerge)}',
            shell=True,
        )
        # if True:
        if ret.returncode == 0:
            # ok = True
            # if ok:
            print("Hadd was successful")
            import mkShapesRDF.shapeAnalysis.latinos.LatinosUtils as utils
            # NOTE(review): opens "{outputFolder}/{outputFile}" while hadd wrote
            # to "{folder}/{outputFolder}/{outputFile}" — equivalent only when
            # running from inside ``folder``; verify
            f = ROOT.TFile.Open(f"{outputFolder}/{outputFile}", "update")
            # post process -> nuisance envelops and RMS
            # ``cuts`` is global (declared above); rebind it to the inner dict
            cuts = cuts['cuts']
            categoriesmap = utils.flatten_cuts(cuts)
            for nuisance in nuisances.keys():
                # only envelope / rms / square nuisances need post-processing
                if not (
                    nuisances[nuisance].get("kind", "").endswith("envelope")
                    or nuisances[nuisance].get("kind", "").endswith("rms")
                    or nuisances[nuisance].get("kind", "").endswith("square")
                ):
                    continue
                print('work for ', nuisance)
                #categoriesmap = utils.flatten_cuts(cuts)
                subsamplesmap = utils.flatten_samples(samples)
                utils.update_variables_with_categories(variables, categoriesmap)
                utils.update_nuisances_with_subsamples(nuisances, subsamplesmap)
                utils.update_nuisances_with_categories(nuisances, categoriesmap)
                # sys.exit()
                _cuts = list(cuts.keys())
                _samples = list(samples.keys())
                # print(cuts['cuts'])
                # print(variables)
                # print(utils.flatten_cuts(cuts['cuts']))
                for cut in _cuts:
                    # print('work in', cut)
                    # ROOT.gDirectory.cd(f"/{cut}")
                    # for variable in [k.GetName() for k in ROOT.gDirectory.GetListOfKeys()]:
                    for variable in variables.keys():
                        f.cd(f"/{cut}/{variable}")
                        print('work in ', cut, variable)
                        histos = [k.GetName() for k in ROOT.gDirectory.GetListOfKeys()]
                        for sampleName in _samples:
                            # skip samples this nuisance does not apply to
                            limitSamples = nuisances[nuisance].get('samples', {})
                            if not (len(limitSamples.keys()) == 0 or sampleName in limitSamples.keys()):
                                # print(f'{sampleName} does not have nuisance {nuisance}')
                                continue
                            histosNameToProcess = list(
                                filter(
                                    lambda k: k.startswith(
                                        f"histo_{sampleName}_{nuisances[nuisance]['name']}_SPECIAL_NUIS"
                                    ),
                                    histos,
                                )
                            )
                            histosToProcess = list(
                                map(
                                    lambda k: ROOT.gDirectory.Get(k).Clone(),
                                    histosNameToProcess,
                                )
                            )
                            if len(histosToProcess) == 0:
                                print(f'No variations found for {sampleName} in {cut}/{variable} for nuisance {nuisances[nuisance]["name"]}', file=sys.stderr,)
                                continue
                                # NOTE(review): unreachable — the continue above always fires
                                sys.exit(1)
                            hNominal = ROOT.gDirectory.Get(
                                f"histo_{sampleName}"
                            ).Clone()
                            hName = f"histo_{sampleName}_{nuisances[nuisance]['name']}"
                            h_up = histosToProcess[0].Clone()
                            h_do = histosToProcess[0].Clone()
                            # one row per variation; columns include flow bins
                            variations = np.empty(
                                (
                                    len(histosToProcess),
                                    histosToProcess[0].GetNbinsX() + 2,
                                ),
                                dtype=float,
                            )
                            for i in range(len(histosToProcess)):
                                variations[i, :] = hist2array(
                                    histosToProcess[i], flow=True
                                )
                            arrup = 0
                            arrdo = 0
                            if nuisances[nuisance]['kind'].endswith("envelope"):
                                # bin-by-bin max/min over all variations
                                arrup = np.max(variations, axis=0)
                                arrdo = np.min(variations, axis=0)
                            elif nuisances[nuisance]['kind'].endswith("rms"):
                                # nominal +/- RMS of the variations
                                vnominal = hist2array(hNominal, flow=True)
                                arrnom = np.tile(vnominal, (variations.shape[0], 1))
                                arrv = np.sqrt(
                                    np.mean(np.square(variations - arrnom), axis=0)
                                )
                                arrup = vnominal + arrv
                                arrdo = vnominal - arrv
                            elif nuisances[nuisance]['kind'].endswith("square"):
                                # nominal +/- quadrature sum of the deviations
                                vnominal = hist2array(hNominal, flow=True)
                                arrnom = np.tile(vnominal, (variations.shape[0], 1))
                                # NOTE(review): up_is_up / zeros are computed but
                                # unused (the np.where variant below is disabled)
                                up_is_up = variations>arrnom
                                zeros = np.zeros_like(arrnom)
                                arrv = np.sqrt(
                                    np.sum(np.square(variations - arrnom), axis=0)
                                )
                                arrup = vnominal + arrv
                                arrdo = vnominal - arrv
                                # arrup = np.where(up_is_up, vnominal + arrv, vnominal - arrv)
                                # arrdo = np.where(~up_is_up, vnominal - arrv, vnominal + arrv)
                            else:
                                continue
                            print(arrup)
                            print(arrdo)
                            # write the combined variation back, flow bins included
                            for i in range(0, h_up.GetNbinsX() + 2):
                                h_up.SetBinContent(i, arrup[i])
                                h_do.SetBinContent(i, arrdo[i])
                            print(hName)
                            h_up.SetName(hName + "Up")
                            h_up.Write()
                            h_do.SetName(hName + "Down")
                            h_do.Write()
                            # drop the individual variation histograms (all cycles)
                            for histo in histosNameToProcess:
                                ROOT.gDirectory.Delete(f"{histo};*")
            f.Close()
    else:
        print("Operating mode was set to -1, nothing was done")
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from copy import deepcopy | |
import sys | |
import ROOT | |
from array import array | |
from mkShapesRDF.lib.parse_cpp import ParseCpp | |
import os | |
from math import sqrt | |
ROOT.gROOT.SetBatch(True) | |
ROOT.TH1.SetDefaultSumw2(True) | |
import numpy as np | |
# def hist2array(h, overflow=False, copy=True): | |
# shift = 0 | |
# if overflow: | |
# shift = 1 | |
# d = [] | |
# for i in range(1 - shift, h.GetNbinsX() + 1 + shift): | |
# d.append(h.GetBinContent(i)) | |
# d = np.array(d) | |
# return d | |
def hist2array(h, flow=False, copy=True, include_sumw2=True):
    """Convert a ROOT 1D histogram into numpy array(s) of bin contents.

    Parameters
    ----------
    h : ROOT.TH1D | ROOT.TH1F | ROOT.TH1I
        source histogram
    flow : bool, optional, default: False
        if True strip the underflow/overflow bins; if False keep the full
        ``nbins + 2`` array.
        NOTE(review): this is the opposite convention of the ``hist2array``
        in the shapes-making script (there ``flow=True`` keeps the flow
        bins) — confirm which behaviour callers of this module expect.
    copy : bool, optional, default: True
        if True return copies detached from the histogram's internal buffers
    include_sumw2 : bool, optional, default: True
        if True also return the array of summed squared weights

    Returns
    -------
    numpy.ndarray or (numpy.ndarray, numpy.ndarray)
        bin contents (and sumw2 when requested)
    """
    nx = h.GetNbinsX() + 2  # includes underflow (bin 0) and overflow (nbins+1)
    # dtype must match the C type of the histogram's internal buffer
    # (np.float / np.int aliases were removed in NumPy 1.24; np.float was
    # float64 anyway, which would misread TH1F's Float_t buffer)
    if isinstance(h, ROOT.TH1D):
        dtype = np.double   # Double_t
    elif isinstance(h, ROOT.TH1F):
        dtype = np.float32  # Float_t
    elif isinstance(h, ROOT.TH1I):
        dtype = np.int32    # Int_t
    else:
        print('Histogram of type', h, 'is not supported', file=sys.stderr)
        sys.exit(1)
    vals = np.ndarray((nx, ), dtype=dtype, buffer=h.GetArray())
    # GetSumw2 is a TArrayD, i.e. always double regardless of the histogram type
    sumw2 = np.ndarray((nx, ), dtype=np.double, buffer=h.GetSumw2().GetArray())
    shift = 1 if flow else 0
    vals = vals[shift: nx-shift]
    sumw2 = sumw2[shift: nx-shift]  # was assigned to 'sum2', leaving sumw2 unsliced
    if copy:
        vals = vals.copy()
        sumw2 = sumw2.copy()
    if include_sumw2:
        return vals, sumw2
    return vals
def _fold(_h, _to, _from):
    """Fold bin ``_from`` into bin ``_to`` of histogram ``_h`` in place.

    The content of the source bin is added to the destination bin, the
    corresponding sum-of-squared-weights entries are combined the same way,
    and the source bin (content and sumw2) is then zeroed.
    """
    mergedContent = _h.GetBinContent(_to) + _h.GetBinContent(_from)
    _h.SetBinContent(_to, mergedContent)
    sumw2 = _h.GetSumw2()
    sumw2[_to] = sumw2[_to] + sumw2[_from]
    _h.SetBinContent(_from, 0.0)
    sumw2[_from] = 0.0
class RunAnalysis: | |
    r"""Class that creates ``dfs`` and runs the analysis"""
@staticmethod | |
def splitSamples(samples, useFilesPerJob=True): | |
r"""static methods, takes a dictionary of samples and split them based on their weights and max num. of files | |
Parameters | |
---------- | |
samples : dict | |
dictionary of samples | |
useFilesPerJob : bool, optional, default: True | |
if you want to further split the samples based on max num. of files. | |
Returns | |
------- | |
`list of tuple` | |
each tuple will have a lenght of 5 (6 if subsamples are present), where the first element is the name of the sample, the second the list of files, the third the weight, and the fourth the index of this tuple compared to the other tuples of the same sample type, the fifth will be the isData flag (True if the sample is data, False otherwise). If subsamples are present, the sixth element will be the dict of subsamples | |
""" | |
# will contain all the different samples splitted based on their weights and max num. of files | |
splittedSamples = [] | |
for sampleName in list(samples.keys()): | |
types = ( | |
{} | |
) # this will contain the the different type of files with their special weights for this sampleName | |
# recall that samples[sampleName]['name'] contain a list of tuples, where a single tuple can have a size of 2 or 3: | |
# first position for name of the process (DYJetsToLL_M-50), second list of files, third a possible special weight | |
for filesType in samples[sampleName]["name"]: | |
# no special weight (will use '1'), and at least one file found | |
if len(filesType) == 2 and len(filesType[1]) > 0: | |
if "1" not in list(types.keys()): | |
types["1"] = [] | |
types["1"].append(filesType + ("1.0",)) | |
elif len(filesType) == 3 and len(filesType[1]) > 0: | |
if filesType[2] not in list(types.keys()): | |
types[filesType[2]] = [] | |
types[filesType[2]].append(filesType) | |
else: | |
print("Error", sampleName, filesType, file=sys.stderr) | |
print( | |
"Either the sample proc you specified has no files, or the weight had problems", | |
file=sys.stderr, | |
) | |
sys.exit() | |
i = 0 | |
for _type in list(types.keys()): | |
# split files based on FilesPerJob or include them all | |
__files = list(map(lambda k: k[1], types[_type])) | |
# flatted list of files | |
__files = [item for sublist in __files for item in sublist] | |
if useFilesPerJob: | |
dim = samples[sampleName].get("FilesPerJob", len(__files)) | |
else: | |
dim = len(__files) | |
__files = [__files[j : j + dim] for j in range(0, len(__files), dim)] | |
for ___files in __files: | |
# the weights for these files will be the product of the weight inside this sampele (i.e. samples[sampleName]['weight']) | |
# and the product of the special weights that is in common to all of those files (i.e. sampleType[0]) | |
# the common special weight can be retrived from the first of the list of files with this weight | |
# remember that the tuple has always size 3 now, the last position is for the special weight | |
weight = ( | |
"( " | |
+ samples[sampleName]["weight"] | |
+ " ) * ( " | |
+ types[_type][0][2] | |
+ " )" | |
) | |
isData = samples[sampleName].get("isData", False) | |
sampleType = (sampleName, ___files, weight, i, isData) | |
if "subsamples" in list(samples[sampleName].keys()): | |
sampleType += (samples[sampleName]["subsamples"],) | |
splittedSamples.append(sampleType) | |
i += 1 | |
return splittedSamples | |
@staticmethod | |
def getTTreeNomAndFriends(fnom, friends): | |
"""Create a TChain with the nominal files and the friends files (nuisances TTrees with varied branches) | |
Args: | |
fnom (list): list of nominal files | |
friends (list of list): list of list of friends files | |
Returns: | |
TChain: TChain with nominal files and friends files | |
""" | |
tnom = ROOT.TChain("Events") | |
for f in fnom: | |
tnom.Add(f) | |
for friend in friends: | |
_tfriend = ROOT.TChain("Events") | |
for f in friend: | |
_tfriend.Add(f) | |
tnom.AddFriend(_tfriend) | |
return tnom | |
@staticmethod | |
def getNuisanceFiles(nuisance, files): | |
"""Searches in the provided nuisance folder for the files with the same name of the nominal files | |
Args: | |
nuisance (dict): dict with the nuisance information | |
files (list): list of nominal files | |
Returns: | |
list of list: list with the down and up varied list of files | |
""" | |
_files = list(map(lambda k: k.split("/")[-1], files)) | |
nuisanceFilesDown = list( | |
map(lambda k: nuisance["folderDown"] + "/" + k, _files) | |
) | |
nuisanceFilesUp = list(map(lambda k: nuisance["folderUp"] + "/" + k, _files)) | |
for nuisFile in nuisanceFilesUp + nuisanceFilesDown: | |
# print(nuisFile) | |
if not os.path.exists(nuisFile): | |
print("File", nuisFile, "does not exist", file=sys.stderr) | |
sys.exit(1) | |
return [nuisanceFilesDown, nuisanceFilesUp] | |
@staticmethod | |
def index_sub(string, sub): | |
try: | |
return string.index(sub) | |
except ValueError: | |
return -10000 | |
    def __init__(
        self,
        samples,
        aliases,
        variables,
        cuts,
        nuisances,
        lumi,
        limit=-1,
        outputFileMap="output.root",
    ):
        r"""
        Stores arguments in the class attributes and creates all the RDataFrame objects

        Parameters
        ----------
        samples : `list of tuple`
            same type as the return of the splitSamples method
        aliases : dict
            dict of aliases
        variables : dict
            dict of variables
        cuts : dict
            dict of cuts, contains two keys (preselections: str, cuts: dict)
        nuisances : dict
            dict of nuisances
        lumi : float
            lumi in fb-1
        limit : int, optional, default: -1
            limit of events to be processed
        outputFileMap : str, optional, default: 'output.root'
            full path + filename of the output root file.

        Returns
        -------
        None
        """
        self.samples = samples
        self.aliases = aliases
        self.variables = variables
        self.preselections = cuts["preselections"]
        # flatten the cut dict: a cut with categories becomes one entry per
        # category, named "<cut>_<cat>", with the ANDed expression
        mergedCuts = {}
        for cut in list(cuts["cuts"].keys()):
            __cutExpr = ""
            if type(cuts["cuts"][cut]) == dict:
                __cutExpr = cuts["cuts"][cut]["expr"]
                for cat in list(cuts["cuts"][cut]["categories"].keys()):
                    mergedCuts[cut + "_" + cat] = {"parent": cut}
                    mergedCuts[cut + "_" + cat]["expr"] = (
                        __cutExpr + " && " + cuts["cuts"][cut]["categories"][cat]
                    )
            elif type(cuts["cuts"][cut]) == str:
                # plain string cut: kept as-is, parent is itself
                __cutExpr = cuts["cuts"][cut]
                mergedCuts[cut] = {"expr": __cutExpr, "parent": cut}
        self.cuts = mergedCuts
        self.nuisances = nuisances
        self.lumi = lumi
        self.limit = limit
        self.outputFileMap = outputFileMap
        self.remappedVariables = {}
        self.dfs = {}
        r"""
        dfs is a dictionary containing as keys the sampleNames. The structure should look like this:

        .. code-block:: python

            dfs = {
                'DY':{
                    0: {
                        'df': obj,
                        'columnNames': [...],
                        'usedVariables': [...],
                        'ttree': obj2, # needed otherwise seg fault in root
                    },
                }
            }
        """
        # collect every branch name referenced by preselections, cuts and
        # variables, so that unused branches/variations can be skipped later
        usedVariables = set()
        usedVariables = usedVariables.union(
            ParseCpp.listOfVariables(ParseCpp.parse(self.preselections))
        )
        for cut in mergedCuts.keys():
            usedVariables = usedVariables.union(
                ParseCpp.listOfVariables(ParseCpp.parse(mergedCuts[cut]["expr"]))
            )
        for var in variables.keys():
            if "name" in variables[var].keys():
                # "name" may hold several ':'-separated expressions (e.g. 2D)
                __var = variables[var]["name"].split(":")
                __vars = list(
                    map(lambda k: ParseCpp.listOfVariables(ParseCpp.parse(k)), __var)
                )
                for __v in __vars:
                    usedVariables = usedVariables.union(__v)
            elif "tree" in variables[var].keys():
                # "tree" variables map output branch -> expression
                for branch in variables[var]["tree"].keys():
                    usedVariables = usedVariables.union(
                        ParseCpp.listOfVariables(
                            ParseCpp.parse(variables[var]["tree"][branch])
                        )
                    )
            else:
                print("Cannot process variable ", var, " nuisances might be faulty")
        # sample here is a tuple, first el is the sampleName, second list of files,
        # third the special weight, forth is the index of tuple for the same sample,
        # fifth if present the dict of subsamples
        for sample in samples:
            files = sample[1]
            sampleName = sample[0]
            friendsFiles = []
            usedFolders = []
            # attach friend trees for every suffix-type shape nuisance that
            # applies to this sample (each varied folder only once)
            for nuisance in self.nuisances.values():
                if sampleName not in nuisance.get("samples", {sampleName: []}):
                    continue
                if nuisance.get("type", "") == "shape":
                    if nuisance.get("kind", "") == "suffix":
                        if nuisance.get("folderUp", "") != "":
                            if nuisance["folderUp"] in usedFolders:
                                continue
                            usedFolders.append(nuisance["folderUp"])
                            friendsFiles += RunAnalysis.getNuisanceFiles(
                                nuisance, files
                            )
            tnom = RunAnalysis.getTTreeNomAndFriends(files, friendsFiles)
            if limit != -1:
                df = ROOT.RDataFrame(tnom)
                # cap the number of processed events
                df = df.Range(limit)
            else:
                #ROOT.EnableImplicitMT()
                df = ROOT.RDataFrame(tnom)
            if sampleName not in self.dfs.keys():
                self.dfs[sample[0]] = {}
            # keyed by the chunk index (sample[3]); the TChain is stored too,
            # otherwise ROOT garbage-collects it and the df segfaults
            self.dfs[sampleName][sample[3]] = {
                "df": df,
                "ttree": tnom,
                "usedVariables": list(usedVariables),
            }
            self.dfs[sampleName][sample[3]]["columnNames"] = list(
                map(lambda k: str(k), df.GetColumnNames())
            )
        self.definedAliases = {}
        print("\n\nLoaded dataframes\n\n")
    def loadAliases(self, afterNuis=False):
        """
        Load aliases in the dataframes. It does not create the special alias ``weight`` for which a special method is used.

        Builds, per dataframe, a chained ``.Define(...)`` expression string and
        evaluates it with ``eval`` (the string starts with the local name
        ``df``, which must therefore stay in scope).

        Parameters
        ----------
        afterNuis : bool, optional, default: False
            if True, only aliases with the key ``afterNuis`` set to True will be loaded
        """
        aliases = self.aliases
        for sampleName in self.dfs.keys():
            for index in self.dfs[sampleName].keys():
                # ``df`` is referenced by the eval'd define_string below
                df = self.dfs[sampleName][index]["df"]
                define_string = "df"
                usedVariables = set(self.dfs[sampleName][index]["usedVariables"])
                for alias in list(aliases.keys()):
                    # only load aliases needed for nuisances!
                    # if beforeNuis
                    if (afterNuis) != (aliases[alias].get("afterNuis", False)):
                        # alias belongs to the other phase: skip it, but still
                        # record its variables as used
                        if "expr" in aliases[alias]:
                            usedVariables = usedVariables.union(
                                ParseCpp.listOfVariables(
                                    ParseCpp.parse(aliases[alias]["expr"])
                                )
                            )
                        continue
                    # alias restricted to a subset of samples
                    if "samples" in list(aliases[alias]):
                        if sampleName not in aliases[alias]["samples"]:
                            continue
                    if alias in self.dfs[sampleName][index]["columnNames"]:
                        if alias != aliases[alias].get("expr", ""):
                            print(
                                f"Alias {alias} cannot be defined, column with that name already exists"
                            )
                            sys.exit()
                        else:
                            # since the alias key is equal to expr there's no need to keep the alias
                            del self.aliases[alias]
                            continue
                    for line in aliases[alias].get("linesToAdd", []):
                        # RPLME_nThreads is used to set the number of threads
                        ROOT.gInterpreter.Declare(
                            line.replace("RPLME_nThreads", str(df.GetNSlots()))
                        )
                    for line in aliases[alias].get("linesToProcess", []):
                        # RPLME_nThreads is used to set the number of threads
                        # NOTE(review): executes configuration-provided code
                        exec(line, globals())
                    if "expr" in list(aliases[alias].keys()):
                        define_string += (
                            f".Define('{alias}', '{aliases[alias]['expr']}') \\\n\t"
                        )
                        #print(define_string)
                        usedVariables = usedVariables.union(
                            ParseCpp.listOfVariables(
                                ParseCpp.parse(aliases[alias]["expr"])
                            )
                        )
                    elif "exprSlot" in list(aliases[alias].keys()):
                        # slot-aware define: [0] is the expression, [1] the args
                        args = aliases[alias]["exprSlot"]
                        args[0] = args[0].replace("RPLME_nThreads", str(df.GetNSlots()))
                        define_string += (
                            f".DefineSlot('{alias}', '{args[0]}', [ {args[1]} ]) \\\n\t"
                        )
                        usedVariables = usedVariables.union(
                            ParseCpp.listOfVariables(
                                ParseCpp.parse(aliases[alias]["exprSlot"][1])
                            )
                        )
                    elif "class" in list(aliases[alias].keys()):
                        define_string += f".Define('{alias}', '{aliases[alias]['class']} ( {aliases[alias].get('args', '')} )') \\\n\t"
                # evaluate the accumulated .Define chain on the local ``df``
                df1 = eval(define_string)
                self.dfs[sampleName][index]["df"] = df1
                stringPrint = "before nuisances"
                if afterNuis:
                    stringPrint = "after nuisances"
                print(
                    f"\n\nLoaded all aliases {stringPrint} for {sampleName} index {index}\n\n"
                )
                self.dfs[sampleName][index]["usedVariables"] = list(usedVariables)
                self.dfs[sampleName][index]["columnNames"] = list(
                    map(lambda k: str(k), df1.GetColumnNames())
                )
    def loadAliasWeight(self):
        """
        Loads only the special alias ``weight`` in the dataframes.

        For MC samples the weight expression is the per-chunk weight scaled by
        the luminosity; for data it is the per-chunk weight alone. The column
        is added through an eval'd ``.Define`` chain on the local ``df``.
        """
        samples = self.samples
        aliases = self.aliases
        for sampleName in self.dfs.keys():
            for index in self.dfs[sampleName].keys():
                # referenced by the eval'd define_string below, hence the noqa
                df = self.dfs[sampleName][index]["df"]  # noqa: F841
                # Define the most important alias: the weight!
                # find the sample tuple matching this name and chunk index
                sample = list(
                    filter(lambda k: k[0] == sampleName and k[3] == index, samples)
                )[0]
                weight = sample[2]
                isData = sample[4]
                if not isData:
                    # MC: scale by the luminosity
                    aliases["weight"] = {"expr": str(self.lumi) + " * " + weight}
                else:
                    aliases["weight"] = {"expr": weight}
                print("\n\n", sampleName, "\n\n", aliases["weight"])
                # load the alias weight
                define_string = "df"
                alias = "weight"
                if alias in self.dfs[sampleName][index]["columnNames"]:
                    print(
                        f"Alias {alias} cannot be defined, column with that name already exists"
                    )
                    sys.exit()
                define_string += (
                    f".Define('{alias}', '{aliases[alias]['expr']}') \\\n\t"
                )
                df1 = eval(define_string)
                self.dfs[sampleName][index]["df"] = df1
                print(f"\n\nLoaded alias weight for {sampleName} index {index}\n\n")
                self.dfs[sampleName][index]["columnNames"].append("weight")
def loadSystematicsSuffix(self): | |
""" | |
Loads systematics of type ``suffix`` in the dataframes. | |
""" | |
for sampleName in self.dfs.keys(): | |
for index in self.dfs[sampleName].keys(): | |
df = self.dfs[sampleName][index]["df"] | |
columnNames = self.dfs[sampleName][index]["columnNames"] | |
nuisances = self.nuisances | |
# nuisance key is not used | |
for _, nuisance in list(nuisances.items()): | |
if sampleName not in nuisance.get("samples", {sampleName: []}): | |
continue | |
if nuisance.get("type", "") == "shape": | |
if nuisance.get("kind", "") == "suffix": | |
variation = nuisance["mapDown"] | |
separator = nuisance.get("separator", "_") | |
variedCols = list( | |
filter(lambda k: k.endswith(variation), columnNames) | |
) | |
if len(variedCols) == 0: | |
print(f"No varied columns for {variation}") | |
sys.exit() | |
# if 'JER' in nuisance['name']: | |
# print(variedCols) | |
baseCols = list( | |
map( | |
lambda k: k[ | |
RunAnalysis.index_sub(k, "Events.") | |
+ len("Events.") : -len(separator + variation) | |
] | |
if "Events" in k | |
else k[: -len(separator + variation)], | |
variedCols, | |
) | |
) | |
for baseCol in baseCols: | |
if ( | |
baseCol | |
not in self.dfs[sampleName][index]["usedVariables"] | |
): | |
# baseCol is never used -> useless to register variation | |
print("unused variable", baseCol) | |
continue | |
print(baseCol in [str(k) for k in df.GetColumnNames()]) | |
if not (baseCol in [str(k) for k in df.GetColumnNames()]): | |
continue | |
if "bool" not in str(df.GetColumnType(baseCol)).lower(): | |
varNameDown = ( | |
baseCol | |
+ separator | |
+ nuisance["mapDown"] | |
+ "*" | |
+ nuisance["samples"][sampleName][1] | |
) | |
varNameUp = ( | |
baseCol | |
+ separator | |
+ nuisance["mapUp"] | |
+ "*" | |
+ nuisance["samples"][sampleName][0] | |
) | |
else: | |
varNameDown = ( | |
baseCol + separator + nuisance["mapDown"] | |
) | |
varNameUp = baseCol + separator + nuisance["mapUp"] | |
_type = df.GetColumnType(baseCol) | |
expr = ( | |
ParseCpp.RVecExpression(_type) | |
+ "{" | |
+ f"{varNameDown}, {varNameUp}" | |
+ "}" | |
) | |
if "JER" in nuisance["name"]: | |
print("JER expr", expr) | |
df = df.Vary( | |
baseCol, | |
expr, | |
variationTags=["Down", "Up"], | |
variationName=nuisance["name"], | |
) | |
else: | |
continue | |
# else: | |
# print("Unsupported nuisance") | |
# sys.exit() | |
self.dfs[sampleName][index]["df"] = df | |
def loadSystematicsReweights(self): | |
""" | |
Loads systematics of type ``suffix`` in the dataframes. | |
""" | |
for sampleName in self.dfs.keys(): | |
for index in self.dfs[sampleName].keys(): | |
df = self.dfs[sampleName][index]["df"] | |
columnNames = self.dfs[sampleName][index]["columnNames"] | |
nuisances = self.nuisances | |
# nuisance key is not used | |
for _, nuisance in list(nuisances.items()): | |
if sampleName not in nuisance.get("samples", {sampleName: []}): | |
continue | |
if nuisance.get("type", "") == "shape": | |
if nuisance.get("kind", "") == "weight": | |
weights = nuisance["samples"].get(sampleName, None) | |
if weights is not None: | |
variedNames = [] | |
if weights[0] not in columnNames: | |
df = df.Define(nuisance["name"] + "_up", weights[0]) | |
variedNames.append(nuisance["name"] + "_up") | |
else: | |
variedNames.append(weights[0]) | |
if weights[1] not in columnNames: | |
df = df.Define( | |
nuisance["name"] + "_down", weights[1] | |
) | |
variedNames.append(nuisance["name"] + "_down") | |
else: | |
variedNames.append(weights[1]) | |
if df.GetColumnType("weight") == "double": | |
expr = ( | |
"ROOT::RVecD" | |
+ "{ weight * (double)" | |
+ f"{variedNames[1]}, weight * (double) {variedNames[0]}" | |
+ "}" | |
) | |
elif df.GetColumnType("weight") == "float": | |
expr = ( | |
"ROOT::RVecF" | |
+ "{ weight * (float)" | |
+ f"{variedNames[1]}, weight * (float) {variedNames[0]}" | |
+ "}" | |
) | |
else: | |
print( | |
"Weight column has unknown type:", | |
df.GetColumnType("weight"), | |
"while varied is: ", | |
df.GetColumnType(variedNames[1]), | |
) | |
sys.exit() | |
df = df.Vary( | |
"weight", | |
expr, | |
variationTags=["Down", "Up"], | |
variationName=nuisance["name"], | |
) | |
else: | |
continue | |
# else: | |
# print("Unsupported nuisance") | |
# sys.exit() | |
self.dfs[sampleName][index]["df"] = df | |
def loadSystematicsReweightsEnvelopeRMS(self): | |
""" | |
Loads systematics of type ``weight_envelope`` or type ``weight_rms`` in the dataframes. | |
""" | |
for sampleName in self.dfs.keys(): | |
for index in self.dfs[sampleName].keys(): | |
df = self.dfs[sampleName][index]["df"] | |
columnNames = self.dfs[sampleName][index]["columnNames"] | |
nuisances = self.nuisances | |
# nuisance key is not used | |
for _, nuisance in list(nuisances.items()): | |
if sampleName not in nuisance.get("samples", {sampleName: []}): | |
continue | |
if nuisance.get("type", "") == "shape": | |
if ( | |
nuisance.get("kind", "") == "weight_envelope" | |
or nuisance.get("kind", "") == "weight_rms" | |
or nuisance.get("kind", "") == "weight_square" | |
): | |
weights = nuisance["samples"].get(sampleName, None) | |
if weights is not None: | |
variedNames = [] | |
variedTags = [] | |
for i, weight in enumerate(weights): | |
if weight not in columnNames: | |
nuisName = f'{nuisance["name"]}_{i}' | |
df = df.Define(nuisName, weight) | |
variedNames.append(nuisName) | |
variedTags.append(str(i)) | |
else: | |
variedNames.append(weight) | |
variedTags.append(str(i)) | |
if df.GetColumnType("weight") == "double": | |
expr = ( | |
"ROOT::RVecD" | |
+ "{ " | |
+ ", ".join( | |
[ | |
f"weight * (double) {variedName}" | |
for variedName in variedNames | |
] | |
) | |
+ "}" | |
) | |
elif df.GetColumnType("weight") == "float": | |
expr = ( | |
"ROOT::RVecF" | |
+ "{ " | |
+ ", ".join( | |
[ | |
f"weight * (float) {variedName}" | |
for variedName in variedNames | |
] | |
) | |
+ "}" | |
) | |
else: | |
print( | |
"Weight column has unknown type:", | |
df.GetColumnType("weight"), | |
"while varied is: ", | |
df.GetColumnType(variedNames[0]), | |
) | |
sys.exit() | |
df = df.Vary( | |
"weight", | |
expr, | |
variationTags=variedTags, | |
variationName=nuisance["name"] | |
+ "_SPECIAL_NUIS_" | |
+ nuisance["kind"].split("_")[-1], | |
) | |
else: | |
continue | |
self.dfs[sampleName][index]["df"] = df | |
def loadVariables(self): | |
""" | |
Loads variables (not the ones with the 'tree' key in them), | |
and checks if they are already in the dataframe columns, | |
if so it adds ``__`` at the beginning of the name. | |
Since variables are shared but not the aliases, | |
it could happen that a variable's name or expression is already defined | |
for a given df but not for another one -> | |
need to determine a common and compatible set of variables for all the many dfs. | |
This is done by gathering the largest set of column names. | |
""" | |
bigColumnNames = set() | |
for sampleName in self.dfs.keys(): | |
for index in self.dfs[sampleName].keys(): | |
columnNames = self.dfs[sampleName][index]["columnNames"] | |
bigColumnNames = bigColumnNames.union(set(columnNames)) | |
for sampleName in self.dfs.keys(): | |
for index in self.dfs[sampleName].keys(): | |
for var in list(self.variables.keys()): | |
if "tree" in self.variables[var].keys(): | |
continue | |
if var in bigColumnNames and var != self.variables[var]["name"]: | |
# here I want to define a variable whose key is already in the column name list | |
# and its expression is different from its name | |
# therefore we will either create a Define or an Alias | |
# need to rename the new variable! | |
prefix = "__" | |
nVar = prefix + var | |
while nVar in bigColumnNames: | |
prefix += "__" | |
nVar = prefix + var | |
# print("changing variable", var, "to " + nVar) | |
self.remappedVariables[nVar] = prefix | |
self.variables[nVar] = deepcopy(self.variables[var]) | |
del self.variables[var] | |
for sampleName in self.dfs.keys(): | |
for index in self.dfs[sampleName].keys(): | |
df = self.dfs[sampleName][index]["df"] | |
for var in list(self.variables.keys()): | |
if "tree" in self.variables[var].keys(): | |
continue | |
for i, _var in enumerate(self.variables[var]["name"].split(":")): | |
n = var if i == 0 else var + f"_{i}" | |
if _var not in bigColumnNames: | |
# the variable expr does not exist, create it | |
df = df.Define(n, _var) | |
elif n not in bigColumnNames: | |
# the variable expr exists in the df, but not the variable key | |
# use alias | |
df = df.Alias(n, _var) | |
elif n == _var and n in bigColumnNames: | |
# since the variable name and expression are equal and are already present in the df don't do anything | |
pass | |
else: | |
# FIXME | |
print("Error, cannot define variable") | |
sys.exit() | |
self.dfs[sampleName][index]["df"] = df | |
self.dfs[sampleName][index]["columnNames"] = list( | |
map(lambda k: str(k), df.GetColumnNames()) | |
) | |
def loadBranches(self): | |
""" | |
Loads branches (the ones specified in an ``alias`` with the ``tree`` key in them), | |
and checks if they are already in the dataframe columns, | |
if so it adds ``__`` at the beginning of the name. | |
Since variables are shared but not the aliases, | |
it could happen that a variable's name or expression is already defined | |
for a given df but not for another one -> | |
need to determine a common and compatible set of variables for all the many dfs. | |
This is done by gathering the largest set of column names. | |
""" | |
bigColumnNames = set() | |
for sampleName in self.dfs.keys(): | |
for index in self.dfs[sampleName].keys(): | |
columnNames = self.dfs[sampleName][index]["columnNames"] | |
bigColumnNames = bigColumnNames.union(set(columnNames)) | |
for sampleName in self.dfs.keys(): | |
for index in self.dfs[sampleName].keys(): | |
for var in list(self.variables.keys()): | |
if "tree" not in self.variables[var].keys(): | |
continue | |
for branch in list(self.variables[var]["tree"].keys()): | |
if ( | |
branch in bigColumnNames | |
and branch != self.variables[var]["tree"][branch] | |
): | |
# here I want to define a variable whose key is already in the column name list | |
# and its expression is different from its name | |
# therefore we will either create a Define or an Alias | |
# need to rename the new variable! | |
prefix = "__" | |
nVar = prefix + branch | |
while nVar in bigColumnNames: | |
prefix += "__" | |
nVar = prefix + branch | |
# print("changing variable", branch, "to " + nVar) | |
self.remappedVariables[nVar] = prefix | |
self.variables[var]["tree"][nVar] = self.variables[var][ | |
"tree" | |
][branch] | |
del self.variables[var]["tree"][branch] | |
for sampleName in self.dfs.keys(): | |
for index in self.dfs[sampleName].keys(): | |
df = self.dfs[sampleName][index]["df"] | |
for var in list(self.variables.keys()): | |
if "tree" not in self.variables[var].keys(): | |
continue | |
for branch in self.variables[var]["tree"].keys(): | |
# print("working on defining branch", branch) | |
if self.variables[var]["tree"][branch] not in bigColumnNames: | |
# the variable expr does not exist, create it | |
# print("define the branch") | |
df = df.Define(branch, self.variables[var]["tree"][branch]) | |
elif branch not in bigColumnNames: | |
# the variable expr exists in the df, but not the variable key | |
# use alias | |
# print("define an alias to the branch") | |
df = df.Alias(branch, self.variables[var]["tree"][branch]) | |
elif ( | |
branch == self.variables[var]["tree"][branch] | |
and branch in bigColumnNames | |
): | |
# since the variable name and expression are equal and are already present in the df don't do anything | |
pass | |
else: | |
# FIXME | |
print("Error, cannot define variable") | |
sys.exit() | |
self.dfs[sampleName][index]["df"] = df | |
self.dfs[sampleName][index]["columnNames"] = list( | |
map(lambda k: str(k), df.GetColumnNames()) | |
) | |
def createResults(self): | |
""" | |
Create empty dictionary for results, will store all the different histos | |
""" | |
self.results = {} | |
for cut in self.cuts.keys(): | |
self.results[cut] = {} | |
for variable in self.variables.keys(): | |
self.results[cut][variable] = {} | |
def splitSubsamples(self): | |
""" | |
Split samples into subsamples if needed | |
After this method the ``dfs`` attribute will be modified to contain the subsamples names instead of the original sample name | |
""" | |
sampleNames = set( | |
list(map(lambda k: k[0], list(filter(lambda k: len(k) == 6, self.samples)))) | |
) | |
for sampleName in sampleNames: | |
_sample = list(filter(lambda k: k[0] == sampleName, self.samples))[0] | |
for subsample in list(_sample[5].keys()): | |
self.dfs[sampleName + "_" + subsample] = {} | |
for index in self.dfs[sampleName].keys(): | |
self.dfs[sampleName + "_" + subsample][index] = { | |
"parent": sampleName | |
} | |
subsampleCut = _sample[5][subsample] | |
subsampleWeight = '1.0' | |
if isinstance(subsampleCut, tuple) or isinstance(subsampleCut, list): | |
subsampleCut = _sample[5][subsample][0] | |
subsampleWeight = _sample[5][subsample][1] | |
self.dfs[sampleName + "_" + subsample][index]["df"] = self.dfs[ | |
sampleName | |
][index]["df"].Filter(subsampleCut).Redefine('weight', 'weight * ' + subsampleWeight) | |
self.dfs[sampleName + "_" + subsample][index][ | |
"columnNames" | |
] = self.dfs[sampleName][index]["columnNames"] | |
self.dfs[sampleName + "_" + subsample][index]["ttree"] = self.dfs[ | |
sampleName | |
][index]["ttree"] | |
del self.dfs[sampleName] | |
def create_cuts_vars(self): | |
""" | |
Defines ``Histo1D`` for each variable and cut and dataframe. It also creates dictionary for variations through ``VariationsFor()`` | |
""" | |
mergedCuts = self.cuts | |
variables = self.variables | |
for sampleName in self.dfs.keys(): | |
for index in self.dfs[sampleName].keys(): | |
df = self.dfs[sampleName][index]["df"] | |
for cut in mergedCuts.keys(): | |
df_cat = df.Filter(mergedCuts[cut]["expr"]) | |
opts = ROOT.RDF.RSnapshotOptions() | |
opts.fLazy = True | |
for var in list(variables.keys()): | |
if "tree" in variables[var].keys(): | |
if not mergedCuts[cut]["parent"] in variables[var]["cuts"]: | |
# del df_cat | |
continue | |
outpath = self.outputFileMap | |
if self.outputFileMap != "output.root": | |
outfileName = ( | |
".".join( | |
self.outputFileMap.split("/")[-1].split(".")[ | |
:-1 | |
] | |
) | |
+ f"__ALL__{cut}_{sampleName}_{str(index)}.root" | |
) | |
outpath = ( | |
"/".join(self.outputFileMap.split("/")[:-1]) | |
+ "/" | |
+ outfileName | |
) | |
else: | |
outpath = f"output_{cut}_{sampleName}_{str(index)}.root" | |
# _h = df_cat.Snapshot(cut + '/' + sampleName, outpath, list(self.variables[var]['tree'].keys()), opts) | |
# FIXME always save the weight! | |
""" | |
___cols = list(self.variables[var]["tree"].keys()) + ["weight"], | |
print(___cols) | |
print(type(df_cat)) | |
print(df_cat) | |
__cols = [str(k) for k in df_cat.GetColumnNames()] | |
#print(__cols) | |
print(set(___cols[0]).difference(__cols)) | |
""" | |
_h = df_cat.Snapshot( | |
"Events", | |
outpath, | |
list(self.variables[var]["tree"].keys()) + ["weight"], | |
opts, | |
) | |
else: | |
vs = variables[var]["name"].split(":") | |
histRange = [] | |
if len(variables[var]["range"]) == len(vs): | |
# bin endges are provided by the user | |
for ax in variables[var]["range"]: | |
histRange.extend([len(ax) - 1, array("d", ax)]) | |
else: | |
histRange = variables[var]["range"] | |
histRange = tuple(histRange) | |
if len(vs) == 1: | |
_h = df_cat.Histo1D( | |
(cut + "_" + var, "") + histRange, var, "weight" | |
) | |
elif len(vs) == 2: | |
varNames = [] | |
for i, _var in enumerate(vs): | |
n = var if i == 0 else var + f"_{i}" | |
varNames.append(n) | |
_h = df_cat.Histo2D( | |
(cut + "_" + var, "") + histRange, | |
*varNames, | |
"weight", | |
) | |
else: | |
print("Unknown dimension of histo for variable", var) | |
sys.exit() | |
if sampleName not in self.results[cut][var].keys(): | |
self.results[cut][var][sampleName] = {} | |
self.results[cut][var][sampleName][index] = _h | |
for cut in mergedCuts.keys(): | |
for var in list(variables.keys()): | |
if "tree" not in variables[var].keys(): | |
_s = self.results[cut][var][sampleName][index] | |
_s_var = ROOT.RDF.Experimental.VariationsFor(_s) | |
self.results[cut][var][sampleName][index] = _s_var | |
def convertResults(self): | |
""" | |
Gather resulting histograms and fold them if needed. | |
Systematics are also saved. | |
""" | |
mergedCuts = self.cuts | |
variables = self.variables | |
for sampleName in self.dfs.keys(): | |
for index in self.dfs[sampleName].keys(): | |
for cut in mergedCuts.keys(): | |
for var in list(variables.keys()): | |
if "tree" in variables[var].keys(): | |
# no need to process SnapShots | |
continue | |
_s_var = self.results[cut][var][sampleName][index] | |
_histos = {} | |
# _histos_special_nuis = {} | |
for _variation in list(map(lambda k: str(k), _s_var.GetKeys())): | |
_h_name = _variation.replace(":", "") | |
_h = 0 | |
_h = _s_var[_variation] | |
fold = variables[var].get("fold", 0) | |
if isinstance(_h, ROOT.TH1D): | |
if fold == 1 or fold == 3: | |
_from = 0 | |
_to = 1 | |
_fold(_h, _to, _from) | |
if fold == 2 or fold == 3: | |
lastBin = _h.GetNbinsX() | |
_from = lastBin + 1 | |
_to = lastBin | |
_fold(_h, _to, _from) | |
elif isinstance(_h, ROOT.TH2D): | |
nx = _h.GetNbinsX() | |
ny = _h.GetNbinsY() | |
if fold == 1 or fold == 3: | |
for i in range(0, ny+2): | |
_from = 0 | |
_to = 0 | |
if i == 0: | |
_from = _h.GetBin(0, i) | |
_to = _h.GetBin(1, i+1) | |
elif i == ny+1: | |
_from = _h.GetBin(0, i) | |
_to = _h.GetBin(1, i-1) | |
else: | |
_from = _h.GetBin(0, i) | |
_to = _h.GetBin(1, i) | |
_fold(_h, _to, _from) | |
for i in range(0, nx+2): | |
_from = 0 | |
_to = 0 | |
if i == 0: | |
_from = _h.GetBin(i, 0) | |
_to = _h.GetBin(i+1, 1) | |
elif i == ny+1: | |
_from = _h.GetBin(i, 0) | |
_to = _h.GetBin(i-1, 1) | |
else: | |
_from = _h.GetBin(i, 0) | |
_to = _h.GetBin(i, 1) | |
_fold(_h, _to, _from) | |
if fold == 2 or fold == 3: | |
for i in range(0, ny+2): | |
_from = 0 | |
_to = 0 | |
if i == 0: | |
_from = _h.GetBin(nx+1, i) | |
_to = _h.GetBin(nx , i+1) | |
elif i == ny+1: | |
_from = _h.GetBin(nx+1, i) | |
_to = _h.GetBin(nx , i-1) | |
else: | |
_from = _h.GetBin(nx+1, i) | |
_to = _h.GetBin(nx , i) | |
_fold(_h, _to, _from) | |
for i in range(0, nx+2): | |
_from = 0 | |
_to = 0 | |
if i == 0: | |
_from = _h.GetBin(i, nx+1) | |
_to = _h.GetBin(i+1, nx) | |
elif i == ny+1: | |
_from = _h.GetBin(i, nx+1) | |
_to = _h.GetBin(i-1, nx) | |
else: | |
_from = _h.GetBin(i, nx+1) | |
_to = _h.GetBin(i, nx) | |
_fold(_h, _to, _from) | |
#unroll 2d | |
_name = _h.GetName() | |
_h.SetName(_name + "_old") | |
new_name = _name + "_" + _h_name + "_" + sampleName + "_" + str(index) | |
# print('creating histo with name', new_name) | |
_h2 = ROOT.TH1D(new_name, _h.GetName(), nx*ny, 1, nx*ny) | |
for i in range(1, nx+1): | |
for j in range(1, ny+1): | |
new_ind = (i-1) * ny + j | |
old_ind = _h.GetBin(i,j) | |
_h2.SetBinContent(new_ind, _h.GetBinContent(old_ind)) | |
_h2.SetBinError(new_ind, _h.GetBinError(old_ind)) | |
del _h | |
_h = _h2 | |
# if 'SPECIAL_NUIS' in _h_name: | |
# nuisName = _variation.split(':')[0] | |
# if nuisName in _histos_special_nuis.keys(): | |
# _histos_special_nuis[nuisName].append(_h.Clone()) | |
# else: | |
# _histos_special_nuis[nuisName] = [_h.Clone()] | |
# continue | |
_histos[_h_name] = _h#.Clone() | |
# del _h | |
# del self.results[cut][var][sampleName]['object'] | |
# replace the object with the dictionary of histos | |
# for nuisName in _histos_special_nuis.keys(): | |
# # print(nuisName) | |
# hName = nuisName.split('_SPECIAL_NUIS')[0] | |
# h_up = _histos_special_nuis[nuisName][0].Clone() | |
# h_do = _histos_special_nuis[nuisName][0].Clone() | |
# variations = np.empty((len(_histos_special_nuis[nuisName]), _histos_special_nuis[nuisName][0].GetNbinsX()+2), dtype=float) | |
# for i in range(len(_histos_special_nuis[nuisName])): | |
# variations[i, :] = hist2array(_histos_special_nuis[nuisName][i], overflow=True) | |
# arrup = 0 | |
# arrdo = 0 | |
# if nuisName.endswith('envelope'): | |
# arrup = np.max(variations, axis=0) | |
# arrdo = np.min(variations, axis=0) | |
# elif nuisName.endswith('rms'): | |
# vnominal = hist2array(_histos[hName], overflow=True) | |
# arrnom = np.tile(vnominal,(variations.shape[0], 1) ) | |
# arrv = np.sqrt(np.mean(np.square(variations - arrnom), axis=0)) | |
# arrup = vnominal + arrv | |
# arrdo = vnominal - arrv | |
# else: | |
# continue | |
# for i in range(0, h_up.GetNbinsX() +2 ): | |
# h_up.SetBinContent(i, arrup[i]) | |
# h_do.SetBinContent(i, arrdo[i]) | |
# _histos[hName + 'Up'] = h_up.Clone() | |
# _histos[hName + 'Down'] = h_do.Clone() | |
self.results[cut][var][sampleName][index] = _histos | |
def saveResults(self): | |
""" | |
Save results in a root file. | |
If ``Snapshot`` were created will merge them in a output file. | |
""" | |
f = ROOT.TFile(self.outputFileMap, "recreate") | |
toBeDeleted = [] | |
openedFiles = [] | |
openedTrees = [] | |
for sampleName in self.dfs.keys(): | |
for index in self.dfs[sampleName].keys(): | |
for cut in self.cuts.keys(): | |
for var in list(self.variables.keys()): | |
if "tree" in self.variables[var].keys(): | |
if ( | |
not self.cuts[cut]["parent"] | |
in self.variables[var]["cuts"] | |
): | |
continue | |
if self.outputFileMap != "output.root": | |
outfileName = ( | |
".".join( | |
self.outputFileMap.split("/")[-1].split(".")[ | |
:-1 | |
] | |
) | |
+ f"__ALL__{cut}_{sampleName}_{str(index)}.root" | |
) | |
outpath = ( | |
"/".join(self.outputFileMap.split("/")[:-1]) | |
+ "/" | |
+ outfileName | |
) | |
else: | |
outpath = f"output_{cut}_{sampleName}_{str(index)}.root" | |
_f = ROOT.TFile(outpath) | |
openedFiles.append(_f) | |
if _f.GetListOfKeys().Contains("Events"): | |
t = _f.Get("Events") | |
openedTrees.append(t) | |
f.cd("/") | |
folder = f"trees/{cut}/{sampleName}/" | |
# print("Creating dir", folder, outpath) | |
toBeDeleted.append(outpath) | |
ROOT.gDirectory.mkdir(folder, "", True) | |
ROOT.gDirectory.cd("/" + folder) | |
t.CloneTree().Write() | |
else: | |
print( | |
"No TTree was created for", | |
cut, | |
var, | |
sampleName, | |
index, | |
"even if requested", | |
file=sys.stderr, | |
) | |
f.cd("/") | |
for cut_cat in list(self.results.keys()): | |
_cut_cat = f.mkdir(cut_cat) | |
for var in list(self.results[cut_cat].keys()): | |
if "tree" in self.variables[var].keys(): | |
continue | |
_var = "" | |
if var in list(self.remappedVariables.keys()): | |
# remove the __ at the beginning | |
_var = var[len(self.remappedVariables[var]) :] | |
else: | |
_var = var | |
_cut_cat.mkdir(_var) | |
f.cd("/" + cut_cat + "/" + _var) | |
for sampleName in list(self.results[cut_cat][var].keys()): | |
# should first merge histos | |
mergedHistos = {} | |
for index in list(self.results[cut_cat][var][sampleName].keys()): | |
for hname in list( | |
self.results[cut_cat][var][sampleName][index].keys() | |
): | |
if hname not in mergedHistos.keys(): | |
mergedHistos[hname] = self.results[cut_cat][var][ | |
sampleName | |
][index][hname].Clone() | |
else: | |
mergedHistos[hname].Add( | |
self.results[cut_cat][var][sampleName][index][hname] | |
) | |
for hname in mergedHistos.keys(): | |
if hname == "nominal": | |
_string = "histo_" + sampleName | |
mergedHistos[hname].SetName(_string) | |
mergedHistos[hname].SetTitle(_string) | |
else: | |
_string = "histo_" + sampleName + "_" + hname | |
mergedHistos[hname].SetName(_string) | |
mergedHistos[hname].SetTitle(_string) | |
mergedHistos[hname].Write() | |
f.Close() | |
# clean snapshot files | |
proc = "" | |
for _file in toBeDeleted: | |
proc += f" rm {_file}; " | |
import subprocess | |
_proc = subprocess.Popen(proc, shell=True) | |
_proc.wait() | |
# print(proc) | |
def mergeSaveResults(self): | |
f = ROOT.TFile(self.outputFileMap, "recreate") | |
for cut_cat in list(self.results.keys()): | |
_cut_cat = f.mkdir(cut_cat) | |
for var in list(self.results[cut_cat].keys()): | |
if "tree" in self.variables[var].keys(): | |
# no need to process SnapShots | |
continue | |
_cut_cat.mkdir(var) | |
f.cd("/" + cut_cat + "/" + var) | |
for sampleName in list(self.results[cut_cat][var].keys()): | |
# should first merge histos | |
mergedHistos = {} | |
for index in list(self.results[cut_cat][var][sampleName].keys()): | |
for hname in list( | |
self.results[cut_cat][var][sampleName][index].keys() | |
): | |
if hname not in mergedHistos.keys(): | |
mergedHistos[hname] = self.results[cut_cat][var][ | |
sampleName | |
][index][hname].Clone() | |
else: | |
mergedHistos[hname].Add( | |
self.results[cut_cat][var][sampleName][index][hname] | |
) | |
for hname in mergedHistos.keys(): | |
if hname == "nominal": | |
mergedHistos[hname].SetName("histo_" + sampleName) | |
else: | |
mergedHistos[hname].SetName( | |
"histo_" + sampleName + "_" + hname | |
) | |
mergedHistos[hname].Write() | |
f.Close() | |
def mergeAndSaveResults(self): | |
f = ROOT.TFile(self.outputFileMap, "recreate") | |
for cut_cat in list(self.results.keys()): | |
_cut_cat = f.mkdir(cut_cat) | |
for var in list(self.results[cut_cat].keys()): | |
if "tree" in self.variables[var].keys(): | |
# no need to process SnapShots | |
continue | |
_cut_cat.mkdir(var) | |
f.cd("/" + cut_cat + "/" + var) | |
for sampleName in list(self.results[cut_cat][var].keys()): | |
# should first merge histos | |
mergedHistos = {} | |
for index in list(self.results[cut_cat][var][sampleName].keys()): | |
for hname in list( | |
self.results[cut_cat][var][sampleName][index].keys() | |
): | |
if hname not in mergedHistos.keys(): | |
mergedHistos[hname] = self.results[cut_cat][var][ | |
sampleName | |
][index][hname].Clone() | |
else: | |
mergedHistos[hname].Add( | |
self.results[cut_cat][var][sampleName][index][hname] | |
) | |
for hname in mergedHistos.keys(): | |
if hname == "nominal": | |
mergedHistos[hname].SetName("histo_" + sampleName) | |
else: | |
mergedHistos[hname].SetName( | |
"histo_" + sampleName + "_" + hname | |
) | |
mergedHistos[hname].Write() | |
f.Close() | |
def run(self): | |
""" | |
Runs the analysis: | |
1. load the aliases without the ``afterNuis`` option | |
2. load the ``suffix`` systematics | |
3. load the alias ``weight`` | |
4. load the reweight systematics (they need the ``weight`` to be defined) | |
5. finally load the suffix systematics with the ``afterNuis`` option | |
After this important procedure it filters with ``preselection`` the many ``dfs``, loads systematics | |
loads ``variables``, creates the results dict, splits the samples, creates the cuts/var histos, runs the dataframes | |
and saves results. | |
""" | |
# load all aliases needed before nuisances of type suffix | |
self.loadAliases() | |
self.loadSystematicsSuffix() | |
# load alias weight needed before nuisances of type weight | |
self.loadAliasWeight() | |
self.loadSystematicsReweights() | |
self.loadSystematicsReweightsEnvelopeRMS() | |
# load all aliases remaining | |
self.loadAliases(True) | |
# apply preselections | |
for sampleName in self.dfs.keys(): | |
for index in self.dfs[sampleName].keys(): | |
self.dfs[sampleName][index]["df"] = self.dfs[sampleName][index][ | |
"df" | |
# ].Filter("(" + self.preselections + ") && abs(weight) > 0.0") | |
].Filter("(" + self.preselections + ")") | |
self.loadVariables() | |
self.loadBranches() | |
print("loaded all variables") | |
self.createResults() | |
print("created empty results dict") | |
#print(self.cuts) | |
#print(self.results) | |
self.splitSubsamples() | |
print("splitted samples") | |
self.create_cuts_vars() | |
print("created cuts") | |
""" | |
# FIXME RunGraphs can't handle results of VaraitionsFor, ask Vincenzo about it | |
# collect all the dataframes and run them | |
dfs = [] | |
for cut in self.cuts.keys(): | |
for var in self.variables.keys(): | |
for sampleName in self.dfs.keys(): | |
for index in self.dfs[sampleName].keys(): | |
# dfs.append(self.results[cut][var][sampleName][index]) | |
dfs.extend( | |
list(self.results[cut][var][sampleName][index].values())) | |
ROOT.RDF.RunGraphs(dfs) | |
""" | |
counts = [] | |
# register number of events in each df | |
for sampleName in self.dfs.keys(): | |
for index in self.dfs[sampleName].keys(): | |
counts.append(self.dfs[sampleName][index]["df"].Count()) | |
snapshots = [] | |
for cut in self.cuts.keys(): | |
for var in self.variables.keys(): | |
if ( | |
len(self.results[cut].get(var, [])) == 0 | |
or "tree" not in self.variables[var].keys() | |
): | |
# no snapshots for this combination of cut variable | |
continue | |
for sampleName in self.dfs.keys(): | |
for index in self.dfs[sampleName].keys(): | |
# dfs.append(self.results[cut][var][sampleName][index]) | |
snapshots.append(self.results[cut][var][sampleName][index]) | |
if len(snapshots) != 0: | |
ROOT.RDF.RunGraphs(snapshots) | |
for count in counts: | |
print("Number of events passing preselections:", count.GetValue()) | |
#print(self.results) | |
self.convertResults() | |
self.saveResults() | |
if __name__ == "__main__": | |
ROOT.gInterpreter.Declare('#include "headers.hh"') | |
exec(open("script.py").read()) | |
runner = RunAnalysis(samples, aliases, variables, cuts, nuisances, lumi) | |
runner.run() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment