Created
January 29, 2024 09:45
-
-
Save giorgiopizz/8df9f35d5d8cd0d579b78516c630bc6f to your computer and use it in GitHub Desktop.
New runner.py to unroll and fold in mkShapesRDF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from copy import deepcopy | |
import sys | |
import ROOT | |
from array import array | |
from mkShapesRDF.lib.parse_cpp import ParseCpp | |
import os | |
from math import sqrt | |
ROOT.gROOT.SetBatch(True) | |
ROOT.TH1.SetDefaultSumw2(True) | |
import numpy as np | |
# def hist2array(h, overflow=False, copy=True): | |
# shift = 0 | |
# if overflow: | |
# shift = 1 | |
# d = [] | |
# for i in range(1 - shift, h.GetNbinsX() + 1 + shift): | |
# d.append(h.GetBinContent(i)) | |
# d = np.array(d) | |
# return d | |
def hist2array(h, flow=False, copy=True, include_sumw2=True):
    """Convert a ROOT 1D histogram into numpy array(s) of bin contents.

    Parameters
    ----------
    h : ROOT.TH1D, ROOT.TH1F or ROOT.TH1I
        histogram to convert
    flow : bool, optional, default: False
        if True keep the under/overflow bins, otherwise return only the
        nbins visible bins (matches the legacy ``overflow`` semantics of
        the commented-out implementation above; the previous version had
        this flag inverted)
    copy : bool, optional, default: True
        if True return copies instead of views over the ROOT internal
        buffers (views become dangling once the histogram is deleted)
    include_sumw2 : bool, optional, default: True
        if True also return the array of summed squared weights

    Returns
    -------
    numpy.ndarray or (numpy.ndarray, numpy.ndarray)
        bin contents, plus sumw2 when ``include_sumw2`` is True
    """
    # nx includes underflow (bin 0) and overflow (bin nbins+1)
    nx = h.GetNbinsX() + 2
    # Buffer dtype must match the ROOT internal array type.
    # np.float / np.int were removed from numpy (1.24+): use the explicit
    # dtypes matching Float_t / Int_t.
    if isinstance(h, ROOT.TH1D):
        dtype = np.double
    elif isinstance(h, ROOT.TH1F):
        dtype = np.float32
    elif isinstance(h, ROOT.TH1I):
        dtype = np.int32
    else:
        print('Histogram of type', h, 'is not supported', file=sys.stderr)
        sys.exit(1)
    vals = np.ndarray((nx, ), dtype=dtype, buffer=h.GetArray())
    # Sumw2 is always stored as doubles (TArrayD), independent of the
    # histogram content type.
    sumw2 = np.ndarray((nx, ), dtype=np.double, buffer=h.GetSumw2().GetArray())
    # Trim the flow bins unless they were requested.  The previous
    # implementation also assigned the trimmed sumw2 to an unused local
    # (`sum2`), returning the untrimmed array.
    shift = 0 if flow else 1
    vals = vals[shift: nx - shift]
    sumw2 = sumw2[shift: nx - shift]
    if copy:
        vals = vals.copy()
        sumw2 = sumw2.copy()
    if include_sumw2:
        return vals, sumw2
    return vals
def _fold(_h, _to, _from): | |
_h.SetBinContent(_to, _h.GetBinContent(_to) + _h.GetBinContent(_from)) | |
sumw2 = _h.GetSumw2() | |
#_h.SetBinError(_to, sqrt(_h.GetBinError(_to) **2 + _h.GetBinError(_from)**2)) | |
sumw2[_to] = sumw2[_to] + sumw2[_from] | |
_h.SetBinContent(_from, 0.0) | |
sumw2[_from] = 0.0 | |
#_h.SetBinError(_from, 0.0) | |
class RunAnalysis: | |
r"""Class athat craeates ``dfs`` and runs the analysiss""" | |
@staticmethod | |
def splitSamples(samples, useFilesPerJob=True): | |
r"""static methods, takes a dictionary of samples and split them based on their weights and max num. of files | |
Parameters | |
---------- | |
samples : dict | |
dictionary of samples | |
useFilesPerJob : bool, optional, default: True | |
if you want to further split the samples based on max num. of files. | |
Returns | |
------- | |
`list of tuple` | |
each tuple will have a lenght of 5 (6 if subsamples are present), where the first element is the name of the sample, the second the list of files, the third the weight, and the fourth the index of this tuple compared to the other tuples of the same sample type, the fifth will be the isData flag (True if the sample is data, False otherwise). If subsamples are present, the sixth element will be the dict of subsamples | |
""" | |
# will contain all the different samples splitted based on their weights and max num. of files | |
splittedSamples = [] | |
for sampleName in list(samples.keys()): | |
types = ( | |
{} | |
) # this will contain the the different type of files with their special weights for this sampleName | |
# recall that samples[sampleName]['name'] contain a list of tuples, where a single tuple can have a size of 2 or 3: | |
# first position for name of the process (DYJetsToLL_M-50), second list of files, third a possible special weight | |
for filesType in samples[sampleName]["name"]: | |
# no special weight (will use '1'), and at least one file found | |
if len(filesType) == 2 and len(filesType[1]) > 0: | |
if "1" not in list(types.keys()): | |
types["1"] = [] | |
types["1"].append(filesType + ("1.0",)) | |
elif len(filesType) == 3 and len(filesType[1]) > 0: | |
if filesType[2] not in list(types.keys()): | |
types[filesType[2]] = [] | |
types[filesType[2]].append(filesType) | |
else: | |
print("Error", sampleName, filesType, file=sys.stderr) | |
print( | |
"Either the sample proc you specified has no files, or the weight had problems", | |
file=sys.stderr, | |
) | |
sys.exit() | |
i = 0 | |
for _type in list(types.keys()): | |
# split files based on FilesPerJob or include them all | |
__files = list(map(lambda k: k[1], types[_type])) | |
# flatted list of files | |
__files = [item for sublist in __files for item in sublist] | |
if useFilesPerJob: | |
dim = samples[sampleName].get("FilesPerJob", len(__files)) | |
else: | |
dim = len(__files) | |
__files = [__files[j : j + dim] for j in range(0, len(__files), dim)] | |
for ___files in __files: | |
# the weights for these files will be the product of the weight inside this sampele (i.e. samples[sampleName]['weight']) | |
# and the product of the special weights that is in common to all of those files (i.e. sampleType[0]) | |
# the common special weight can be retrived from the first of the list of files with this weight | |
# remember that the tuple has always size 3 now, the last position is for the special weight | |
weight = ( | |
"( " | |
+ samples[sampleName]["weight"] | |
+ " ) * ( " | |
+ types[_type][0][2] | |
+ " )" | |
) | |
isData = samples[sampleName].get("isData", False) | |
sampleType = (sampleName, ___files, weight, i, isData) | |
if "subsamples" in list(samples[sampleName].keys()): | |
sampleType += (samples[sampleName]["subsamples"],) | |
splittedSamples.append(sampleType) | |
i += 1 | |
return splittedSamples | |
@staticmethod
def getTTreeNomAndFriends(fnom, friends):
    """Build a TChain of the nominal files with friend chains attached.

    Args:
        fnom (list): list of nominal files
        friends (list of list): list of lists of friend files
            (nuisances TTrees with varied branches)

    Returns:
        TChain: chain of nominal files with every friend chain attached
    """
    chain = ROOT.TChain("Events")
    for path in fnom:
        chain.Add(path)
    for friendFiles in friends:
        friendChain = ROOT.TChain("Events")
        for path in friendFiles:
            friendChain.Add(path)
        chain.AddFriend(friendChain)
    return chain
@staticmethod | |
def getNuisanceFiles(nuisance, files): | |
"""Searches in the provided nuisance folder for the files with the same name of the nominal files | |
Args: | |
nuisance (dict): dict with the nuisance information | |
files (list): list of nominal files | |
Returns: | |
list of list: list with the down and up varied list of files | |
""" | |
_files = list(map(lambda k: k.split("/")[-1], files)) | |
nuisanceFilesDown = list( | |
map(lambda k: nuisance["folderDown"] + "/" + k, _files) | |
) | |
nuisanceFilesUp = list(map(lambda k: nuisance["folderUp"] + "/" + k, _files)) | |
for nuisFile in nuisanceFilesUp + nuisanceFilesDown: | |
# print(nuisFile) | |
if not os.path.exists(nuisFile): | |
print("File", nuisFile, "does not exist", file=sys.stderr) | |
sys.exit(1) | |
return [nuisanceFilesDown, nuisanceFilesUp] | |
@staticmethod | |
def index_sub(string, sub): | |
try: | |
return string.index(sub) | |
except ValueError: | |
return -10000 | |
def __init__(
    self,
    samples,
    aliases,
    variables,
    cuts,
    nuisances,
    lumi,
    limit=-1,
    outputFileMap="output.root",
):
    r"""
    Stores arguments in the class attributes and creates all the RDataFrame objects

    Parameters
    ----------
    samples : `list of tuple`
        same type as the return of the splitSamples method
    aliases : dict
        dict of aliases
    variables : dict
        dict of variables
    cuts : dict
        dict of cuts, contains two keys (preselections: str, cuts: dict)
    nuisances : dict
        dict of nuisances
    lumi : float
        lumi in fb-1
    limit : int, optional, default: -1
        limit of events to be processed
    outputFileMap : str, optional, defaults: 'output.root'
        full path + filename of the output root file.

    Returns
    -------
    None
    """
    self.samples = samples
    self.aliases = aliases
    self.variables = variables
    self.preselections = cuts["preselections"]
    # Flatten the cut dictionary: a dict-valued cut with categories is
    # expanded into one merged cut per category, named "<cut>_<cat>",
    # whose expression is the AND of the cut and category expressions;
    # a plain string cut is kept as-is.  Every merged cut records its
    # "parent" cut name.
    mergedCuts = {}
    for cut in list(cuts["cuts"].keys()):
        __cutExpr = ""
        if type(cuts["cuts"][cut]) == dict:
            __cutExpr = cuts["cuts"][cut]["expr"]
            for cat in list(cuts["cuts"][cut]["categories"].keys()):
                mergedCuts[cut + "_" + cat] = {"parent": cut}
                mergedCuts[cut + "_" + cat]["expr"] = (
                    __cutExpr + " && " + cuts["cuts"][cut]["categories"][cat]
                )
        elif type(cuts["cuts"][cut]) == str:
            __cutExpr = cuts["cuts"][cut]
            mergedCuts[cut] = {"expr": __cutExpr, "parent": cut}
    self.cuts = mergedCuts
    self.nuisances = nuisances
    self.lumi = lumi
    self.limit = limit
    self.outputFileMap = outputFileMap
    self.remappedVariables = {}
    self.dfs = {}
    r"""
    dfs is a dictionary containing as keys the sampleNames. The structure should look like this:

    .. code-block:: python

        dfs = {
            'DY':{
                0: {
                    'df': obj,
                    'columnNames': [...],
                    'usedVariables': [...],
                    'ttree': obj2, # needed otherwise seg fault in root
                },
            }
        }
    """
    # Collect every branch name referenced by the preselections, the
    # merged cuts and the variable expressions: only variations on these
    # columns are worth registering later.
    usedVariables = set()
    usedVariables = usedVariables.union(
        ParseCpp.listOfVariables(ParseCpp.parse(self.preselections))
    )
    for cut in mergedCuts.keys():
        usedVariables = usedVariables.union(
            ParseCpp.listOfVariables(ParseCpp.parse(mergedCuts[cut]["expr"]))
        )
    for var in variables.keys():
        if "name" in variables[var].keys():
            # "name" may hold several ':'-separated expressions
            # (e.g. for 2D histograms)
            __var = variables[var]["name"].split(":")
            __vars = list(
                map(lambda k: ParseCpp.listOfVariables(ParseCpp.parse(k)), __var)
            )
            for __v in __vars:
                usedVariables = usedVariables.union(__v)
        elif "tree" in variables[var].keys():
            # snapshot-type variable: scan every branch expression
            for branch in variables[var]["tree"].keys():
                usedVariables = usedVariables.union(
                    ParseCpp.listOfVariables(
                        ParseCpp.parse(variables[var]["tree"][branch])
                    )
                )
        else:
            print("Cannot process variable ", var, " nuisances might be faulty")
    # sample here is a tuple, first el is the sampleName, second list of files,
    # third the special weight, fourth is the index of tuple for the same sample,
    # fifth the isData flag, sixth if present the dict of subsamples
    for sample in samples:
        files = sample[1]
        sampleName = sample[0]
        friendsFiles = []
        usedFolders = []
        for nuisance in self.nuisances.values():
            # a nuisance without a "samples" key applies to every sample
            # (the default dict contains sampleName, so the test passes)
            if sampleName not in nuisance.get("samples", {sampleName: []}):
                continue
            if nuisance.get("type", "") == "shape":
                if nuisance.get("kind", "") == "suffix":
                    if nuisance.get("folderUp", "") != "":
                        # attach each varied-trees folder only once
                        if nuisance["folderUp"] in usedFolders:
                            continue
                        usedFolders.append(nuisance["folderUp"])
                        friendsFiles += RunAnalysis.getNuisanceFiles(
                            nuisance, files
                        )
        tnom = RunAnalysis.getTTreeNomAndFriends(files, friendsFiles)
        if limit != -1:
            df = ROOT.RDataFrame(tnom)
            # Range() restricts processing to the first `limit` events
            # (incompatible with implicit multi-threading)
            df = df.Range(limit)
        else:
            #ROOT.EnableImplicitMT()
            df = ROOT.RDataFrame(tnom)
        if sampleName not in self.dfs.keys():
            self.dfs[sample[0]] = {}
        # sample[3] is the chunk index for this sample; the ttree is kept
        # alive alongside the df, otherwise seg fault in root
        self.dfs[sampleName][sample[3]] = {
            "df": df,
            "ttree": tnom,
            "usedVariables": list(usedVariables),
        }
        self.dfs[sampleName][sample[3]]["columnNames"] = list(
            map(lambda k: str(k), df.GetColumnNames())
        )
    self.definedAliases = {}
    print("\n\nLoaded dataframes\n\n")
def loadAliases(self, afterNuis=False):
    """
    Load aliases in the dataframes. It does not create the special alias ``weight`` for which a special method is used.

    Aliases are accumulated into a string of chained ``.Define()`` calls
    which is evaluated once per dataframe at the end of the loop.

    Parameters
    ----------
    afterNuis : bool, optional, default: False
        if True, only aliases with the key ``afterNuis`` set to True will be loaded
    """
    aliases = self.aliases
    for sampleName in self.dfs.keys():
        for index in self.dfs[sampleName].keys():
            df = self.dfs[sampleName][index]["df"]
            # chained-Define string, eval'd against the local `df` below
            define_string = "df"
            usedVariables = set(self.dfs[sampleName][index]["usedVariables"])
            for alias in list(aliases.keys()):
                # only load aliases needed for nuisances!
                # if beforeNuis
                if (afterNuis) != (aliases[alias].get("afterNuis", False)):
                    # alias belongs to the other phase: skip it here but
                    # still record the variables it references, so their
                    # variations are not discarded
                    if "expr" in aliases[alias]:
                        usedVariables = usedVariables.union(
                            ParseCpp.listOfVariables(
                                ParseCpp.parse(aliases[alias]["expr"])
                            )
                        )
                    continue
                # "in list(dict)" checks key presence: alias restricted
                # to a subset of samples
                if "samples" in list(aliases[alias]):
                    if sampleName not in aliases[alias]["samples"]:
                        continue
                if alias in self.dfs[sampleName][index]["columnNames"]:
                    if alias != aliases[alias].get("expr", ""):
                        print(
                            f"Alias {alias} cannot be defined, column with that name already exists"
                        )
                        sys.exit()
                    else:
                        # since the alias key is equal to expr there's no need to keep the alias
                        del self.aliases[alias]
                        continue
                for line in aliases[alias].get("linesToAdd", []):
                    # RPLME_nThreads is used to set the number of threads
                    ROOT.gInterpreter.Declare(
                        line.replace("RPLME_nThreads", str(df.GetNSlots()))
                    )
                for line in aliases[alias].get("linesToProcess", []):
                    # NOTE(review): exec of configuration strings — only
                    # safe with trusted configuration files
                    exec(line, globals())
                if "expr" in list(aliases[alias].keys()):
                    define_string += (
                        f".Define('{alias}', '{aliases[alias]['expr']}') \\\n\t"
                    )
                    usedVariables = usedVariables.union(
                        ParseCpp.listOfVariables(
                            ParseCpp.parse(aliases[alias]["expr"])
                        )
                    )
                elif "exprSlot" in list(aliases[alias].keys()):
                    # DefineSlot variant: args[0] is the callable expression,
                    # args[1] the input column list
                    args = aliases[alias]["exprSlot"]
                    args[0] = args[0].replace("RPLME_nThreads", str(df.GetNSlots()))
                    define_string += (
                        f".DefineSlot('{alias}', '{args[0]}', [ {args[1]} ]) \\\n\t"
                    )
                    usedVariables = usedVariables.union(
                        ParseCpp.listOfVariables(
                            ParseCpp.parse(aliases[alias]["exprSlot"][1])
                        )
                    )
                elif "class" in list(aliases[alias].keys()):
                    define_string += f".Define('{alias}', '{aliases[alias]['class']} ( {aliases[alias].get('args', '')} )') \\\n\t"
            # evaluate the accumulated chain of Defines in one go
            df1 = eval(define_string)
            self.dfs[sampleName][index]["df"] = df1
            stringPrint = "before nuisances"
            if afterNuis:
                stringPrint = "after nuisances"
            print(
                f"\n\nLoaded all aliases {stringPrint} for {sampleName} index {index}\n\n"
            )
            self.dfs[sampleName][index]["usedVariables"] = list(usedVariables)
            self.dfs[sampleName][index]["columnNames"] = list(
                map(lambda k: str(k), df1.GetColumnNames())
            )
def loadAliasWeight(self):
    """
    Loads only the special alias ``weight`` in the dataframes.

    For MC samples the weight is the sample weight multiplied by the
    luminosity; for data samples the sample weight is used unchanged.
    """
    samples = self.samples
    aliases = self.aliases
    for sampleName in self.dfs.keys():
        for index in self.dfs[sampleName].keys():
            df = self.dfs[sampleName][index]["df"]  # noqa: F841
            # Define the most important alias: the weight!
            # retrieve the sample tuple matching this name and chunk index
            sample = list(
                filter(lambda k: k[0] == sampleName and k[3] == index, samples)
            )[0]
            weight = sample[2]
            isData = sample[4]
            if not isData:
                aliases["weight"] = {"expr": str(self.lumi) + " * " + weight}
            else:
                aliases["weight"] = {"expr": weight}
            print("\n\n", sampleName, "\n\n", aliases["weight"])
            # load the alias weight
            define_string = "df"
            alias = "weight"
            if alias in self.dfs[sampleName][index]["columnNames"]:
                print(
                    f"Alias {alias} cannot be defined, column with that name already exists"
                )
                sys.exit()
            define_string += (
                f".Define('{alias}', '{aliases[alias]['expr']}') \\\n\t"
            )
            # evaluate the chained Define against the local `df`
            df1 = eval(define_string)
            self.dfs[sampleName][index]["df"] = df1
            print(f"\n\nLoaded alias weight for {sampleName} index {index}\n\n")
            self.dfs[sampleName][index]["columnNames"].append("weight")
def loadSystematicsSuffix(self):
    """
    Loads systematics of type ``suffix`` in the dataframes.

    For every varied column found in the friend trees (columns ending
    with the nuisance's ``mapDown`` suffix) the corresponding nominal
    column is registered with ``df.Vary`` so RDataFrame can propagate
    the Down/Up variations.
    """
    for sampleName in self.dfs.keys():
        for index in self.dfs[sampleName].keys():
            df = self.dfs[sampleName][index]["df"]
            columnNames = self.dfs[sampleName][index]["columnNames"]
            nuisances = self.nuisances
            # nuisance key is not used
            for _, nuisance in list(nuisances.items()):
                # a nuisance without a "samples" key applies to all samples
                if sampleName not in nuisance.get("samples", {sampleName: []}):
                    continue
                if nuisance.get("type", "") == "shape":
                    if nuisance.get("kind", "") == "suffix":
                        variation = nuisance["mapDown"]
                        separator = nuisance.get("separator", "_")
                        variedCols = list(
                            filter(lambda k: k.endswith(variation), columnNames)
                        )
                        if len(variedCols) == 0:
                            print(f"No varied columns for {variation}")
                            sys.exit()
                        # if 'JER' in nuisance['name']:
                        #     print(variedCols)
                        # Recover the nominal column name: strip the friend
                        # tree prefix ("Events.") when present and the
                        # trailing "<separator><variation>" suffix.
                        # index_sub's -10000 sentinel plus len("Events.")
                        # still yields a slice from the string start.
                        baseCols = list(
                            map(
                                lambda k: k[
                                    RunAnalysis.index_sub(k, "Events.")
                                    + len("Events.") : -len(separator + variation)
                                ]
                                if "Events" in k
                                else k[: -len(separator + variation)],
                                variedCols,
                            )
                        )
                        for baseCol in baseCols:
                            if (
                                baseCol
                                not in self.dfs[sampleName][index]["usedVariables"]
                            ):
                                # baseCol is never used -> useless to register variation
                                print("unused variable", baseCol)
                                continue
                            print(baseCol in [str(k) for k in df.GetColumnNames()])
                            if not (baseCol in [str(k) for k in df.GetColumnNames()]):
                                continue
                            if "bool" not in str(df.GetColumnType(baseCol)).lower():
                                # non-boolean columns get an extra per-sample
                                # multiplicative factor from the nuisance
                                # config ([up, down] ordering in "samples")
                                varNameDown = (
                                    baseCol
                                    + separator
                                    + nuisance["mapDown"]
                                    + "*"
                                    + nuisance["samples"][sampleName][1]
                                )
                                varNameUp = (
                                    baseCol
                                    + separator
                                    + nuisance["mapUp"]
                                    + "*"
                                    + nuisance["samples"][sampleName][0]
                                )
                            else:
                                varNameDown = (
                                    baseCol + separator + nuisance["mapDown"]
                                )
                                varNameUp = baseCol + separator + nuisance["mapUp"]
                            _type = df.GetColumnType(baseCol)
                            # RVec<type>{down, up} expression consumed by Vary
                            expr = (
                                ParseCpp.RVecExpression(_type)
                                + "{"
                                + f"{varNameDown}, {varNameUp}"
                                + "}"
                            )
                            if "JER" in nuisance["name"]:
                                print("JER expr", expr)
                            df = df.Vary(
                                baseCol,
                                expr,
                                variationTags=["Down", "Up"],
                                variationName=nuisance["name"],
                            )
                    else:
                        continue
                # else:
                #     print("Unsupported nuisance")
                #     sys.exit()
            self.dfs[sampleName][index]["df"] = df
def loadSystematicsReweights(self):
    """
    Loads systematics of kind ``weight`` in the dataframes.

    The nominal ``weight`` column is varied with ``df.Vary`` using the
    per-sample up/down reweight expressions found in the nuisance config.
    """
    for sampleName in self.dfs.keys():
        for index in self.dfs[sampleName].keys():
            df = self.dfs[sampleName][index]["df"]
            columnNames = self.dfs[sampleName][index]["columnNames"]
            nuisances = self.nuisances
            # nuisance key is not used
            for _, nuisance in list(nuisances.items()):
                if sampleName not in nuisance.get("samples", {sampleName: []}):
                    continue
                if nuisance.get("type", "") == "shape":
                    if nuisance.get("kind", "") == "weight":
                        # weights holds the [up, down] expressions for this sample
                        weights = nuisance["samples"].get(sampleName, None)
                        if weights is not None:
                            variedNames = []
                            # define helper columns only when the expression
                            # is not already an existing column
                            if weights[0] not in columnNames:
                                df = df.Define(nuisance["name"] + "_up", weights[0])
                                variedNames.append(nuisance["name"] + "_up")
                            else:
                                variedNames.append(weights[0])
                            if weights[1] not in columnNames:
                                df = df.Define(
                                    nuisance["name"] + "_down", weights[1]
                                )
                                variedNames.append(nuisance["name"] + "_down")
                            else:
                                variedNames.append(weights[1])
                            # build RVec{weight*down, weight*up} with a cast
                            # matching the type of the weight column
                            if df.GetColumnType("weight") == "double":
                                expr = (
                                    "ROOT::RVecD"
                                    + "{ weight * (double)"
                                    + f"{variedNames[1]}, weight * (double) {variedNames[0]}"
                                    + "}"
                                )
                            elif df.GetColumnType("weight") == "float":
                                expr = (
                                    "ROOT::RVecF"
                                    + "{ weight * (float)"
                                    + f"{variedNames[1]}, weight * (float) {variedNames[0]}"
                                    + "}"
                                )
                            else:
                                print(
                                    "Weight column has unknown type:",
                                    df.GetColumnType("weight"),
                                    "while varied is: ",
                                    df.GetColumnType(variedNames[1]),
                                )
                                sys.exit()
                            df = df.Vary(
                                "weight",
                                expr,
                                variationTags=["Down", "Up"],
                                variationName=nuisance["name"],
                            )
                    else:
                        continue
                # else:
                #     print("Unsupported nuisance")
                #     sys.exit()
            self.dfs[sampleName][index]["df"] = df
def loadSystematicsReweightsEnvelopeRMS(self):
    """
    Loads systematics of type ``weight_envelope`` or type ``weight_rms`` in the dataframes.

    All reweight expressions of the nuisance are registered at once as
    variations of ``weight``; the ``_SPECIAL_NUIS_<kind>`` suffix appended
    to the variation name marks how they should later be combined
    (envelope, rms or square).
    """
    for sampleName in self.dfs.keys():
        for index in self.dfs[sampleName].keys():
            df = self.dfs[sampleName][index]["df"]
            columnNames = self.dfs[sampleName][index]["columnNames"]
            nuisances = self.nuisances
            # nuisance key is not used
            for _, nuisance in list(nuisances.items()):
                if sampleName not in nuisance.get("samples", {sampleName: []}):
                    continue
                if nuisance.get("type", "") == "shape":
                    if (
                        nuisance.get("kind", "") == "weight_envelope"
                        or nuisance.get("kind", "") == "weight_rms"
                        or nuisance.get("kind", "") == "weight_square"
                    ):
                        weights = nuisance["samples"].get(sampleName, None)
                        if weights is not None:
                            variedNames = []
                            variedTags = []
                            for i, weight in enumerate(weights):
                                # define a helper column only when the
                                # expression is not already a column
                                if weight not in columnNames:
                                    nuisName = f'{nuisance["name"]}_{i}'
                                    df = df.Define(nuisName, weight)
                                    variedNames.append(nuisName)
                                    variedTags.append(str(i))
                                else:
                                    variedNames.append(weight)
                                    variedTags.append(str(i))
                            # build RVec{weight*w0, weight*w1, ...} with a
                            # cast matching the weight column type
                            if df.GetColumnType("weight") == "double":
                                expr = (
                                    "ROOT::RVecD"
                                    + "{ "
                                    + ", ".join(
                                        [
                                            f"weight * (double) {variedName}"
                                            for variedName in variedNames
                                        ]
                                    )
                                    + "}"
                                )
                            elif df.GetColumnType("weight") == "float":
                                expr = (
                                    "ROOT::RVecF"
                                    + "{ "
                                    + ", ".join(
                                        [
                                            f"weight * (float) {variedName}"
                                            for variedName in variedNames
                                        ]
                                    )
                                    + "}"
                                )
                            else:
                                print(
                                    "Weight column has unknown type:",
                                    df.GetColumnType("weight"),
                                    "while varied is: ",
                                    df.GetColumnType(variedNames[0]),
                                )
                                sys.exit()
                            df = df.Vary(
                                "weight",
                                expr,
                                variationTags=variedTags,
                                variationName=nuisance["name"]
                                + "_SPECIAL_NUIS_"
                                + nuisance["kind"].split("_")[-1],
                            )
                    else:
                        continue
            self.dfs[sampleName][index]["df"] = df
def loadVariables(self):
    """
    Loads variables (not the ones with the 'tree' key in them),
    and checks if they are already in the dataframe columns,
    if so it adds ``__`` at the beginning of the name.

    Since variables are shared but not the aliases,
    it could happen that a variable's name or expression is already defined
    for a given df but not for another one ->
    need to determine a common and compatible set of variables for all the many dfs.
    This is done by gathering the largest set of column names.
    """
    # union of the column names of every df: the common reference set
    bigColumnNames = set()
    for sampleName in self.dfs.keys():
        for index in self.dfs[sampleName].keys():
            columnNames = self.dfs[sampleName][index]["columnNames"]
            bigColumnNames = bigColumnNames.union(set(columnNames))
    for sampleName in self.dfs.keys():
        for index in self.dfs[sampleName].keys():
            for var in list(self.variables.keys()):
                if "tree" in self.variables[var].keys():
                    continue
                if var in bigColumnNames and var != self.variables[var]["name"]:
                    # here I want to define a variable whose key is already in the column name list
                    # and its expression is different from its name
                    # therefore we will either create a Define or an Alias
                    # need to rename the new variable!
                    prefix = "__"
                    nVar = prefix + var
                    while nVar in bigColumnNames:
                        prefix += "__"
                        nVar = prefix + var
                    # print("changing variable", var, "to " + nVar)
                    self.remappedVariables[nVar] = prefix
                    self.variables[nVar] = deepcopy(self.variables[var])
                    del self.variables[var]
    for sampleName in self.dfs.keys():
        for index in self.dfs[sampleName].keys():
            df = self.dfs[sampleName][index]["df"]
            for var in list(self.variables.keys()):
                if "tree" in self.variables[var].keys():
                    continue
                # multi-dimensional variables carry ':'-separated
                # expressions; the i-th one is registered as "<var>_<i>"
                # for i > 0
                for i, _var in enumerate(self.variables[var]["name"].split(":")):
                    n = var if i == 0 else var + f"_{i}"
                    if _var not in bigColumnNames:
                        # the variable expr does not exist, create it
                        df = df.Define(n, _var)
                    elif n not in bigColumnNames:
                        # the variable expr exists in the df, but not the variable key
                        # use alias
                        df = df.Alias(n, _var)
                    elif n == _var and n in bigColumnNames:
                        # since the variable name and expression are equal and are already present in the df don't do anything
                        pass
                    else:
                        # FIXME
                        print("Error, cannot define variable")
                        sys.exit()
            self.dfs[sampleName][index]["df"] = df
            self.dfs[sampleName][index]["columnNames"] = list(
                map(lambda k: str(k), df.GetColumnNames())
            )
def loadBranches(self):
    """
    Loads branches (the ones specified in a variable with the ``tree`` key in them),
    and checks if they are already in the dataframe columns,
    if so it adds ``__`` at the beginning of the name.

    Since variables are shared but not the aliases,
    it could happen that a variable's name or expression is already defined
    for a given df but not for another one ->
    need to determine a common and compatible set of variables for all the many dfs.
    This is done by gathering the largest set of column names.
    """
    # union of the column names of every df: the common reference set
    bigColumnNames = set()
    for sampleName in self.dfs.keys():
        for index in self.dfs[sampleName].keys():
            columnNames = self.dfs[sampleName][index]["columnNames"]
            bigColumnNames = bigColumnNames.union(set(columnNames))
    for sampleName in self.dfs.keys():
        for index in self.dfs[sampleName].keys():
            for var in list(self.variables.keys()):
                if "tree" not in self.variables[var].keys():
                    continue
                for branch in list(self.variables[var]["tree"].keys()):
                    if (
                        branch in bigColumnNames
                        and branch != self.variables[var]["tree"][branch]
                    ):
                        # here I want to define a variable whose key is already in the column name list
                        # and its expression is different from its name
                        # therefore we will either create a Define or an Alias
                        # need to rename the new variable!
                        prefix = "__"
                        nVar = prefix + branch
                        while nVar in bigColumnNames:
                            prefix += "__"
                            nVar = prefix + branch
                        # print("changing variable", branch, "to " + nVar)
                        self.remappedVariables[nVar] = prefix
                        self.variables[var]["tree"][nVar] = self.variables[var][
                            "tree"
                        ][branch]
                        del self.variables[var]["tree"][branch]
    for sampleName in self.dfs.keys():
        for index in self.dfs[sampleName].keys():
            df = self.dfs[sampleName][index]["df"]
            for var in list(self.variables.keys()):
                if "tree" not in self.variables[var].keys():
                    continue
                for branch in self.variables[var]["tree"].keys():
                    # print("working on defining branch", branch)
                    if self.variables[var]["tree"][branch] not in bigColumnNames:
                        # the variable expr does not exist, create it
                        # print("define the branch")
                        df = df.Define(branch, self.variables[var]["tree"][branch])
                    elif branch not in bigColumnNames:
                        # the variable expr exists in the df, but not the variable key
                        # use alias
                        # print("define an alias to the branch")
                        df = df.Alias(branch, self.variables[var]["tree"][branch])
                    elif (
                        branch == self.variables[var]["tree"][branch]
                        and branch in bigColumnNames
                    ):
                        # since the variable name and expression are equal and are already present in the df don't do anything
                        pass
                    else:
                        # FIXME
                        print("Error, cannot define variable")
                        sys.exit()
            self.dfs[sampleName][index]["df"] = df
            self.dfs[sampleName][index]["columnNames"] = list(
                map(lambda k: str(k), df.GetColumnNames())
            )
def createResults(self): | |
""" | |
Create empty dictionary for results, will store all the different histos | |
""" | |
self.results = {} | |
for cut in self.cuts.keys(): | |
self.results[cut] = {} | |
for variable in self.variables.keys(): | |
self.results[cut][variable] = {} | |
def splitSubsamples(self):
    """
    Split samples into subsamples if needed

    After this method the ``dfs`` attribute will be modified to contain the subsamples names instead of the original sample name
    """
    # samples with 6-element tuples are the ones carrying a subsample dict
    sampleNames = set(
        list(map(lambda k: k[0], list(filter(lambda k: len(k) == 6, self.samples))))
    )
    for sampleName in sampleNames:
        _sample = list(filter(lambda k: k[0] == sampleName, self.samples))[0]
        for subsample in list(_sample[5].keys()):
            self.dfs[sampleName + "_" + subsample] = {}
            for index in self.dfs[sampleName].keys():
                self.dfs[sampleName + "_" + subsample][index] = {
                    "parent": sampleName
                }
                # the subsample df is the parent df filtered with the
                # subsample selection expression (_sample[5][subsample])
                self.dfs[sampleName + "_" + subsample][index]["df"] = self.dfs[
                    sampleName
                ][index]["df"].Filter(_sample[5][subsample])
                self.dfs[sampleName + "_" + subsample][index][
                    "columnNames"
                ] = self.dfs[sampleName][index]["columnNames"]
                # keep the ttree reference alive, otherwise seg fault in root
                self.dfs[sampleName + "_" + subsample][index]["ttree"] = self.dfs[
                    sampleName
                ][index]["ttree"]
        del self.dfs[sampleName]
def create_cuts_vars(self):
    """
    Defines ``Histo1D`` for each variable and cut and dataframe. It also creates dictionary for variations through ``VariationsFor()``

    Variables carrying a ``tree`` key are written out as lazy Snapshots
    instead of histograms.
    """
    mergedCuts = self.cuts
    variables = self.variables
    for sampleName in self.dfs.keys():
        for index in self.dfs[sampleName].keys():
            df = self.dfs[sampleName][index]["df"]
            for cut in mergedCuts.keys():
                df_cat = df.Filter(mergedCuts[cut]["expr"])
                opts = ROOT.RDF.RSnapshotOptions()
                # lazy snapshot: only written when the event loop runs
                opts.fLazy = True
                for var in list(variables.keys()):
                    if "tree" in variables[var].keys():
                        # snapshot-type variable: only produced for the
                        # cuts listed in its "cuts" key
                        if not mergedCuts[cut]["parent"] in variables[var]["cuts"]:
                            # del df_cat
                            continue
                        # derive a per-cut/sample/chunk output file path
                        # from outputFileMap
                        outpath = self.outputFileMap
                        if self.outputFileMap != "output.root":
                            outfileName = (
                                ".".join(
                                    self.outputFileMap.split("/")[-1].split(".")[
                                        :-1
                                    ]
                                )
                                + f"__ALL__{cut}_{sampleName}_{str(index)}.root"
                            )
                            outpath = (
                                "/".join(self.outputFileMap.split("/")[:-1])
                                + "/"
                                + outfileName
                            )
                        else:
                            outpath = f"output_{cut}_{sampleName}_{str(index)}.root"
                        # _h = df_cat.Snapshot(cut + '/' + sampleName, outpath, list(self.variables[var]['tree'].keys()), opts)
                        # FIXME always save the weight!
                        _h = df_cat.Snapshot(
                            "Events",
                            outpath,
                            list(self.variables[var]["tree"].keys()) + ["weight"],
                            opts,
                        )
                    else:
                        vs = variables[var]["name"].split(":")
                        histRange = []
                        if len(variables[var]["range"]) == len(vs):
                            # bin edges are provided by the user
                            for ax in variables[var]["range"]:
                                histRange.extend([len(ax) - 1, array("d", ax)])
                        else:
                            histRange = variables[var]["range"]
                        histRange = tuple(histRange)
                        if len(vs) == 1:
                            _h = df_cat.Histo1D(
                                (cut + "_" + var, "") + histRange, var, "weight"
                            )
                        elif len(vs) == 2:
                            # the i-th axis column is named "<var>_<i>"
                            # for i > 0 (see loadVariables)
                            varNames = []
                            for i, _var in enumerate(vs):
                                n = var if i == 0 else var + f"_{i}"
                                varNames.append(n)
                            _h = df_cat.Histo2D(
                                (cut + "_" + var, "") + histRange,
                                *varNames,
                                "weight",
                            )
                        else:
                            print("Unknown dimension of histo for variable", var)
                            sys.exit()
                    if sampleName not in self.results[cut][var].keys():
                        self.results[cut][var][sampleName] = {}
                    self.results[cut][var][sampleName][index] = _h
            # register the systematic variations for every booked histogram
            for cut in mergedCuts.keys():
                for var in list(variables.keys()):
                    if "tree" not in variables[var].keys():
                        _s = self.results[cut][var][sampleName][index]
                        _s_var = ROOT.RDF.Experimental.VariationsFor(_s)
                        self.results[cut][var][sampleName][index] = _s_var
def convertResults(self): | |
""" | |
Gather resulting histograms and fold them if needed. | |
Systematics are also saved. | |
""" | |
mergedCuts = self.cuts | |
variables = self.variables | |
for sampleName in self.dfs.keys(): | |
for index in self.dfs[sampleName].keys(): | |
for cut in mergedCuts.keys(): | |
for var in list(variables.keys()): | |
if "tree" in variables[var].keys(): | |
# no need to process SnapShots | |
continue | |
_s_var = self.results[cut][var][sampleName][index] | |
_histos = {} | |
# _histos_special_nuis = {} | |
for _variation in list(map(lambda k: str(k), _s_var.GetKeys())): | |
_h_name = _variation.replace(":", "") | |
_h = 0 | |
_h = _s_var[_variation] | |
fold = variables[var].get("fold", 0) | |
if isinstance(_h, ROOT.TH1D): | |
if fold == 1 or fold == 3: | |
_from = 0 | |
_to = 1 | |
_fold(_h, _to, _from) | |
if fold == 2 or fold == 3: | |
lastBin = _h.GetNbinsX() | |
_from = lastBin + 1 | |
_to = lastBin | |
_fold(_h, _to, _from) | |
elif isinstance(_h, ROOT.TH2D): | |
nx = _h.GetNbinsX() | |
ny = _h.GetNbinsY() | |
if fold == 1 or fold == 3: | |
for i in range(0, ny+2): | |
_from = 0 | |
_to = 0 | |
if i == 0: | |
_from = _h.GetBin(0, i) | |
_to = _h.GetBin(1, i+1) | |
elif i == ny+1: | |
_from = _h.GetBin(0, i) | |
_to = _h.GetBin(1, i-1) | |
else: | |
_from = _h.GetBin(0, i) | |
_to = _h.GetBin(1, i) | |
_fold(_h, _to, _from) | |
for i in range(0, nx+2): | |
_from = 0 | |
_to = 0 | |
if i == 0: | |
_from = _h.GetBin(i, 0) | |
_to = _h.GetBin(i+1, 1) | |
elif i == ny+1: | |
_from = _h.GetBin(i, 0) | |
_to = _h.GetBin(i-1, 1) | |
else: | |
_from = _h.GetBin(i, 0) | |
_to = _h.GetBin(i, 1) | |
_fold(_h, _to, _from) | |
if fold == 2 or fold == 3: | |
for i in range(0, ny+2): | |
_from = 0 | |
_to = 0 | |
if i == 0: | |
_from = _h.GetBin(nx+1, i) | |
_to = _h.GetBin(nx , i+1) | |
elif i == ny+1: | |
_from = _h.GetBin(nx+1, i) | |
_to = _h.GetBin(nx , i-1) | |
else: | |
_from = _h.GetBin(nx+1, i) | |
_to = _h.GetBin(nx , i) | |
_fold(_h, _to, _from) | |
for i in range(0, nx+2): | |
_from = 0 | |
_to = 0 | |
if i == 0: | |
_from = _h.GetBin(i, nx+1) | |
_to = _h.GetBin(i+1, nx) | |
elif i == ny+1: | |
_from = _h.GetBin(i, nx+1) | |
_to = _h.GetBin(i-1, nx) | |
else: | |
_from = _h.GetBin(i, nx+1) | |
_to = _h.GetBin(i, nx) | |
_fold(_h, _to, _from) | |
#unroll 2d | |
_name = _h.GetName() | |
_h.SetName(_name + "_old") | |
new_name = _name + "_" + _h_name + "_" + sampleName + "_" + str(index) | |
# print('creating histo with name', new_name) | |
_h2 = ROOT.TH1D(new_name, _h.GetName(), nx*ny, 1, nx*ny) | |
for i in range(1, nx+1): | |
for j in range(1, ny+1): | |
new_ind = (i-1) * ny + j | |
old_ind = _h.GetBin(i,j) | |
_h2.SetBinContent(new_ind, _h.GetBinContent(old_ind)) | |
_h2.SetBinError(new_ind, _h.GetBinError(old_ind)) | |
del _h | |
_h = _h2 | |
_histos[_h_name] = _h#.Clone() | |
self.results[cut][var][sampleName][index] = _histos | |
def saveResults(self): | |
""" | |
Save results in a root file. | |
If ``Snapshot`` were created will merge them in a output file. | |
""" | |
f = ROOT.TFile(self.outputFileMap, "recreate") | |
toBeDeleted = [] | |
openedFiles = [] | |
openedTrees = [] | |
for sampleName in self.dfs.keys(): | |
for index in self.dfs[sampleName].keys(): | |
for cut in self.cuts.keys(): | |
for var in list(self.variables.keys()): | |
if "tree" in self.variables[var].keys(): | |
if ( | |
not self.cuts[cut]["parent"] | |
in self.variables[var]["cuts"] | |
): | |
continue | |
if self.outputFileMap != "output.root": | |
outfileName = ( | |
".".join( | |
self.outputFileMap.split("/")[-1].split(".")[ | |
:-1 | |
] | |
) | |
+ f"__ALL__{cut}_{sampleName}_{str(index)}.root" | |
) | |
outpath = ( | |
"/".join(self.outputFileMap.split("/")[:-1]) | |
+ "/" | |
+ outfileName | |
) | |
else: | |
outpath = f"output_{cut}_{sampleName}_{str(index)}.root" | |
_f = ROOT.TFile(outpath) | |
openedFiles.append(_f) | |
if _f.GetListOfKeys().Contains("Events"): | |
t = _f.Get("Events") | |
openedTrees.append(t) | |
f.cd("/") | |
folder = f"trees/{cut}/{sampleName}/" | |
# print("Creating dir", folder, outpath) | |
toBeDeleted.append(outpath) | |
ROOT.gDirectory.mkdir(folder, "", True) | |
ROOT.gDirectory.cd("/" + folder) | |
t.CloneTree().Write() | |
else: | |
print( | |
"No TTree was created for", | |
cut, | |
var, | |
sampleName, | |
index, | |
"even if requested", | |
file=sys.stderr, | |
) | |
f.cd("/") | |
for cut_cat in list(self.results.keys()): | |
_cut_cat = f.mkdir(cut_cat) | |
for var in list(self.results[cut_cat].keys()): | |
if "tree" in self.variables[var].keys(): | |
continue | |
_var = "" | |
if var in list(self.remappedVariables.keys()): | |
# remove the __ at the beginning | |
_var = var[len(self.remappedVariables[var]) :] | |
else: | |
_var = var | |
_cut_cat.mkdir(_var) | |
f.cd("/" + cut_cat + "/" + _var) | |
for sampleName in list(self.results[cut_cat][var].keys()): | |
# should first merge histos | |
mergedHistos = {} | |
for index in list(self.results[cut_cat][var][sampleName].keys()): | |
for hname in list( | |
self.results[cut_cat][var][sampleName][index].keys() | |
): | |
if hname not in mergedHistos.keys(): | |
mergedHistos[hname] = self.results[cut_cat][var][ | |
sampleName | |
][index][hname].Clone() | |
else: | |
mergedHistos[hname].Add( | |
self.results[cut_cat][var][sampleName][index][hname] | |
) | |
for hname in mergedHistos.keys(): | |
if hname == "nominal": | |
_string = "histo_" + sampleName | |
mergedHistos[hname].SetName(_string) | |
mergedHistos[hname].SetTitle(_string) | |
else: | |
_string = "histo_" + sampleName + "_" + hname | |
mergedHistos[hname].SetName(_string) | |
mergedHistos[hname].SetTitle(_string) | |
mergedHistos[hname].Write() | |
f.Close() | |
# clean snapshot files | |
proc = "" | |
for _file in toBeDeleted: | |
proc += f" rm {_file}; " | |
import subprocess | |
_proc = subprocess.Popen(proc, shell=True) | |
_proc.wait() | |
# print(proc) | |
def mergeSaveResults(self): | |
f = ROOT.TFile(self.outputFileMap, "recreate") | |
for cut_cat in list(self.results.keys()): | |
_cut_cat = f.mkdir(cut_cat) | |
for var in list(self.results[cut_cat].keys()): | |
if "tree" in self.variables[var].keys(): | |
# no need to process SnapShots | |
continue | |
_cut_cat.mkdir(var) | |
f.cd("/" + cut_cat + "/" + var) | |
for sampleName in list(self.results[cut_cat][var].keys()): | |
# should first merge histos | |
mergedHistos = {} | |
for index in list(self.results[cut_cat][var][sampleName].keys()): | |
for hname in list( | |
self.results[cut_cat][var][sampleName][index].keys() | |
): | |
if hname not in mergedHistos.keys(): | |
mergedHistos[hname] = self.results[cut_cat][var][ | |
sampleName | |
][index][hname].Clone() | |
else: | |
mergedHistos[hname].Add( | |
self.results[cut_cat][var][sampleName][index][hname] | |
) | |
for hname in mergedHistos.keys(): | |
if hname == "nominal": | |
mergedHistos[hname].SetName("histo_" + sampleName) | |
else: | |
mergedHistos[hname].SetName( | |
"histo_" + sampleName + "_" + hname | |
) | |
mergedHistos[hname].Write() | |
f.Close() | |
def mergeAndSaveResults(self): | |
f = ROOT.TFile(self.outputFileMap, "recreate") | |
for cut_cat in list(self.results.keys()): | |
_cut_cat = f.mkdir(cut_cat) | |
for var in list(self.results[cut_cat].keys()): | |
if "tree" in self.variables[var].keys(): | |
# no need to process SnapShots | |
continue | |
_cut_cat.mkdir(var) | |
f.cd("/" + cut_cat + "/" + var) | |
for sampleName in list(self.results[cut_cat][var].keys()): | |
# should first merge histos | |
mergedHistos = {} | |
for index in list(self.results[cut_cat][var][sampleName].keys()): | |
for hname in list( | |
self.results[cut_cat][var][sampleName][index].keys() | |
): | |
if hname not in mergedHistos.keys(): | |
mergedHistos[hname] = self.results[cut_cat][var][ | |
sampleName | |
][index][hname].Clone() | |
else: | |
mergedHistos[hname].Add( | |
self.results[cut_cat][var][sampleName][index][hname] | |
) | |
for hname in mergedHistos.keys(): | |
if hname == "nominal": | |
mergedHistos[hname].SetName("histo_" + sampleName) | |
else: | |
mergedHistos[hname].SetName( | |
"histo_" + sampleName + "_" + hname | |
) | |
mergedHistos[hname].Write() | |
f.Close() | |
def run(self): | |
""" | |
Runs the analysis: | |
1. load the aliases without the ``afterNuis`` option | |
2. load the ``suffix`` systematics | |
3. load the alias ``weight`` | |
4. load the reweight systematics (they need the ``weight`` to be defined) | |
5. finally load the suffix systematics with the ``afterNuis`` option | |
After this important procedure it filters with ``preselection`` the many ``dfs``, loads systematics | |
loads ``variables``, creates the results dict, splits the samples, creates the cuts/var histos, runs the dataframes | |
and saves results. | |
""" | |
# load all aliases needed before nuisances of type suffix | |
self.loadAliases() | |
self.loadSystematicsSuffix() | |
# load alias weight needed before nuisances of type weight | |
self.loadAliasWeight() | |
self.loadSystematicsReweights() | |
self.loadSystematicsReweightsEnvelopeRMS() | |
# load all aliases remaining | |
self.loadAliases(True) | |
# apply preselections | |
for sampleName in self.dfs.keys(): | |
for index in self.dfs[sampleName].keys(): | |
self.dfs[sampleName][index]["df"] = self.dfs[sampleName][index][ | |
"df" | |
# ].Filter("(" + self.preselections + ") && abs(weight) > 0.0") | |
].Filter("(" + self.preselections + ")") | |
self.loadVariables() | |
self.loadBranches() | |
print("loaded all variables") | |
self.createResults() | |
print("created empty results dict") | |
#print(self.cuts) | |
#print(self.results) | |
self.splitSubsamples() | |
print("splitted samples") | |
self.create_cuts_vars() | |
print("created cuts") | |
""" | |
# FIXME RunGraphs can't handle results of VaraitionsFor, ask Vincenzo about it | |
# collect all the dataframes and run them | |
dfs = [] | |
for cut in self.cuts.keys(): | |
for var in self.variables.keys(): | |
for sampleName in self.dfs.keys(): | |
for index in self.dfs[sampleName].keys(): | |
# dfs.append(self.results[cut][var][sampleName][index]) | |
dfs.extend( | |
list(self.results[cut][var][sampleName][index].values())) | |
ROOT.RDF.RunGraphs(dfs) | |
""" | |
counts = [] | |
# register number of events in each df | |
for sampleName in self.dfs.keys(): | |
for index in self.dfs[sampleName].keys(): | |
counts.append(self.dfs[sampleName][index]["df"].Count()) | |
snapshots = [] | |
for cut in self.cuts.keys(): | |
for var in self.variables.keys(): | |
if ( | |
len(self.results[cut].get(var, [])) == 0 | |
or "tree" not in self.variables[var].keys() | |
): | |
# no snapshots for this combination of cut variable | |
continue | |
for sampleName in self.dfs.keys(): | |
for index in self.dfs[sampleName].keys(): | |
# dfs.append(self.results[cut][var][sampleName][index]) | |
snapshots.append(self.results[cut][var][sampleName][index]) | |
if len(snapshots) != 0: | |
ROOT.RDF.RunGraphs(snapshots) | |
for count in counts: | |
print("Number of events passing preselections:", count.GetValue()) | |
#print(self.results) | |
self.convertResults() | |
self.saveResults() | |
if __name__ == "__main__":
    # declare the C++ helpers used by the RDataFrame expressions
    ROOT.gInterpreter.Declare('#include "headers.hh"')
    # script.py is expected to define samples, aliases, variables, cuts,
    # nuisances and lumi. NOTE(review): exec of an external file runs
    # arbitrary code — the configuration must come from a trusted source.
    # FIX: use a context manager so the file handle is closed (the original
    # exec(open(...).read()) leaked it).
    with open("script.py") as _cfg:
        exec(_cfg.read())
    runner = RunAnalysis(samples, aliases, variables, cuts, nuisances, lumi)
    runner.run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment