@giorgiopizz
Created January 29, 2024 09:45
New runner.py to unroll and fold in mkShapesRDF
from copy import deepcopy
import sys
import ROOT
from array import array
from mkShapesRDF.lib.parse_cpp import ParseCpp
import os
from math import sqrt
ROOT.gROOT.SetBatch(True)
ROOT.TH1.SetDefaultSumw2(True)
import numpy as np
def hist2array(h, flow=False, copy=True, include_sumw2=True):
    nx = h.GetNbinsX() + 2  # number of bins including under/overflow
    if isinstance(h, ROOT.TH1D):
        dtype = np.float64
    elif isinstance(h, ROOT.TH1F):
        dtype = np.float32
    elif isinstance(h, ROOT.TH1I):
        dtype = np.int32
    else:
        print("Histogram of type", h, "is not supported", file=sys.stderr)
        sys.exit(1)
    vals = np.ndarray((nx,), dtype=dtype, buffer=h.GetArray())
    # the Sumw2 array is a TArrayD, i.e. always doubles, regardless of the
    # histogram type
    sumw2 = np.ndarray((nx,), dtype=np.float64, buffer=h.GetSumw2().GetArray())
    # flow=True keeps the under/overflow bins, flow=False strips them
    shift = 0 if flow else 1
    vals = vals[shift: nx - shift]
    sumw2 = sumw2[shift: nx - shift]
    if copy:
        vals = vals.copy()
        sumw2 = sumw2.copy()
    if include_sumw2:
        return vals, sumw2
    return vals
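# Hedged usage sketch for hist2array on a throwaway TH1D (not part of the
# mkShapesRDF workflow; wrapped in a function so nothing runs on import):
def _demo_hist2array():
    h = ROOT.TH1D("demo_h", "demo_h", 10, 0.0, 1.0)
    h.FillRandom("gaus", 1000)
    vals, sumw2 = hist2array(h, flow=False)  # flow=False strips under/overflow
    assert len(vals) == 10
    print("integral:", vals.sum())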
def _fold(_h, _to, _from):
_h.SetBinContent(_to, _h.GetBinContent(_to) + _h.GetBinContent(_from))
sumw2 = _h.GetSumw2()
#_h.SetBinError(_to, sqrt(_h.GetBinError(_to) **2 + _h.GetBinError(_from)**2))
sumw2[_to] = sumw2[_to] + sumw2[_from]
_h.SetBinContent(_from, 0.0)
sumw2[_from] = 0.0
#_h.SetBinError(_from, 0.0)
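# Hedged sketch of what _fold does: it moves one bin's content and sumw2 into
# another bin, here folding the underflow bin 0 into the first in-range bin.
# Purely illustrative (wrapped in a function so nothing runs on import):
def _demo_fold():
    h = ROOT.TH1D("demo_fold", "demo_fold", 5, 0.0, 1.0)
    h.Fill(-1.0)  # lands in the underflow bin (bin 0)
    h.Fill(0.1)   # lands in bin 1
    _fold(h, 1, 0)  # fold the underflow into bin 1
    assert h.GetBinContent(0) == 0.0
    assert h.GetBinContent(1) == 2.0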
class RunAnalysis:
r"""Class athat craeates ``dfs`` and runs the analysiss"""
@staticmethod
def splitSamples(samples, useFilesPerJob=True):
r"""static methods, takes a dictionary of samples and split them based on their weights and max num. of files
Parameters
----------
samples : dict
dictionary of samples
useFilesPerJob : bool, optional, default: True
if you want to further split the samples based on max num. of files.
Returns
-------
`list of tuple`
each tuple will have a lenght of 5 (6 if subsamples are present), where the first element is the name of the sample, the second the list of files, the third the weight, and the fourth the index of this tuple compared to the other tuples of the same sample type, the fifth will be the isData flag (True if the sample is data, False otherwise). If subsamples are present, the sixth element will be the dict of subsamples
"""
        # will contain all the different samples split based on their weights and max num. of files
splittedSamples = []
for sampleName in list(samples.keys()):
types = (
{}
            )  # this will contain the different types of files with their special weights for this sampleName
# recall that samples[sampleName]['name'] contain a list of tuples, where a single tuple can have a size of 2 or 3:
# first position for name of the process (DYJetsToLL_M-50), second list of files, third a possible special weight
for filesType in samples[sampleName]["name"]:
# no special weight (will use '1'), and at least one file found
if len(filesType) == 2 and len(filesType[1]) > 0:
if "1" not in list(types.keys()):
types["1"] = []
types["1"].append(filesType + ("1.0",))
elif len(filesType) == 3 and len(filesType[1]) > 0:
if filesType[2] not in list(types.keys()):
types[filesType[2]] = []
types[filesType[2]].append(filesType)
else:
print("Error", sampleName, filesType, file=sys.stderr)
print(
"Either the sample proc you specified has no files, or the weight had problems",
file=sys.stderr,
)
sys.exit()
i = 0
for _type in list(types.keys()):
# split files based on FilesPerJob or include them all
__files = list(map(lambda k: k[1], types[_type]))
                # flatten the list of files
__files = [item for sublist in __files for item in sublist]
if useFilesPerJob:
dim = samples[sampleName].get("FilesPerJob", len(__files))
else:
dim = len(__files)
__files = [__files[j : j + dim] for j in range(0, len(__files), dim)]
for ___files in __files:
                    # the weight for these files will be the product of the weight inside this sample (i.e. samples[sampleName]['weight'])
                    # and the special weight common to all of these files (i.e. types[_type][0][2])
                    # the common special weight can be retrieved from the first of the list of files with this weight
# remember that the tuple has always size 3 now, the last position is for the special weight
weight = (
"( "
+ samples[sampleName]["weight"]
+ " ) * ( "
+ types[_type][0][2]
+ " )"
)
isData = samples[sampleName].get("isData", False)
sampleType = (sampleName, ___files, weight, i, isData)
if "subsamples" in list(samples[sampleName].keys()):
sampleType += (samples[sampleName]["subsamples"],)
splittedSamples.append(sampleType)
i += 1
return splittedSamples
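    # Hedged example of the expected input and output of splitSamples, with
    # made-up sample names and file paths (not executed):
    #
    #   samples = {
    #       "DY": {
    #           "name": [("DYJetsToLL_M-50", ["f1.root", "f2.root"], "0.5")],
    #           "weight": "XSWeight*puWeight",
    #           "FilesPerJob": 1,
    #       }
    #   }
    #   RunAnalysis.splitSamples(samples) ->
    #   [("DY", ["f1.root"], "( XSWeight*puWeight ) * ( 0.5 )", 0, False),
    #    ("DY", ["f2.root"], "( XSWeight*puWeight ) * ( 0.5 )", 1, False)]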
@staticmethod
def getTTreeNomAndFriends(fnom, friends):
"""Create a TChain with the nominal files and the friends files (nuisances TTrees with varied branches)
Args:
fnom (list): list of nominal files
friends (list of list): list of list of friends files
Returns:
TChain: TChain with nominal files and friends files
"""
tnom = ROOT.TChain("Events")
for f in fnom:
tnom.Add(f)
for friend in friends:
_tfriend = ROOT.TChain("Events")
for f in friend:
_tfriend.Add(f)
tnom.AddFriend(_tfriend)
return tnom
@staticmethod
def getNuisanceFiles(nuisance, files):
"""Searches in the provided nuisance folder for the files with the same name of the nominal files
Args:
nuisance (dict): dict with the nuisance information
files (list): list of nominal files
Returns:
list of list: list with the down and up varied list of files
"""
_files = list(map(lambda k: k.split("/")[-1], files))
nuisanceFilesDown = list(
map(lambda k: nuisance["folderDown"] + "/" + k, _files)
)
nuisanceFilesUp = list(map(lambda k: nuisance["folderUp"] + "/" + k, _files))
for nuisFile in nuisanceFilesUp + nuisanceFilesDown:
# print(nuisFile)
if not os.path.exists(nuisFile):
print("File", nuisFile, "does not exist", file=sys.stderr)
sys.exit(1)
return [nuisanceFilesDown, nuisanceFilesUp]
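    # Hedged illustration with made-up paths: for
    #   nuisance = {"folderDown": "/data/JESdo", "folderUp": "/data/JESup"}
    #   files = ["/data/nominal/sample.root"]
    # this returns
    #   [["/data/JESdo/sample.root"], ["/data/JESup/sample.root"]]
    # provided both varied files exist on disk.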
    @staticmethod
    def index_sub(string, sub):
        """Return the index of ``sub`` in ``string``, or the sentinel -10000 if it is not found"""
        try:
            return string.index(sub)
        except ValueError:
            return -10000
def __init__(
self,
samples,
aliases,
variables,
cuts,
nuisances,
lumi,
limit=-1,
outputFileMap="output.root",
):
r"""
Stores arguments in the class attributes and creates all the RDataFrame objects
Parameters
----------
samples : `list of tuple`
same type as the return of the splitSamples method
aliases : dict
dict of aliases
variables : dict
dict of variables
cuts : dict
dict of cuts, contains two keys (preselections: str, cuts: dict)
nuisances : dict
dict of nuisances
lumi : float
lumi in fb-1
limit : int, optional, default: -1
limit of events to be processed
        outputFileMap : str, optional, default: 'output.root'
full path + filename of the output root file.
Returns
-------
None
"""
self.samples = samples
self.aliases = aliases
self.variables = variables
self.preselections = cuts["preselections"]
mergedCuts = {}
for cut in list(cuts["cuts"].keys()):
__cutExpr = ""
if type(cuts["cuts"][cut]) == dict:
__cutExpr = cuts["cuts"][cut]["expr"]
for cat in list(cuts["cuts"][cut]["categories"].keys()):
mergedCuts[cut + "_" + cat] = {"parent": cut}
mergedCuts[cut + "_" + cat]["expr"] = (
__cutExpr + " && " + cuts["cuts"][cut]["categories"][cat]
)
elif type(cuts["cuts"][cut]) == str:
__cutExpr = cuts["cuts"][cut]
mergedCuts[cut] = {"expr": __cutExpr, "parent": cut}
self.cuts = mergedCuts
self.nuisances = nuisances
self.lumi = lumi
self.limit = limit
self.outputFileMap = outputFileMap
self.remappedVariables = {}
self.dfs = {}
r"""
dfs is a dictionary containing as keys the sampleNames. The structure should look like this:
.. code-block:: python
dfs = {
'DY':{
0: {
'df': obj,
'columnNames': [...],
'usedVariables': [...],
'ttree': obj2, # needed otherwise seg fault in root
},
}
}
"""
usedVariables = set()
usedVariables = usedVariables.union(
ParseCpp.listOfVariables(ParseCpp.parse(self.preselections))
)
for cut in mergedCuts.keys():
usedVariables = usedVariables.union(
ParseCpp.listOfVariables(ParseCpp.parse(mergedCuts[cut]["expr"]))
)
for var in variables.keys():
if "name" in variables[var].keys():
__var = variables[var]["name"].split(":")
__vars = list(
map(lambda k: ParseCpp.listOfVariables(ParseCpp.parse(k)), __var)
)
for __v in __vars:
usedVariables = usedVariables.union(__v)
elif "tree" in variables[var].keys():
for branch in variables[var]["tree"].keys():
usedVariables = usedVariables.union(
ParseCpp.listOfVariables(
ParseCpp.parse(variables[var]["tree"][branch])
)
)
else:
print("Cannot process variable ", var, " nuisances might be faulty")
# sample here is a tuple, first el is the sampleName, second list of files,
# third the special weight, forth is the index of tuple for the same sample,
# fifth if present the dict of subsamples
for sample in samples:
files = sample[1]
sampleName = sample[0]
friendsFiles = []
usedFolders = []
for nuisance in self.nuisances.values():
if sampleName not in nuisance.get("samples", {sampleName: []}):
continue
if nuisance.get("type", "") == "shape":
if nuisance.get("kind", "") == "suffix":
if nuisance.get("folderUp", "") != "":
if nuisance["folderUp"] in usedFolders:
continue
usedFolders.append(nuisance["folderUp"])
friendsFiles += RunAnalysis.getNuisanceFiles(
nuisance, files
)
tnom = RunAnalysis.getTTreeNomAndFriends(files, friendsFiles)
if limit != -1:
df = ROOT.RDataFrame(tnom)
df = df.Range(limit)
else:
#ROOT.EnableImplicitMT()
df = ROOT.RDataFrame(tnom)
if sampleName not in self.dfs.keys():
                self.dfs[sampleName] = {}
self.dfs[sampleName][sample[3]] = {
"df": df,
"ttree": tnom,
"usedVariables": list(usedVariables),
}
self.dfs[sampleName][sample[3]]["columnNames"] = list(
map(lambda k: str(k), df.GetColumnNames())
)
self.definedAliases = {}
print("\n\nLoaded dataframes\n\n")
def loadAliases(self, afterNuis=False):
"""
Load aliases in the dataframes. It does not create the special alias ``weight`` for which a special method is used.
Parameters
----------
afterNuis : bool, optional, default: False
if True, only aliases with the key ``afterNuis`` set to True will be loaded
"""
aliases = self.aliases
for sampleName in self.dfs.keys():
for index in self.dfs[sampleName].keys():
df = self.dfs[sampleName][index]["df"]
define_string = "df"
usedVariables = set(self.dfs[sampleName][index]["usedVariables"])
for alias in list(aliases.keys()):
                    # skip aliases that belong to the other phase
                    # (before vs. after nuisances), but still collect the
                    # variables they use
if (afterNuis) != (aliases[alias].get("afterNuis", False)):
if "expr" in aliases[alias]:
usedVariables = usedVariables.union(
ParseCpp.listOfVariables(
ParseCpp.parse(aliases[alias]["expr"])
)
)
continue
if "samples" in list(aliases[alias]):
if sampleName not in aliases[alias]["samples"]:
continue
if alias in self.dfs[sampleName][index]["columnNames"]:
if alias != aliases[alias].get("expr", ""):
print(
f"Alias {alias} cannot be defined, column with that name already exists"
)
sys.exit()
else:
# since the alias key is equal to expr there's no need to keep the alias
del self.aliases[alias]
continue
for line in aliases[alias].get("linesToAdd", []):
# RPLME_nThreads is used to set the number of threads
ROOT.gInterpreter.Declare(
line.replace("RPLME_nThreads", str(df.GetNSlots()))
)
for line in aliases[alias].get("linesToProcess", []):
exec(line, globals())
if "expr" in list(aliases[alias].keys()):
define_string += (
f".Define('{alias}', '{aliases[alias]['expr']}') \\\n\t"
)
usedVariables = usedVariables.union(
ParseCpp.listOfVariables(
ParseCpp.parse(aliases[alias]["expr"])
)
)
elif "exprSlot" in list(aliases[alias].keys()):
args = aliases[alias]["exprSlot"]
args[0] = args[0].replace("RPLME_nThreads", str(df.GetNSlots()))
define_string += (
f".DefineSlot('{alias}', '{args[0]}', [ {args[1]} ]) \\\n\t"
)
usedVariables = usedVariables.union(
ParseCpp.listOfVariables(
ParseCpp.parse(aliases[alias]["exprSlot"][1])
)
)
elif "class" in list(aliases[alias].keys()):
define_string += f".Define('{alias}', '{aliases[alias]['class']} ( {aliases[alias].get('args', '')} )') \\\n\t"
df1 = eval(define_string)
self.dfs[sampleName][index]["df"] = df1
stringPrint = "before nuisances"
if afterNuis:
stringPrint = "after nuisances"
print(
f"\n\nLoaded all aliases {stringPrint} for {sampleName} index {index}\n\n"
)
self.dfs[sampleName][index]["usedVariables"] = list(usedVariables)
self.dfs[sampleName][index]["columnNames"] = list(
map(lambda k: str(k), df1.GetColumnNames())
)
def loadAliasWeight(self):
"""
Loads only the special alias ``weight`` in the dataframes.
"""
samples = self.samples
aliases = self.aliases
for sampleName in self.dfs.keys():
for index in self.dfs[sampleName].keys():
df = self.dfs[sampleName][index]["df"] # noqa: F841
                # Define the most important alias: the weight!
sample = list(
filter(lambda k: k[0] == sampleName and k[3] == index, samples)
)[0]
weight = sample[2]
isData = sample[4]
if not isData:
aliases["weight"] = {"expr": str(self.lumi) + " * " + weight}
else:
aliases["weight"] = {"expr": weight}
print("\n\n", sampleName, "\n\n", aliases["weight"])
# load the alias weight
define_string = "df"
alias = "weight"
if alias in self.dfs[sampleName][index]["columnNames"]:
print(
f"Alias {alias} cannot be defined, column with that name already exists"
)
sys.exit()
define_string += (
f".Define('{alias}', '{aliases[alias]['expr']}') \\\n\t"
)
df1 = eval(define_string)
self.dfs[sampleName][index]["df"] = df1
print(f"\n\nLoaded alias weight for {sampleName} index {index}\n\n")
self.dfs[sampleName][index]["columnNames"].append("weight")
def loadSystematicsSuffix(self):
"""
Loads systematics of type ``suffix`` in the dataframes.
"""
for sampleName in self.dfs.keys():
for index in self.dfs[sampleName].keys():
df = self.dfs[sampleName][index]["df"]
columnNames = self.dfs[sampleName][index]["columnNames"]
nuisances = self.nuisances
# nuisance key is not used
for _, nuisance in list(nuisances.items()):
if sampleName not in nuisance.get("samples", {sampleName: []}):
continue
if nuisance.get("type", "") == "shape":
if nuisance.get("kind", "") == "suffix":
variation = nuisance["mapDown"]
separator = nuisance.get("separator", "_")
variedCols = list(
filter(lambda k: k.endswith(variation), columnNames)
)
if len(variedCols) == 0:
print(f"No varied columns for {variation}")
sys.exit()
# if 'JER' in nuisance['name']:
# print(variedCols)
baseCols = list(
map(
lambda k: k[
RunAnalysis.index_sub(k, "Events.")
+ len("Events.") : -len(separator + variation)
]
if "Events" in k
else k[: -len(separator + variation)],
variedCols,
)
)
for baseCol in baseCols:
if (
baseCol
not in self.dfs[sampleName][index]["usedVariables"]
):
# baseCol is never used -> useless to register variation
print("unused variable", baseCol)
continue
                                if baseCol not in [str(k) for k in df.GetColumnNames()]:
                                    continue
if "bool" not in str(df.GetColumnType(baseCol)).lower():
varNameDown = (
baseCol
+ separator
+ nuisance["mapDown"]
+ "*"
+ nuisance["samples"][sampleName][1]
)
varNameUp = (
baseCol
+ separator
+ nuisance["mapUp"]
+ "*"
+ nuisance["samples"][sampleName][0]
)
else:
varNameDown = (
baseCol + separator + nuisance["mapDown"]
)
varNameUp = baseCol + separator + nuisance["mapUp"]
_type = df.GetColumnType(baseCol)
expr = (
ParseCpp.RVecExpression(_type)
+ "{"
+ f"{varNameDown}, {varNameUp}"
+ "}"
)
if "JER" in nuisance["name"]:
print("JER expr", expr)
df = df.Vary(
baseCol,
expr,
variationTags=["Down", "Up"],
variationName=nuisance["name"],
)
else:
continue
# else:
# print("Unsupported nuisance")
# sys.exit()
self.dfs[sampleName][index]["df"] = df
def loadSystematicsReweights(self):
"""
        Loads systematics of kind ``weight`` (reweights) in the dataframes.
"""
for sampleName in self.dfs.keys():
for index in self.dfs[sampleName].keys():
df = self.dfs[sampleName][index]["df"]
columnNames = self.dfs[sampleName][index]["columnNames"]
nuisances = self.nuisances
# nuisance key is not used
for _, nuisance in list(nuisances.items()):
if sampleName not in nuisance.get("samples", {sampleName: []}):
continue
if nuisance.get("type", "") == "shape":
if nuisance.get("kind", "") == "weight":
weights = nuisance["samples"].get(sampleName, None)
if weights is not None:
variedNames = []
if weights[0] not in columnNames:
df = df.Define(nuisance["name"] + "_up", weights[0])
variedNames.append(nuisance["name"] + "_up")
else:
variedNames.append(weights[0])
if weights[1] not in columnNames:
df = df.Define(
nuisance["name"] + "_down", weights[1]
)
variedNames.append(nuisance["name"] + "_down")
else:
variedNames.append(weights[1])
if df.GetColumnType("weight") == "double":
expr = (
"ROOT::RVecD"
+ "{ weight * (double)"
+ f"{variedNames[1]}, weight * (double) {variedNames[0]}"
+ "}"
)
elif df.GetColumnType("weight") == "float":
expr = (
"ROOT::RVecF"
+ "{ weight * (float)"
+ f"{variedNames[1]}, weight * (float) {variedNames[0]}"
+ "}"
)
else:
print(
"Weight column has unknown type:",
df.GetColumnType("weight"),
"while varied is: ",
df.GetColumnType(variedNames[1]),
)
sys.exit()
df = df.Vary(
"weight",
expr,
variationTags=["Down", "Up"],
variationName=nuisance["name"],
)
else:
continue
# else:
# print("Unsupported nuisance")
# sys.exit()
self.dfs[sampleName][index]["df"] = df
def loadSystematicsReweightsEnvelopeRMS(self):
"""
        Loads systematics of kind ``weight_envelope``, ``weight_rms``, or ``weight_square`` in the dataframes.
"""
for sampleName in self.dfs.keys():
for index in self.dfs[sampleName].keys():
df = self.dfs[sampleName][index]["df"]
columnNames = self.dfs[sampleName][index]["columnNames"]
nuisances = self.nuisances
# nuisance key is not used
for _, nuisance in list(nuisances.items()):
if sampleName not in nuisance.get("samples", {sampleName: []}):
continue
if nuisance.get("type", "") == "shape":
if (
nuisance.get("kind", "") == "weight_envelope"
or nuisance.get("kind", "") == "weight_rms"
or nuisance.get("kind", "") == "weight_square"
):
weights = nuisance["samples"].get(sampleName, None)
if weights is not None:
variedNames = []
variedTags = []
for i, weight in enumerate(weights):
if weight not in columnNames:
nuisName = f'{nuisance["name"]}_{i}'
df = df.Define(nuisName, weight)
variedNames.append(nuisName)
variedTags.append(str(i))
else:
variedNames.append(weight)
variedTags.append(str(i))
if df.GetColumnType("weight") == "double":
expr = (
"ROOT::RVecD"
+ "{ "
+ ", ".join(
[
f"weight * (double) {variedName}"
for variedName in variedNames
]
)
+ "}"
)
elif df.GetColumnType("weight") == "float":
expr = (
"ROOT::RVecF"
+ "{ "
+ ", ".join(
[
f"weight * (float) {variedName}"
for variedName in variedNames
]
)
+ "}"
)
else:
print(
"Weight column has unknown type:",
df.GetColumnType("weight"),
"while varied is: ",
df.GetColumnType(variedNames[0]),
)
sys.exit()
df = df.Vary(
"weight",
expr,
variationTags=variedTags,
variationName=nuisance["name"]
+ "_SPECIAL_NUIS_"
+ nuisance["kind"].split("_")[-1],
)
else:
continue
self.dfs[sampleName][index]["df"] = df
def loadVariables(self):
"""
        Loads variables (except the ones with the 'tree' key in them),
        and checks whether their names are already among the dataframe columns;
        if so it prepends ``__`` to the name.
        Since variables are shared but aliases are not,
        a variable's name or expression may already be defined
        for one df but not for another ->
        a common, compatible set of variables must be determined for all the dfs.
        This is done by gathering the largest set of column names.
"""
bigColumnNames = set()
for sampleName in self.dfs.keys():
for index in self.dfs[sampleName].keys():
columnNames = self.dfs[sampleName][index]["columnNames"]
bigColumnNames = bigColumnNames.union(set(columnNames))
for sampleName in self.dfs.keys():
for index in self.dfs[sampleName].keys():
for var in list(self.variables.keys()):
if "tree" in self.variables[var].keys():
continue
if var in bigColumnNames and var != self.variables[var]["name"]:
# here I want to define a variable whose key is already in the column name list
# and its expression is different from its name
# therefore we will either create a Define or an Alias
# need to rename the new variable!
prefix = "__"
nVar = prefix + var
while nVar in bigColumnNames:
prefix += "__"
nVar = prefix + var
# print("changing variable", var, "to " + nVar)
self.remappedVariables[nVar] = prefix
self.variables[nVar] = deepcopy(self.variables[var])
del self.variables[var]
for sampleName in self.dfs.keys():
for index in self.dfs[sampleName].keys():
df = self.dfs[sampleName][index]["df"]
for var in list(self.variables.keys()):
if "tree" in self.variables[var].keys():
continue
for i, _var in enumerate(self.variables[var]["name"].split(":")):
n = var if i == 0 else var + f"_{i}"
if _var not in bigColumnNames:
# the variable expr does not exist, create it
df = df.Define(n, _var)
elif n not in bigColumnNames:
# the variable expr exists in the df, but not the variable key
# use alias
df = df.Alias(n, _var)
elif n == _var and n in bigColumnNames:
# since the variable name and expression are equal and are already present in the df don't do anything
pass
else:
# FIXME
print("Error, cannot define variable")
sys.exit()
self.dfs[sampleName][index]["df"] = df
self.dfs[sampleName][index]["columnNames"] = list(
map(lambda k: str(k), df.GetColumnNames())
)
def loadBranches(self):
"""
        Loads branches (the ones specified in a variable with the ``tree`` key),
        and checks whether they are already among the dataframe columns;
        if so it prepends ``__`` to the name.
        Since variables are shared but aliases are not,
        a branch's name or expression may already be defined
        for one df but not for another ->
        a common, compatible set of branches must be determined for all the dfs.
        This is done by gathering the largest set of column names.
"""
bigColumnNames = set()
for sampleName in self.dfs.keys():
for index in self.dfs[sampleName].keys():
columnNames = self.dfs[sampleName][index]["columnNames"]
bigColumnNames = bigColumnNames.union(set(columnNames))
for sampleName in self.dfs.keys():
for index in self.dfs[sampleName].keys():
for var in list(self.variables.keys()):
if "tree" not in self.variables[var].keys():
continue
for branch in list(self.variables[var]["tree"].keys()):
if (
branch in bigColumnNames
and branch != self.variables[var]["tree"][branch]
):
# here I want to define a variable whose key is already in the column name list
# and its expression is different from its name
# therefore we will either create a Define or an Alias
# need to rename the new variable!
prefix = "__"
nVar = prefix + branch
while nVar in bigColumnNames:
prefix += "__"
nVar = prefix + branch
# print("changing variable", branch, "to " + nVar)
self.remappedVariables[nVar] = prefix
self.variables[var]["tree"][nVar] = self.variables[var][
"tree"
][branch]
del self.variables[var]["tree"][branch]
for sampleName in self.dfs.keys():
for index in self.dfs[sampleName].keys():
df = self.dfs[sampleName][index]["df"]
for var in list(self.variables.keys()):
if "tree" not in self.variables[var].keys():
continue
for branch in self.variables[var]["tree"].keys():
# print("working on defining branch", branch)
if self.variables[var]["tree"][branch] not in bigColumnNames:
# the variable expr does not exist, create it
# print("define the branch")
df = df.Define(branch, self.variables[var]["tree"][branch])
elif branch not in bigColumnNames:
# the variable expr exists in the df, but not the variable key
# use alias
# print("define an alias to the branch")
df = df.Alias(branch, self.variables[var]["tree"][branch])
elif (
branch == self.variables[var]["tree"][branch]
and branch in bigColumnNames
):
# since the variable name and expression are equal and are already present in the df don't do anything
pass
else:
# FIXME
print("Error, cannot define variable")
sys.exit()
self.dfs[sampleName][index]["df"] = df
self.dfs[sampleName][index]["columnNames"] = list(
map(lambda k: str(k), df.GetColumnNames())
)
def createResults(self):
"""
        Create an empty nested dictionary for results; it will store all the different histos
"""
self.results = {}
for cut in self.cuts.keys():
self.results[cut] = {}
for variable in self.variables.keys():
self.results[cut][variable] = {}
def splitSubsamples(self):
"""
Split samples into subsamples if needed
        After this method the ``dfs`` attribute will contain the subsample names instead of the original sample names
"""
sampleNames = set(
list(map(lambda k: k[0], list(filter(lambda k: len(k) == 6, self.samples))))
)
for sampleName in sampleNames:
_sample = list(filter(lambda k: k[0] == sampleName, self.samples))[0]
for subsample in list(_sample[5].keys()):
self.dfs[sampleName + "_" + subsample] = {}
for index in self.dfs[sampleName].keys():
self.dfs[sampleName + "_" + subsample][index] = {
"parent": sampleName
}
self.dfs[sampleName + "_" + subsample][index]["df"] = self.dfs[
sampleName
][index]["df"].Filter(_sample[5][subsample])
self.dfs[sampleName + "_" + subsample][index][
"columnNames"
] = self.dfs[sampleName][index]["columnNames"]
self.dfs[sampleName + "_" + subsample][index]["ttree"] = self.dfs[
sampleName
][index]["ttree"]
del self.dfs[sampleName]
def create_cuts_vars(self):
"""
        Defines a ``Histo1D`` (or ``Histo2D``) for each variable, cut, and dataframe. It also creates the dictionary of variations through ``VariationsFor()``
"""
mergedCuts = self.cuts
variables = self.variables
for sampleName in self.dfs.keys():
for index in self.dfs[sampleName].keys():
df = self.dfs[sampleName][index]["df"]
for cut in mergedCuts.keys():
df_cat = df.Filter(mergedCuts[cut]["expr"])
opts = ROOT.RDF.RSnapshotOptions()
opts.fLazy = True
for var in list(variables.keys()):
if "tree" in variables[var].keys():
if not mergedCuts[cut]["parent"] in variables[var]["cuts"]:
# del df_cat
continue
outpath = self.outputFileMap
if self.outputFileMap != "output.root":
outfileName = (
".".join(
self.outputFileMap.split("/")[-1].split(".")[
:-1
]
)
+ f"__ALL__{cut}_{sampleName}_{str(index)}.root"
)
outpath = (
"/".join(self.outputFileMap.split("/")[:-1])
+ "/"
+ outfileName
)
else:
outpath = f"output_{cut}_{sampleName}_{str(index)}.root"
# _h = df_cat.Snapshot(cut + '/' + sampleName, outpath, list(self.variables[var]['tree'].keys()), opts)
# FIXME always save the weight!
_h = df_cat.Snapshot(
"Events",
outpath,
list(self.variables[var]["tree"].keys()) + ["weight"],
opts,
)
else:
vs = variables[var]["name"].split(":")
histRange = []
if len(variables[var]["range"]) == len(vs):
                                # bin edges are provided by the user
for ax in variables[var]["range"]:
histRange.extend([len(ax) - 1, array("d", ax)])
else:
histRange = variables[var]["range"]
histRange = tuple(histRange)
if len(vs) == 1:
_h = df_cat.Histo1D(
(cut + "_" + var, "") + histRange, var, "weight"
)
elif len(vs) == 2:
varNames = []
for i, _var in enumerate(vs):
n = var if i == 0 else var + f"_{i}"
varNames.append(n)
_h = df_cat.Histo2D(
(cut + "_" + var, "") + histRange,
*varNames,
"weight",
)
else:
print("Unknown dimension of histo for variable", var)
sys.exit()
if sampleName not in self.results[cut][var].keys():
self.results[cut][var][sampleName] = {}
self.results[cut][var][sampleName][index] = _h
for cut in mergedCuts.keys():
for var in list(variables.keys()):
if "tree" not in variables[var].keys():
_s = self.results[cut][var][sampleName][index]
_s_var = ROOT.RDF.Experimental.VariationsFor(_s)
self.results[cut][var][sampleName][index] = _s_var
def convertResults(self):
"""
        Gathers the resulting histograms and folds under/overflow bins if requested;
        2D histograms are then unrolled into 1D. Systematic variations are saved as well.
"""
mergedCuts = self.cuts
variables = self.variables
for sampleName in self.dfs.keys():
for index in self.dfs[sampleName].keys():
for cut in mergedCuts.keys():
for var in list(variables.keys()):
if "tree" in variables[var].keys():
# no need to process SnapShots
continue
_s_var = self.results[cut][var][sampleName][index]
_histos = {}
# _histos_special_nuis = {}
for _variation in list(map(lambda k: str(k), _s_var.GetKeys())):
_h_name = _variation.replace(":", "")
                            _h = _s_var[_variation]
fold = variables[var].get("fold", 0)
if isinstance(_h, ROOT.TH1D):
if fold == 1 or fold == 3:
_from = 0
_to = 1
_fold(_h, _to, _from)
if fold == 2 or fold == 3:
lastBin = _h.GetNbinsX()
_from = lastBin + 1
_to = lastBin
_fold(_h, _to, _from)
elif isinstance(_h, ROOT.TH2D):
nx = _h.GetNbinsX()
ny = _h.GetNbinsY()
if fold == 1 or fold == 3:
for i in range(0, ny+2):
_from = 0
_to = 0
if i == 0:
_from = _h.GetBin(0, i)
_to = _h.GetBin(1, i+1)
elif i == ny+1:
_from = _h.GetBin(0, i)
_to = _h.GetBin(1, i-1)
else:
_from = _h.GetBin(0, i)
_to = _h.GetBin(1, i)
_fold(_h, _to, _from)
for i in range(0, nx+2):
_from = 0
_to = 0
if i == 0:
_from = _h.GetBin(i, 0)
_to = _h.GetBin(i+1, 1)
                                        elif i == nx+1:
_from = _h.GetBin(i, 0)
_to = _h.GetBin(i-1, 1)
else:
_from = _h.GetBin(i, 0)
_to = _h.GetBin(i, 1)
_fold(_h, _to, _from)
if fold == 2 or fold == 3:
for i in range(0, ny+2):
_from = 0
_to = 0
if i == 0:
_from = _h.GetBin(nx+1, i)
_to = _h.GetBin(nx , i+1)
elif i == ny+1:
_from = _h.GetBin(nx+1, i)
_to = _h.GetBin(nx , i-1)
else:
_from = _h.GetBin(nx+1, i)
_to = _h.GetBin(nx , i)
_fold(_h, _to, _from)
for i in range(0, nx+2):
_from = 0
_to = 0
                                        # fold the y-overflow row (i, ny+1) into the last in-range row (i, ny)
                                        if i == 0:
                                            _from = _h.GetBin(i, ny+1)
                                            _to = _h.GetBin(i+1, ny)
                                        elif i == nx+1:
                                            _from = _h.GetBin(i, ny+1)
                                            _to = _h.GetBin(i-1, ny)
                                        else:
                                            _from = _h.GetBin(i, ny+1)
                                            _to = _h.GetBin(i, ny)
_fold(_h, _to, _from)
                                # unroll the 2D histogram into a 1D one
_name = _h.GetName()
_h.SetName(_name + "_old")
new_name = _name + "_" + _h_name + "_" + sampleName + "_" + str(index)
# print('creating histo with name', new_name)
_h2 = ROOT.TH1D(new_name, _h.GetName(), nx*ny, 1, nx*ny)
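                                # row-major unroll: 2D bin (i, j) -> 1D bin
                                # (i-1)*ny + j, e.g. for nx=2, ny=3:
                                # (1,1)->1, (1,2)->2, (1,3)->3, (2,1)->4, ...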
for i in range(1, nx+1):
for j in range(1, ny+1):
new_ind = (i-1) * ny + j
old_ind = _h.GetBin(i,j)
_h2.SetBinContent(new_ind, _h.GetBinContent(old_ind))
_h2.SetBinError(new_ind, _h.GetBinError(old_ind))
del _h
_h = _h2
_histos[_h_name] = _h#.Clone()
self.results[cut][var][sampleName][index] = _histos
def saveResults(self):
"""
Save results in a root file.
        If ``Snapshot`` outputs were created, they are merged into the output file.
"""
f = ROOT.TFile(self.outputFileMap, "recreate")
toBeDeleted = []
openedFiles = []
openedTrees = []
for sampleName in self.dfs.keys():
for index in self.dfs[sampleName].keys():
for cut in self.cuts.keys():
for var in list(self.variables.keys()):
if "tree" in self.variables[var].keys():
if (
not self.cuts[cut]["parent"]
in self.variables[var]["cuts"]
):
continue
if self.outputFileMap != "output.root":
outfileName = (
".".join(
self.outputFileMap.split("/")[-1].split(".")[
:-1
]
)
+ f"__ALL__{cut}_{sampleName}_{str(index)}.root"
)
outpath = (
"/".join(self.outputFileMap.split("/")[:-1])
+ "/"
+ outfileName
)
else:
outpath = f"output_{cut}_{sampleName}_{str(index)}.root"
_f = ROOT.TFile(outpath)
openedFiles.append(_f)
if _f.GetListOfKeys().Contains("Events"):
t = _f.Get("Events")
openedTrees.append(t)
f.cd("/")
folder = f"trees/{cut}/{sampleName}/"
# print("Creating dir", folder, outpath)
toBeDeleted.append(outpath)
ROOT.gDirectory.mkdir(folder, "", True)
ROOT.gDirectory.cd("/" + folder)
t.CloneTree().Write()
else:
print(
"No TTree was created for",
cut,
var,
sampleName,
index,
"even if requested",
file=sys.stderr,
)
f.cd("/")
for cut_cat in list(self.results.keys()):
_cut_cat = f.mkdir(cut_cat)
for var in list(self.results[cut_cat].keys()):
if "tree" in self.variables[var].keys():
continue
_var = ""
if var in list(self.remappedVariables.keys()):
# remove the __ at the beginning
_var = var[len(self.remappedVariables[var]) :]
else:
_var = var
_cut_cat.mkdir(_var)
f.cd("/" + cut_cat + "/" + _var)
for sampleName in list(self.results[cut_cat][var].keys()):
# should first merge histos
mergedHistos = {}
for index in list(self.results[cut_cat][var][sampleName].keys()):
for hname in list(
self.results[cut_cat][var][sampleName][index].keys()
):
if hname not in mergedHistos.keys():
mergedHistos[hname] = self.results[cut_cat][var][
sampleName
][index][hname].Clone()
else:
mergedHistos[hname].Add(
self.results[cut_cat][var][sampleName][index][hname]
)
for hname in mergedHistos.keys():
if hname == "nominal":
_string = "histo_" + sampleName
mergedHistos[hname].SetName(_string)
mergedHistos[hname].SetTitle(_string)
else:
_string = "histo_" + sampleName + "_" + hname
mergedHistos[hname].SetName(_string)
mergedHistos[hname].SetTitle(_string)
mergedHistos[hname].Write()
f.Close()
        # clean up the per-job snapshot files that were merged above
        for _file in set(toBeDeleted):
            if os.path.exists(_file):
                os.remove(_file)
    def mergeSaveResults(self):
        """Alias of ``mergeAndSaveResults``."""
        return self.mergeAndSaveResults()
    def mergeAndSaveResults(self):
        """
        Merge the histograms of all indices of each sample and write them to the output root file (no ``Snapshot`` handling).
        """
        f = ROOT.TFile(self.outputFileMap, "recreate")
for cut_cat in list(self.results.keys()):
_cut_cat = f.mkdir(cut_cat)
for var in list(self.results[cut_cat].keys()):
if "tree" in self.variables[var].keys():
# no need to process SnapShots
continue
_cut_cat.mkdir(var)
f.cd("/" + cut_cat + "/" + var)
for sampleName in list(self.results[cut_cat][var].keys()):
# should first merge histos
mergedHistos = {}
for index in list(self.results[cut_cat][var][sampleName].keys()):
for hname in list(
self.results[cut_cat][var][sampleName][index].keys()
):
if hname not in mergedHistos.keys():
mergedHistos[hname] = self.results[cut_cat][var][
sampleName
][index][hname].Clone()
else:
mergedHistos[hname].Add(
self.results[cut_cat][var][sampleName][index][hname]
)
for hname in mergedHistos.keys():
if hname == "nominal":
mergedHistos[hname].SetName("histo_" + sampleName)
else:
mergedHistos[hname].SetName(
"histo_" + sampleName + "_" + hname
)
mergedHistos[hname].Write()
f.Close()
def run(self):
"""
        Runs the analysis:
        1. load the aliases without the ``afterNuis`` option
        2. load the ``suffix`` systematics
        3. load the alias ``weight``
        4. load the reweight systematics (they need ``weight`` to be defined)
        5. finally load the remaining aliases, the ones with the ``afterNuis`` option
        After this procedure it filters the many ``dfs`` with the ``preselections``,
        loads ``variables`` and branches, creates the results dict, splits the samples into subsamples,
        creates the cut/variable histos, runs the dataframes, and saves the results.
"""
# load all aliases needed before nuisances of type suffix
self.loadAliases()
self.loadSystematicsSuffix()
# load alias weight needed before nuisances of type weight
self.loadAliasWeight()
self.loadSystematicsReweights()
self.loadSystematicsReweightsEnvelopeRMS()
# load all aliases remaining
self.loadAliases(True)
# apply preselections
for sampleName in self.dfs.keys():
for index in self.dfs[sampleName].keys():
self.dfs[sampleName][index]["df"] = self.dfs[sampleName][index][
"df"
# ].Filter("(" + self.preselections + ") && abs(weight) > 0.0")
].Filter("(" + self.preselections + ")")
self.loadVariables()
self.loadBranches()
print("loaded all variables")
self.createResults()
print("created empty results dict")
#print(self.cuts)
#print(self.results)
self.splitSubsamples()
print("splitted samples")
self.create_cuts_vars()
print("created cuts")
"""
        # FIXME RunGraphs can't handle results of VariationsFor, ask Vincenzo about it
# collect all the dataframes and run them
dfs = []
for cut in self.cuts.keys():
for var in self.variables.keys():
for sampleName in self.dfs.keys():
for index in self.dfs[sampleName].keys():
# dfs.append(self.results[cut][var][sampleName][index])
dfs.extend(
list(self.results[cut][var][sampleName][index].values()))
ROOT.RDF.RunGraphs(dfs)
"""
counts = []
# register number of events in each df
for sampleName in self.dfs.keys():
for index in self.dfs[sampleName].keys():
counts.append(self.dfs[sampleName][index]["df"].Count())
snapshots = []
for cut in self.cuts.keys():
for var in self.variables.keys():
if (
len(self.results[cut].get(var, [])) == 0
or "tree" not in self.variables[var].keys()
):
                    # no snapshots for this combination of cut and variable
continue
for sampleName in self.dfs.keys():
for index in self.dfs[sampleName].keys():
# dfs.append(self.results[cut][var][sampleName][index])
snapshots.append(self.results[cut][var][sampleName][index])
if len(snapshots) != 0:
ROOT.RDF.RunGraphs(snapshots)
for count in counts:
print("Number of events passing preselections:", count.GetValue())
#print(self.results)
self.convertResults()
self.saveResults()
if __name__ == "__main__":
ROOT.gInterpreter.Declare('#include "headers.hh"')
exec(open("script.py").read())
runner = RunAnalysis(samples, aliases, variables, cuts, nuisances, lumi)
runner.run()