aminnj/heputils.py

## heputils.py
import time
import numpy as np
import pandas as pd
import uproot
from yahist import Hist1D, Hist2D
from pandas.core.base import PandasObject
from cachetools import cached, LRUCache
from cachetools.keys import hashkey

def myhasher(*args, **kwargs):
    # pd.DataFrame disabled __hash__ because they are mutable
    hbins = kwargs.pop("bins", 0)
    if "ndarray" in str(type(hbins)):
        hbins = hash(tuple(hbins.tolist()))
    if type(hbins) in [list, tuple] and len(hbins) == 2:
        hbins = hash(map(tuple,hbins))
    return hashkey(args[0]._data, args[1:], hbins, **kwargs)
@cached(cache=LRUCache(maxsize=256), key=myhasher)
def tree_draw(df, varexp, sel="", **kwargs):
    from tokenize import tokenize, NAME, OP
    from io import BytesIO

    tokens = tokenize(BytesIO(f"{varexp}${sel}".encode("utf-8")).readline)
    nops = 0
    for x in tokens:
        toknum, tokval = x[:2]
        nops += (toknum == NAME) and (tokval in ["and", "or"])
        nops += toknum == OP
    fast = nops < 5
    twodim = ":" in varexp
    if fast and not twodim:
        raw = hacky_query_eval(df, varexp, sel)
        # performance hit for float16's. copy to float32 first.
        if raw.dtype in ["float16"]:
            vals = raw.astype("float32")
        else:
            vals = raw
        if "weights" in kwargs:
            raise Exception("`weights` kwarg not supported for `fast=True` (yet?)")
        return Hist1D(vals, **kwargs)
    if not sel:
        mask = slice(None)
    else:
        mask = df.eval(sel)
    if twodim:
        assert(varexp.count(":") == 1)
        varexp1, varexp2 = varexp.split(":")
        vals = np.c_[df[mask].eval(varexp1), df[mask].eval(varexp2)]
        return Hist2D(vals, **kwargs)
    else:
        vals = df[mask].eval(varexp)
        return Hist1D(vals, **kwargs)


PandasObject.draw = tree_draw


def read_root(filename, treename="t", columns=None, progress=False, **kwargs):
    """
    Read ROOT file containing one TTree into pandas DataFrame.
    Thin wrapper around `uproot.iterate`.

    filename: filename(s)/file pattern(s)
    treename: name of input TTree
    progress: show tqdm progress bar?
    columns: list of columns ("branches") to read (default of `None` reads all)
    **kwargs: extra kwargs to pass to `uproot.iterate`

    Passing `entrysteps=[(0, 10)]` will read the first 10 rows, for example.
    """
    # entrysteps of None iterates by basket to match `dataframe_to_ttree`
    iterable = uproot.iterate(
        filename,
        treename,
        columns,
        entrysteps=kwargs.pop("entrysteps", None),
        outputtype=dict,
        namedecode="ascii",
        **kwargs,
    )
    if progress:
        from tqdm.auto import tqdm

        iterable = tqdm(iterable)
    f = uproot.open(filename)
    categorical_columns = [k.decode().split("_",1)[1].rsplit(";",1)[0] for k in f.keys() if k.startswith(b"categories_")]
    def to_df(chunk):
        for column in list(chunk.keys()):
            vals = chunk[column]
            if (vals.dtype == "object") and (column+"_strn" in chunk.keys()):
                del chunk[column+"_strn"]
                import awkward
                chunk[column] = awkward.StringArray.fromjagged(vals.astype("uint8"))
            elif column in categorical_columns:
                sep = "<!SEP!>"
                categories = np.array(f[f"categories_{column}"].decode().split(sep))
                chunk[column] = categories[vals]
        return pd.DataFrame(chunk)
    df = pd.concat(map(to_df, iterable), ignore_index=True, sort=True)
    return df


setattr(pd, "read_root", read_root)


def to_root(
    df, filename, treename="t", chunksize=1e6, compression=uproot.LZ4(1), progress=False
):
    """
    Writes ROOT file containing one TTree with the input pandas DataFrame.

    filename: name of output file
    treename: name of output TTree
    chunksize: number of rows per basket
    compression: uproot compression object (LZ4, ZLIB, LZMA, or None)
    progress: show tqdm progress bar?
    """
    tree_dtypes = dict()
    string_branches = []
    category_branches = []
    for bname, dtype in df.dtypes.items():
        if (dtype == "object"):
            if type(df.iloc[0][bname]) in [str]:
                tree_dtypes[bname] = uproot.newbranch(np.dtype(">i2"), size=bname+"_strn", compression=uproot.ZLIB(6))
                string_branches.append(bname)
            else:
                raise Exception(f"Don't know what kind of branch {bname} is.")
        elif (str(dtype) == "category"):
            tree_dtypes[bname] = np.int8
            category_branches.append(bname)
        else:
            tree_dtypes[bname] = dtype
    with uproot.recreate(filename, compression=compression) as f:
        t = uproot.newtree(tree_dtypes)
        f[treename] = t
        for bname in category_branches:
            sep = "<!SEP!>"
            f[f"categories_{bname}"] = sep.join(df[bname].cat.categories)
        chunksize = int(chunksize)
        iterable = range(0, len(df), chunksize)
        if progress:
            from tqdm.auto import tqdm
            iterable = tqdm(iterable)
        for i in iterable:
            chunk = df.iloc[i : i + chunksize]
            basket = dict()
            for column in chunk.columns:
                if column in string_branches:
                    arr = chunk[column].values.astype("str")
                    import awkward
                    jagged = awkward.StringArray.fromnumpy(arr)._content
                    jagged = jagged[jagged != 0]
                    basket[column] = jagged
                    basket[column+"_strn"] = jagged.counts
                elif column in category_branches:
                    basket[column] = chunk[column].cat.codes.values.astype(np.int8)
                else:
                    basket[column] = chunk[column].values
            f[treename].extend(basket)


PandasObject.to_root = to_root

def iter_draw(path, varexp, sel="", treepath="t", progress=False, entrysteps="50MB", **kwargs):
    """
    Same as `tree_draw`, except this requires an additional first argument for the path/pattern
    of input root files. Tree name is specified via `treepath`.
    Iterates over the files in chunks of `entrysteps` (as per `uproot.iterate`), reading
    only the branches deemed necessary according to `find_variables`.
    """
    hists = []
    t0 = time.time()
    nevents = 0
    edges = None
    branches = find_variables(varexp + ":" + sel)
    iterable = enumerate(uproot.iterate(path,
                                  treepath="t",
                                  entrysteps=entrysteps,
                                  branches=branches,
                                  namedecode="ascii",
                                  outputtype=pd.DataFrame,
                                 ))
    if progress:
        from tqdm.auto import tqdm
        iterable = tqdm(iterable)
    for i,df in iterable:
        nevents += len(df)
        if i >= 1 and "bins" not in kwargs:
            kwargs["bins"] = edges
        h = df.draw(varexp, sel, **kwargs)
        edges = h.edges
        hists.append(h)
    h = sum(hists)
    t1 = time.time()
    if progress:
        print(f"Processed {nevents} in {t1-t0:.2f}s ({1e-6*nevents/(t1-t0):.2f}MHz)")
    return h


def find_variables(expr):
    """
    Given a string like "DV_x:DV_y:(lxy < DV_x+1) and (lxy>1)", returns a list of
    ["DV_x", "DV_y", "lxy"]
    (i.e., extracts what seem to be column names)
    """
    from io import BytesIO
    from tokenize import tokenize, NAME

    varnames = []
    g = list(tokenize(BytesIO(expr.encode("utf-8")).readline))
    for ix, x in enumerate(g):
        toknum = x[0]
        tokval = x[1]
        if toknum != NAME:
            continue
        if ix > 0 and g[ix - 1][1] in ["."]:
            continue
        if ix < len(g) - 1 and g[ix + 1][1] in [".", "("]:
            continue
        if tokval in ["and", "or"]:
            continue
        varnames.append(tokval)
    varnames = list(set(varnames))
    return varnames


def hacky_query_eval(df, varstr, selstr="", verbose=False):
    """
    Please don't read/use. This is dangerous and stupid, kind of like
    integrating a function by printing out a plot, coloring the area under it in red,
    faxing it to yourself, then counting red pixels to get the area.

    Basically I wanted some way to convert

        df.query("dimuon_mass > 5 and pass_baseline_iso").eval("dimuon_mass").mean()

    into

        df["dimuon_mass"][ (df["dimuon_mass"] > 5) & (df["pass_baseline_iso"]) ].mean()

    because the latter doesn't make an intermediate copy of all the columns with query(),
    and it also doesn't do jitting with numexpr. In principle, this is much faster to execute.

    Usage:

        arr = hacky_query_eval(
            df_data,
            varstr = "dimuon_mass",
            selstr = "pass_baseline_iso and 0<logabsetaphi<1.25",
        )
        print(arr.mean())
    """
    from pandas.core.computation.expr import Expr
    from pandas.core.computation.scope import Scope

    env = Scope(
        1, global_dict=globals(), local_dict=locals(), resolvers=[df], target=None,
    )

    def inject_df(s):
        """
        convert expression string like (a > 1) to (df["a"] > 1)
        so that it can be eval'd later
        """
        expr = Expr(s, env=env, parser="pandas")
        self = expr._visitor

        def visit_Name_hack(node, **kwargs):
            result = self.term_type(node.id, self.env, **kwargs)
            result._name = f'df["{result._name}"]'
            return result

        def _maybe_downcast_constants_hack(left, right):
            return left, right

        expr._visitor.visit_Name = visit_Name_hack
        expr._visitor._maybe_downcast_constants = _maybe_downcast_constants_hack
        expr.terms = expr.parse()
        return str(expr)

    varexpr = inject_df(varstr)
    toeval = f"({varexpr})"
    if selstr:
        selexpr = str(inject_df(selstr))
        toeval += f"[{selexpr}].values"
    if verbose:
        print(f"Evaluating string: {toeval}")
    result = eval(toeval)
    return result


if __name__ == "__main__":
    import numpy as np

    N = int(1e5)
    # make dataframe
    df = pd.DataFrame(
        dict(
            mass=np.random.normal(3.0, 0.1, N),
            foo=np.random.random(N),
            bar=np.random.random(N),
        )
    )

    # write out 3 columns to a root file
    df.to_root("test.root")

    # read back two columns
    df = pd.read_root("test.root", columns=["mass", "foo"])

    # make 1D histogram of `mass+0.1` for rows where `0.1<foo<0.2`
    # kwargs after first two required args are passed to yahist's Hist1D()
    df.draw("mass + 0.1", "0.1<foo<0.2", bins="200,0,10").plot(histtype="step")
	import time
	import numpy as np
	import pandas as pd
	import uproot
	from yahist import Hist1D, Hist2D
	from pandas.core.base import PandasObject
	from cachetools import cached, LRUCache
	from cachetools.keys import hashkey

	def myhasher(args, *kwargs):
	# pd.DataFrame disabled __hash__ because they are mutable
	hbins = kwargs.pop("bins", 0)
	if "ndarray" in str(type(hbins)):
	hbins = hash(tuple(hbins.tolist()))
	if type(hbins) in [list, tuple] and len(hbins) == 2:
	hbins = hash(map(tuple,hbins))
	return hashkey(args[0]._data, args[1:], hbins, **kwargs)
	@cached(cache=LRUCache(maxsize=256), key=myhasher)
	def tree_draw(df, varexp, sel="", **kwargs):
	from tokenize import tokenize, NAME, OP
	from io import BytesIO

	tokens = tokenize(BytesIO(f"{varexp}${sel}".encode("utf-8")).readline)
	nops = 0
	for x in tokens:
	toknum, tokval = x[:2]
	nops += (toknum == NAME) and (tokval in ["and", "or"])
	nops += toknum == OP
	fast = nops < 5
	twodim = ":" in varexp
	if fast and not twodim:
	raw = hacky_query_eval(df, varexp, sel)
	# performance hit for float16's. copy to float32 first.
	if raw.dtype in ["float16"]:
	vals = raw.astype("float32")
	else:
	vals = raw
	if "weights" in kwargs:
	raise Exception("`weights` kwarg not supported for `fast=True` (yet?)")
	return Hist1D(vals, **kwargs)
	if not sel:
	mask = slice(None)
	else:
	mask = df.eval(sel)
	if twodim:
	assert(varexp.count(":") == 1)
	varexp1, varexp2 = varexp.split(":")
	vals = np.c_[df[mask].eval(varexp1), df[mask].eval(varexp2)]
	return Hist2D(vals, **kwargs)
	else:
	vals = df[mask].eval(varexp)
	return Hist1D(vals, **kwargs)


	PandasObject.draw = tree_draw


	def read_root(filename, treename="t", columns=None, progress=False, **kwargs):
	"""
	Read ROOT file containing one TTree into pandas DataFrame.
	Thin wrapper around `uproot.iterate`.

	filename: filename(s)/file pattern(s)
	treename: name of input TTree
	progress: show tqdm progress bar?
	columns: list of columns ("branches") to read (default of `None` reads all)
	**kwargs: extra kwargs to pass to `uproot.iterate`

	Passing `entrysteps=[(0, 10)]` will read the first 10 rows, for example.
	"""
	# entrysteps of None iterates by basket to match `dataframe_to_ttree`
	iterable = uproot.iterate(
	filename,
	treename,
	columns,
	entrysteps=kwargs.pop("entrysteps", None),
	outputtype=dict,
	namedecode="ascii",
	**kwargs,
	)
	if progress:
	from tqdm.auto import tqdm

	iterable = tqdm(iterable)
	f = uproot.open(filename)
	categorical_columns = [k.decode().split("_",1)[1].rsplit(";",1)[0] for k in f.keys() if k.startswith(b"categories_")]
	def to_df(chunk):
	for column in list(chunk.keys()):
	vals = chunk[column]
	if (vals.dtype == "object") and (column+"_strn" in chunk.keys()):
	del chunk[column+"_strn"]
	import awkward
	chunk[column] = awkward.StringArray.fromjagged(vals.astype("uint8"))
	elif column in categorical_columns:
	sep = "<!SEP!>"
	categories = np.array(f[f"categories_{column}"].decode().split(sep))
	chunk[column] = categories[vals]
	return pd.DataFrame(chunk)
	df = pd.concat(map(to_df, iterable), ignore_index=True, sort=True)
	return df


	setattr(pd, "read_root", read_root)


	def to_root(
	df, filename, treename="t", chunksize=1e6, compression=uproot.LZ4(1), progress=False
	):
	"""
	Writes ROOT file containing one TTree with the input pandas DataFrame.

	filename: name of output file
	treename: name of output TTree
	chunksize: number of rows per basket
	compression: uproot compression object (LZ4, ZLIB, LZMA, or None)
	progress: show tqdm progress bar?
	"""
	tree_dtypes = dict()
	string_branches = []
	category_branches = []
	for bname, dtype in df.dtypes.items():
	if (dtype == "object"):
	if type(df.iloc[0][bname]) in [str]:
	tree_dtypes[bname] = uproot.newbranch(np.dtype(">i2"), size=bname+"_strn", compression=uproot.ZLIB(6))
	string_branches.append(bname)
	else:
	raise Exception(f"Don't know what kind of branch {bname} is.")
	elif (str(dtype) == "category"):
	tree_dtypes[bname] = np.int8
	category_branches.append(bname)
	else:
	tree_dtypes[bname] = dtype
	with uproot.recreate(filename, compression=compression) as f:
	t = uproot.newtree(tree_dtypes)
	f[treename] = t
	for bname in category_branches:
	sep = "<!SEP!>"
	f[f"categories_{bname}"] = sep.join(df[bname].cat.categories)
	chunksize = int(chunksize)
	iterable = range(0, len(df), chunksize)
	if progress:
	from tqdm.auto import tqdm
	iterable = tqdm(iterable)
	for i in iterable:
	chunk = df.iloc[i : i + chunksize]
	basket = dict()
	for column in chunk.columns:
	if column in string_branches:
	arr = chunk[column].values.astype("str")
	import awkward
	jagged = awkward.StringArray.fromnumpy(arr)._content
	jagged = jagged[jagged != 0]
	basket[column] = jagged
	basket[column+"_strn"] = jagged.counts
	elif column in category_branches:
	basket[column] = chunk[column].cat.codes.values.astype(np.int8)
	else:
	basket[column] = chunk[column].values
	f[treename].extend(basket)


	PandasObject.to_root = to_root

	def iter_draw(path, varexp, sel="", treepath="t", progress=False, entrysteps="50MB", **kwargs):
	"""
	Same as `tree_draw`, except this requires an additional first argument for the path/pattern
	of input root files. Tree name is specified via `treepath`.
	Iterates over the files in chunks of `entrysteps` (as per `uproot.iterate`), reading
	only the branches deemed necessary according to `find_variables`.
	"""
	hists = []
	t0 = time.time()
	nevents = 0
	edges = None
	branches = find_variables(varexp + ":" + sel)
	iterable = enumerate(uproot.iterate(path,
	treepath="t",
	entrysteps=entrysteps,
	branches=branches,
	namedecode="ascii",
	outputtype=pd.DataFrame,
	))
	if progress:
	from tqdm.auto import tqdm
	iterable = tqdm(iterable)
	for i,df in iterable:
	nevents += len(df)
	if i >= 1 and "bins" not in kwargs:
	kwargs["bins"] = edges
	h = df.draw(varexp, sel, **kwargs)
	edges = h.edges
	hists.append(h)
	h = sum(hists)
	t1 = time.time()
	if progress:
	print(f"Processed {nevents} in {t1-t0:.2f}s ({1e-6*nevents/(t1-t0):.2f}MHz)")
	return h


	def find_variables(expr):
	"""
	Given a string like "DV_x:DV_y:(lxy < DV_x+1) and (lxy>1)", returns a list of
	["DV_x", "DV_y", "lxy"]
	(i.e., extracts what seem to be column names)
	"""
	from io import BytesIO
	from tokenize import tokenize, NAME

	varnames = []
	g = list(tokenize(BytesIO(expr.encode("utf-8")).readline))
	for ix, x in enumerate(g):
	toknum = x[0]
	tokval = x[1]
	if toknum != NAME:
	continue
	if ix > 0 and g[ix - 1][1] in ["."]:
	continue
	if ix < len(g) - 1 and g[ix + 1][1] in [".", "("]:
	continue
	if tokval in ["and", "or"]:
	continue
	varnames.append(tokval)
	varnames = list(set(varnames))
	return varnames


	def hacky_query_eval(df, varstr, selstr="", verbose=False):
	"""
	Please don't read/use. This is dangerous and stupid, kind of like
	integrating a function by printing out a plot, coloring the area under it in red,
	faxing it to yourself, then counting red pixels to get the area.

	Basically I wanted some way to convert

	df.query("dimuon_mass > 5 and pass_baseline_iso").eval("dimuon_mass").mean()

	into

	df["dimuon_mass"][ (df["dimuon_mass"] > 5) & (df["pass_baseline_iso"]) ].mean()

	because the latter doesn't make an intermediate copy of all the columns with query(),
	and it also doesn't do jitting with numexpr. In principle, this is much faster to execute.

	Usage:

	arr = hacky_query_eval(
	df_data,
	varstr = "dimuon_mass",
	selstr = "pass_baseline_iso and 0<logabsetaphi<1.25",
	)
	print(arr.mean())
	"""
	from pandas.core.computation.expr import Expr
	from pandas.core.computation.scope import Scope

	env = Scope(
	1, global_dict=globals(), local_dict=locals(), resolvers=[df], target=None,
	)

	def inject_df(s):
	"""
	convert expression string like (a > 1) to (df["a"] > 1)
	so that it can be eval'd later
	"""
	expr = Expr(s, env=env, parser="pandas")
	self = expr._visitor

	def visit_Name_hack(node, **kwargs):
	result = self.term_type(node.id, self.env, **kwargs)
	result._name = f'df["{result._name}"]'
	return result

	def _maybe_downcast_constants_hack(left, right):
	return left, right

	expr._visitor.visit_Name = visit_Name_hack
	expr._visitor._maybe_downcast_constants = _maybe_downcast_constants_hack
	expr.terms = expr.parse()
	return str(expr)

	varexpr = inject_df(varstr)
	toeval = f"({varexpr})"
	if selstr:
	selexpr = str(inject_df(selstr))
	toeval += f"[{selexpr}].values"
	if verbose:
	print(f"Evaluating string: {toeval}")
	result = eval(toeval)
	return result


	if __name__ == "__main__":
	import numpy as np

	N = int(1e5)
	# make dataframe
	df = pd.DataFrame(
	dict(
	mass=np.random.normal(3.0, 0.1, N),
	foo=np.random.random(N),
	bar=np.random.random(N),
	)
	)

	# write out 3 columns to a root file
	df.to_root("test.root")

	# read back two columns
	df = pd.read_root("test.root", columns=["mass", "foo"])

	# make 1D histogram of `mass+0.1` for rows where `0.1<foo<0.2`
	# kwargs after first two required args are passed to yahist's Hist1D()
	df.draw("mass + 0.1", "0.1<foo<0.2", bins="200,0,10").plot(histtype="step")