Skip to content

Instantly share code, notes, and snippets.

@aminnj
Last active November 22, 2020 05:37
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aminnj/7d27376886a2a5414b61841aa7457e8a to your computer and use it in GitHub Desktop.
Save aminnj/7d27376886a2a5414b61841aa7457e8a to your computer and use it in GitHub Desktop.
utilities for pandas + ROOT (via monkeypatching pandas)
import time
import numpy as np
import pandas as pd
import uproot
from yahist import Hist1D, Hist2D
from pandas.core.base import PandasObject
from cachetools import cached, LRUCache
from cachetools.keys import hashkey
def myhasher(*args, **kwargs):
# pd.DataFrame disabled __hash__ because they are mutable
hbins = kwargs.pop("bins", 0)
if "ndarray" in str(type(hbins)):
hbins = hash(tuple(hbins.tolist()))
if type(hbins) in [list, tuple] and len(hbins) == 2:
hbins = hash(map(tuple,hbins))
return hashkey(args[0]._data, args[1:], hbins, **kwargs)
@cached(cache=LRUCache(maxsize=256), key=myhasher)
def tree_draw(df, varexp, sel="", **kwargs):
from tokenize import tokenize, NAME, OP
from io import BytesIO
tokens = tokenize(BytesIO(f"{varexp}${sel}".encode("utf-8")).readline)
nops = 0
for x in tokens:
toknum, tokval = x[:2]
nops += (toknum == NAME) and (tokval in ["and", "or"])
nops += toknum == OP
fast = nops < 5
twodim = ":" in varexp
if fast and not twodim:
raw = hacky_query_eval(df, varexp, sel)
# performance hit for float16's. copy to float32 first.
if raw.dtype in ["float16"]:
vals = raw.astype("float32")
else:
vals = raw
if "weights" in kwargs:
raise Exception("`weights` kwarg not supported for `fast=True` (yet?)")
return Hist1D(vals, **kwargs)
if not sel:
mask = slice(None)
else:
mask = df.eval(sel)
if twodim:
assert(varexp.count(":") == 1)
varexp1, varexp2 = varexp.split(":")
vals = np.c_[df[mask].eval(varexp1), df[mask].eval(varexp2)]
return Hist2D(vals, **kwargs)
else:
vals = df[mask].eval(varexp)
return Hist1D(vals, **kwargs)
PandasObject.draw = tree_draw
def read_root(filename, treename="t", columns=None, progress=False, **kwargs):
"""
Read ROOT file containing one TTree into pandas DataFrame.
Thin wrapper around `uproot.iterate`.
filename: filename(s)/file pattern(s)
treename: name of input TTree
progress: show tqdm progress bar?
columns: list of columns ("branches") to read (default of `None` reads all)
**kwargs: extra kwargs to pass to `uproot.iterate`
Passing `entrysteps=[(0, 10)]` will read the first 10 rows, for example.
"""
# entrysteps of None iterates by basket to match `dataframe_to_ttree`
iterable = uproot.iterate(
filename,
treename,
columns,
entrysteps=kwargs.pop("entrysteps", None),
outputtype=dict,
namedecode="ascii",
**kwargs,
)
if progress:
from tqdm.auto import tqdm
iterable = tqdm(iterable)
f = uproot.open(filename)
categorical_columns = [k.decode().split("_",1)[1].rsplit(";",1)[0] for k in f.keys() if k.startswith(b"categories_")]
def to_df(chunk):
for column in list(chunk.keys()):
vals = chunk[column]
if (vals.dtype == "object") and (column+"_strn" in chunk.keys()):
del chunk[column+"_strn"]
import awkward
chunk[column] = awkward.StringArray.fromjagged(vals.astype("uint8"))
elif column in categorical_columns:
sep = "<!SEP!>"
categories = np.array(f[f"categories_{column}"].decode().split(sep))
chunk[column] = categories[vals]
return pd.DataFrame(chunk)
df = pd.concat(map(to_df, iterable), ignore_index=True, sort=True)
return df
setattr(pd, "read_root", read_root)
def to_root(
df, filename, treename="t", chunksize=1e6, compression=uproot.LZ4(1), progress=False
):
"""
Writes ROOT file containing one TTree with the input pandas DataFrame.
filename: name of output file
treename: name of output TTree
chunksize: number of rows per basket
compression: uproot compression object (LZ4, ZLIB, LZMA, or None)
progress: show tqdm progress bar?
"""
tree_dtypes = dict()
string_branches = []
category_branches = []
for bname, dtype in df.dtypes.items():
if (dtype == "object"):
if type(df.iloc[0][bname]) in [str]:
tree_dtypes[bname] = uproot.newbranch(np.dtype(">i2"), size=bname+"_strn", compression=uproot.ZLIB(6))
string_branches.append(bname)
else:
raise Exception(f"Don't know what kind of branch {bname} is.")
elif (str(dtype) == "category"):
tree_dtypes[bname] = np.int8
category_branches.append(bname)
else:
tree_dtypes[bname] = dtype
with uproot.recreate(filename, compression=compression) as f:
t = uproot.newtree(tree_dtypes)
f[treename] = t
for bname in category_branches:
sep = "<!SEP!>"
f[f"categories_{bname}"] = sep.join(df[bname].cat.categories)
chunksize = int(chunksize)
iterable = range(0, len(df), chunksize)
if progress:
from tqdm.auto import tqdm
iterable = tqdm(iterable)
for i in iterable:
chunk = df.iloc[i : i + chunksize]
basket = dict()
for column in chunk.columns:
if column in string_branches:
arr = chunk[column].values.astype("str")
import awkward
jagged = awkward.StringArray.fromnumpy(arr)._content
jagged = jagged[jagged != 0]
basket[column] = jagged
basket[column+"_strn"] = jagged.counts
elif column in category_branches:
basket[column] = chunk[column].cat.codes.values.astype(np.int8)
else:
basket[column] = chunk[column].values
f[treename].extend(basket)
PandasObject.to_root = to_root
def iter_draw(path, varexp, sel="", treepath="t", progress=False, entrysteps="50MB", **kwargs):
"""
Same as `tree_draw`, except this requires an additional first argument for the path/pattern
of input root files. Tree name is specified via `treepath`.
Iterates over the files in chunks of `entrysteps` (as per `uproot.iterate`), reading
only the branches deemed necessary according to `find_variables`.
"""
hists = []
t0 = time.time()
nevents = 0
edges = None
branches = find_variables(varexp + ":" + sel)
iterable = enumerate(uproot.iterate(path,
treepath="t",
entrysteps=entrysteps,
branches=branches,
namedecode="ascii",
outputtype=pd.DataFrame,
))
if progress:
from tqdm.auto import tqdm
iterable = tqdm(iterable)
for i,df in iterable:
nevents += len(df)
if i >= 1 and "bins" not in kwargs:
kwargs["bins"] = edges
h = df.draw(varexp, sel, **kwargs)
edges = h.edges
hists.append(h)
h = sum(hists)
t1 = time.time()
if progress:
print(f"Processed {nevents} in {t1-t0:.2f}s ({1e-6*nevents/(t1-t0):.2f}MHz)")
return h
def find_variables(expr):
"""
Given a string like "DV_x:DV_y:(lxy < DV_x+1) and (lxy>1)", returns a list of
["DV_x", "DV_y", "lxy"]
(i.e., extracts what seem to be column names)
"""
from io import BytesIO
from tokenize import tokenize, NAME
varnames = []
g = list(tokenize(BytesIO(expr.encode("utf-8")).readline))
for ix, x in enumerate(g):
toknum = x[0]
tokval = x[1]
if toknum != NAME:
continue
if ix > 0 and g[ix - 1][1] in ["."]:
continue
if ix < len(g) - 1 and g[ix + 1][1] in [".", "("]:
continue
if tokval in ["and", "or"]:
continue
varnames.append(tokval)
varnames = list(set(varnames))
return varnames
def hacky_query_eval(df, varstr, selstr="", verbose=False):
"""
Please don't read/use. This is dangerous and stupid, kind of like
integrating a function by printing out a plot, coloring the area under it in red,
faxing it to yourself, then counting red pixels to get the area.
Basically I wanted some way to convert
df.query("dimuon_mass > 5 and pass_baseline_iso").eval("dimuon_mass").mean()
into
df["dimuon_mass"][ (df["dimuon_mass"] > 5) & (df["pass_baseline_iso"]) ].mean()
because the latter doesn't make an intermediate copy of all the columns with query(),
and it also doesn't do jitting with numexpr. In principle, this is much faster to execute.
Usage:
arr = hacky_query_eval(
df_data,
varstr = "dimuon_mass",
selstr = "pass_baseline_iso and 0<logabsetaphi<1.25",
)
print(arr.mean())
"""
from pandas.core.computation.expr import Expr
from pandas.core.computation.scope import Scope
env = Scope(
1, global_dict=globals(), local_dict=locals(), resolvers=[df], target=None,
)
def inject_df(s):
"""
convert expression string like (a > 1) to (df["a"] > 1)
so that it can be eval'd later
"""
expr = Expr(s, env=env, parser="pandas")
self = expr._visitor
def visit_Name_hack(node, **kwargs):
result = self.term_type(node.id, self.env, **kwargs)
result._name = f'df["{result._name}"]'
return result
def _maybe_downcast_constants_hack(left, right):
return left, right
expr._visitor.visit_Name = visit_Name_hack
expr._visitor._maybe_downcast_constants = _maybe_downcast_constants_hack
expr.terms = expr.parse()
return str(expr)
varexpr = inject_df(varstr)
toeval = f"({varexpr})"
if selstr:
selexpr = str(inject_df(selstr))
toeval += f"[{selexpr}].values"
if verbose:
print(f"Evaluating string: {toeval}")
result = eval(toeval)
return result
if __name__ == "__main__":
import numpy as np
N = int(1e5)
# make dataframe
df = pd.DataFrame(
dict(
mass=np.random.normal(3.0, 0.1, N),
foo=np.random.random(N),
bar=np.random.random(N),
)
)
# write out 3 columns to a root file
df.to_root("test.root")
# read back two columns
df = pd.read_root("test.root", columns=["mass", "foo"])
# make 1D histogram of `mass+0.1` for rows where `0.1<foo<0.2`
# kwargs after first two required args are passed to yahist's Hist1D()
df.draw("mass + 0.1", "0.1<foo<0.2", bins="200,0,10").plot(histtype="step")
@aminnj
Copy link
Author

aminnj commented Nov 18, 2020

Usage:

pip install pandas uproot awkward yahist
curl -s -o heputils.py https://gist.githubusercontent.com/aminnj/7d27376886a2a5414b61841aa7457e8a/raw

then

import heputils

...will add some functions to pandas namespace and for dataframes:

Write out dataframes to a ROOT file (including strings, though these are slow and you should .astype("category") if possible):

import numpy as np
N = int(1e5)
df = pd.DataFrame(dict(mass=np.random.normal(3.0, 0.1, N), foo=np.random.random(N), bar=np.random.random(N)))
df.to_root("test.root")

Read ROOT files and specify certain columns and/or a range of rows. Let's just say that ROOT is the original parquet (just with different words: rows -> entries/events, rowgroups -> baskets, columns -> branches).

df = pd.read_root("test*.root", columns=["mass", "foo"])

And, for those familiar with TTree::Draw(), you can draw directly from a dataframe. This will make a 1D histogram of mass+0.1 for rows where 0.1<foo<0.2. All kwargs after first two required args are passed to yahist's Hist1D(). See pip install yahist for details on the (nice and simple) histogram object.

df.draw("mass+0.1", "0.1<foo<0.2", bins="200,0,10").plot(histtype="step")

or 2D with "x:y"

df.draw("mass:foo+1", "0.1<foo<0.2").plot(logz=True)

Lastly, this will read root files and make a histogram in one pass (chunked reading and only branches that are needed)

heputils.iter_draw("*.root", "mass", "(foo>0.2)", bins="200,0,10")

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment