Skip to content

Instantly share code, notes, and snippets.

@riga
Created July 5, 2023 09:18
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save riga/157625f7323e529a60d83ef5bec68c1d to your computer and use it in GitHub Desktop.
Save riga/157625f7323e529a60d83ef5bec68c1d to your computer and use it in GitHub Desktop.
Merge two NanoAOD files while removing duplicate events
#!/usr/bin/env python3
# coding: utf-8
"""
Script that merges the events tree of two NanoAOD files, removing duplicates identified
by event number, run number and luminosity block.
> nano_unique.py in1.root in2.root out.root
NOTE: This is just a first draft whose performance could surely be improved
in case there is an option to skip deserializing all branches with uproot
but still being able to save them in a second file.
"""
from __future__ import annotations
import os
import math
from functools import partial
from typing import Any
import numpy as np
import awkward as ak
import uproot
# optional dependency: tqdm provides progress bars during chunked iteration;
# the script degrades gracefully when it is not installed
try:
    import tqdm
    HAS_TQDM = True
except ImportError:
    HAS_TQDM = False
def nano_unique(
    input_path1: str,
    input_path2: str,
    output_path: str,
    tree_name: str = "Events",
    keep_branches: Any | None = None,
    step_size: int = 100000,
    verbose: bool = False,
) -> tuple[int, int]:
    """
    Joins two NanoAOD files located at *input_path1* and *input_path2*, removes duplicates
    identified by the (event, run, luminosityBlock) triplet, and saves the joined file at
    *output_path*. The output file will only contain a tree named *tree_name*, i.e., any other
    objects contained in one of the input files are dropped. In case a file already exists at
    *output_path*, it is removed first. Please note that events contained in *input_path1* are
    prioritized in case a duplicate is detected.

    The output file is filled in chunks with a certain *step_size*, each one resulting in a new
    basket in the output file. It is recommended to choose this value as large as possible (
    depending on the available memory), to speed up the merging process but also to create files
    that are faster to read. *keep_branches* is forwarded as *filter_name* to
    :py:meth:`uproot.TTree.iterate` to select which branches to keep. If set, the three index
    branches (event, run, luminosityBlock) should be accepted. For more info, see this
    `link <https://uproot.readthedocs.io/en/latest/uproot.behaviors.TTree.TTree.html#arrays>`__.

    The number of written and overlapping events is returned in a 2-tuple.
    """
    # expand environment variables, "~" and relative parts in all paths
    expand = lambda path: os.path.abspath(os.path.expandvars(os.path.expanduser(path)))
    input_path1 = expand(input_path1)
    input_path2 = expand(input_path2)
    output_path = expand(output_path)

    # prepare the output location: ensure the directory exists and remove a stale output file
    output_dir = os.path.dirname(output_path)
    # exist_ok avoids a race between an existence check and the actual creation
    os.makedirs(output_dir, exist_ok=True)
    if os.path.exists(output_path):
        os.remove(output_path)
    output_file = uproot.create(output_path)

    # open the input trees
    tree1 = uproot.open(input_path1)[tree_name]
    tree2 = uproot.open(input_path2)[tree_name]

    # read the index columns over the full first (reference) file
    index_columns = ["event", "run", "luminosityBlock"]
    index1 = tree1.arrays(index_columns)

    # counters returned at the end
    n_written = 0
    n_overlap = 0

    # iteration helper, optionally wrapping the chunk generator in a tqdm progress bar
    def iterate(tree, name):
        if verbose:
            print(f"iterating through {name} tree with {tree.num_entries} events")
        progress = (
            partial(tqdm.tqdm, total=int(math.ceil(tree.num_entries / step_size)))
            if verbose and HAS_TQDM else
            (lambda gen: gen)
        )
        return progress(tree.iterate(step_size=step_size, filter_name=keep_branches))

    # helper that writes a chunk, creating the output tree on first use
    def write_chunk(chunk):
        # workaround: according to the uproot docs, it should be possible to just assign a flat
        #             awkward array to an output file to create a tree; however, it seems like
        #             variable length arrays, although having a standard type (e.g. "var * float32")
        #             are not properly accepted, so convert to a dict of per-branch arrays
        chunk = dict(zip(chunk.fields, ak.unzip(chunk)))
        # end of workaround
        if tree_name in output_file:
            output_file[tree_name].extend(chunk)
        else:
            output_file[tree_name] = chunk

    # fill chunks of the first tree
    # note: if there was a way to properly "update" and extend existing trees with uproot, one
    #       could just copy input_path1 to output_path first and skip this first loop
    for chunk1 in iterate(tree1, "first"):
        n_written += len(chunk1)
        write_chunk(chunk1)

    # fill chunks of the second tree, dropping events already present in the first one
    for chunk2 in iterate(tree2, "second"):
        # mask of events in this chunk that also exist in the first tree
        # NOTE(review): relies on np.isin accepting the record arrays produced by
        # tree.arrays(index_columns); assume_unique requires the (event, run, luminosityBlock)
        # triplets to be unique within each input file — TODO confirm both for actual inputs
        mask2 = np.isin(chunk2[index_columns], index1, assume_unique=True)
        chunk2 = chunk2[~mask2]

        # update counts
        n_written += len(chunk2)
        n_overlap += ak.sum(mask2)

        # skip the chunk if all of its events are overlapping
        if ak.all(mask2):
            continue

        # bugfix: the output tree was previously extended unconditionally here, which raised a
        # KeyError when the first tree contained no events and therefore never created the tree;
        # write_chunk creates it on demand
        write_chunk(chunk2)

    if verbose:
        print(f"written {n_written} and found {n_overlap} overlapping event(s)")

    return n_written, n_overlap
if __name__ == "__main__":
    import argparse

    # command line interface
    parser = argparse.ArgumentParser(
        description="joins two NanoAOD files and removes duplicate events",
    )
    parser.add_argument("file1", help="path to the first file")
    parser.add_argument("file2", help="path to the second file")
    parser.add_argument("output", help="path to the output file to be created")
    parser.add_argument(
        "--tree", "-t",
        default="Events",
        help="name of the trees to merge and create; default: Events",
    )
    parser.add_argument(
        "--step-size", "-s",
        type=int,
        default=100000,
        help="step size for iterations; default: 100000",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="verbose output, potentially with tqdm if installed",
    )
    args = parser.parse_args()

    # run the merging
    nano_unique(
        args.file1,
        args.file2,
        args.output,
        tree_name=args.tree,
        step_size=args.step_size,
        verbose=args.verbose,
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment