Skip to content

Instantly share code, notes, and snippets.

@riga
Created July 5, 2023 09:18
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save riga/157625f7323e529a60d83ef5bec68c1d to your computer and use it in GitHub Desktop.
Save riga/157625f7323e529a60d83ef5bec68c1d to your computer and use it in GitHub Desktop.
Merge two NanoAOD files while removing duplicate events
#!/usr/bin/env python3
# coding: utf-8
"""
Script that merges the events tree of two NanoAOD files, removing duplicates identified
by event number, run number and luminosity block.
> nano_unique.py in1.root in2.root out.root
NOTE: This is just a first draft whose performance could surely be improved
in case there is an option to skip deserializing all branches with uproot
but still being able to save them in a second file.
"""
from __future__ import annotations
import os
import math
from functools import partial
from typing import Any
import numpy as np
import awkward as ak
import uproot
# optional dependency: tqdm provides progress bars during chunked iteration;
# the script degrades gracefully when it is not installed
try:
    import tqdm
    HAS_TQDM = True
except ImportError:
    HAS_TQDM = False
def nano_unique(
    input_path1: str,
    input_path2: str,
    output_path: str,
    tree_name: str = "Events",
    keep_branches: Any | None = None,
    step_size: int = 100000,
    verbose: bool = False,
) -> tuple[int, int]:
    """
    Joins two NanoAOD files located at *input_path1* and *input_path2*, removes duplicates
    identified by the (event, run, luminosityBlock) triplet, and saves the joined file at
    *output_path*. The output file will only contain a tree named *tree_name*, i.e., any other
    objects contained in one of the input files are dropped. In case a file already exists at
    *output_path*, it is removed first. Please note that events contained in *input_path1* are
    prioritized in case a duplicate is detected.

    The output file is filled in chunks with a certain *step_size*, each one resulting in a new
    basket in the output file. It is recommended to choose this value as large as possible (
    depending on the available memory), to speed up the merging process but also to create files
    that are faster to read. *keep_branches* is forwarded as *filter_name* to
    :py:meth:`uproot.TTree.iterate` to select which branches to keep. If set, the three index
    branches (event, run, luminosityBlock) should be accepted. For more info, see this
    `link <https://uproot.readthedocs.io/en/latest/uproot.behaviors.TTree.TTree.html#arrays>`__.

    The number of written and overlapping events is returned in a 2-tuple.
    """
    # expand environment variables, "~" and relative parts in all paths
    expand = lambda path: os.path.abspath(os.path.expandvars(os.path.expanduser(path)))
    input_path1 = expand(input_path1)
    input_path2 = expand(input_path2)
    output_path = expand(output_path)

    # prepare the output location: ensure the directory exists and remove a stale output file
    output_dir = os.path.dirname(output_path)
    # exist_ok avoids a race between an existence check and the actual creation
    os.makedirs(output_dir, exist_ok=True)
    if os.path.exists(output_path):
        os.remove(output_path)
    output_file = uproot.create(output_path)

    # open the input trees
    tree1 = uproot.open(input_path1)[tree_name]
    tree2 = uproot.open(input_path2)[tree_name]

    # read the index columns over the full first (reference) file
    index_columns = ["event", "run", "luminosityBlock"]
    index1 = tree1.arrays(index_columns)

    # counters returned at the end
    n_written = 0
    n_overlap = 0

    # iteration helper, optionally wrapping the chunk generator in a tqdm progress bar
    def iterate(tree, name):
        if verbose:
            print(f"iterating through {name} tree with {tree.num_entries} events")
        progress = (
            partial(tqdm.tqdm, total=int(math.ceil(tree.num_entries / step_size)))
            if verbose and HAS_TQDM else
            (lambda gen: gen)
        )
        return progress(tree.iterate(step_size=step_size, filter_name=keep_branches))

    # helper that writes a chunk, creating the output tree on first use
    def write_chunk(chunk):
        # workaround: according to the uproot docs, it should be possible to just assign a flat
        #             awkward array to an output file to create a tree; however, it seems like
        #             variable length arrays, although having a standard type (e.g. "var * float32")
        #             are not properly accepted, so convert to a dict of per-branch arrays
        chunk = dict(zip(chunk.fields, ak.unzip(chunk)))
        # end of workaround
        if tree_name in output_file:
            output_file[tree_name].extend(chunk)
        else:
            output_file[tree_name] = chunk

    # fill chunks of the first tree
    # note: if there was a way to properly "update" and extend existing trees with uproot, one
    #       could just copy input_path1 to output_path first and skip this first loop
    for chunk1 in iterate(tree1, "first"):
        n_written += len(chunk1)
        write_chunk(chunk1)

    # fill chunks of the second tree, dropping events already present in the first one
    for chunk2 in iterate(tree2, "second"):
        # mask of events in this chunk that also exist in the first tree
        # NOTE(review): relies on np.isin accepting the record arrays produced by
        # tree.arrays(index_columns); assume_unique requires the (event, run, luminosityBlock)
        # triplets to be unique within each input file — TODO confirm both for actual inputs
        mask2 = np.isin(chunk2[index_columns], index1, assume_unique=True)
        chunk2 = chunk2[~mask2]

        # update counts
        n_written += len(chunk2)
        n_overlap += ak.sum(mask2)

        # skip the chunk if all of its events are overlapping
        if ak.all(mask2):
            continue

        # bugfix: the output tree was previously extended unconditionally here, which raised a
        # KeyError when the first tree contained no events and therefore never created the tree;
        # write_chunk creates it on demand
        write_chunk(chunk2)

    if verbose:
        print(f"written {n_written} and found {n_overlap} overlapping event(s)")

    return n_written, n_overlap
if __name__ == "__main__":
    import argparse

    # command line interface
    parser = argparse.ArgumentParser(
        description="joins two NanoAOD files and removes duplicate events",
    )
    parser.add_argument("file1", help="path to the first file")
    parser.add_argument("file2", help="path to the second file")
    parser.add_argument("output", help="path to the output file to be created")
    parser.add_argument(
        "--tree", "-t",
        default="Events",
        help="name of the trees to merge and create; default: Events",
    )
    parser.add_argument(
        "--step-size", "-s",
        type=int,
        default=100000,
        help="step size for iterations; default: 100000",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="verbose output, potentially with tqdm if installed",
    )
    args = parser.parse_args()

    # run the merging
    nano_unique(
        args.file1,
        args.file2,
        args.output,
        tree_name=args.tree,
        step_size=args.step_size,
        verbose=args.verbose,
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment