# lgray/nanoaod2parquet.py

## nanoaod2parquet.py
import pyarrow as pa
import pyarrow.parquet as pq

def nanoaod2arrowtable(params):
    """
    Convert the "Events" tree of one root file of ~flat ntuples into a
    single arrow table.

    params is indexed positionally. params[0] is the input file: either a
    single file name or a list/tuple holding exactly one file name.
    params[1] (the output dataset) is accepted but not used here.

    Returns (table, n_processed_files) on success, where table is built by
    pa.Table.from_arrays with one column per key of "Events". A key whose
    decoded name is already taken gets a "_<n>" suffix so that column names
    stay unique.

    NOTE: when the file has no keys or no "Events" entry, only the bare
    integer n_processed_files (0) is returned instead of a tuple. This is
    kept as-is for existing callers, so check the result before unpacking.

    Raises ValueError if params[0] does not name exactly one file, or if a
    branch comes back as a type none of the conversions below handle.

    NOTE(review): besides `pa` this relies on the module-level names
    `random`, `uproot`, `np` and `awk`; only the pyarrow imports are
    visible next to this function -- confirm the others are imported.
    """

    # Nothing below draws random numbers; the re-seed is kept as a side
    # effect (presumably for worker processes -- TODO confirm).
    random.seed(None)

    source = params[0]
    if isinstance(source, (list, tuple)):
        input_files = list(source)
    else:
        input_files = [source]
    if len(input_files) != 1:
        raise ValueError(
            "expected exactly one input file, got %d" % len(input_files)
        )

    n_processed_files = 0
    tbl = None

    for filename in input_files:
        with uproot.open(filename) as rootf:
            if not list(rootf.keys()) or "Events" not in rootf:
                return n_processed_files
            events = rootf["Events"]

            columns = []
            names = []
            used_names = set()  # O(1) membership test for de-duplication
            dupes = {}
            for keyname in events.keys():
                array = events.arrays([keyname])[keyname]

                base = keyname.decode()
                outname = base
                # Keep bumping the suffix until the name is really free, so
                # a generated "x_1" cannot collide with a branch that is
                # literally called "x_1".
                while outname in used_names:
                    dupes[base] = dupes.get(base, 0) + 1
                    outname = base + '_' + str(dupes[base])

                if isinstance(array, np.ndarray):
                    arrshape = array.shape
                    if len(arrshape) == 1:
                        column = pa.array(array)
                    else:
                        # Fixed-width rows: every event gets a list of
                        # arrshape[1] values.
                        counts = np.full(arrshape[0], arrshape[1])
                        jagged = awk.JaggedArray.fromcounts(counts, array.flatten())
                        column = pa.array(jagged)
                elif isinstance(array, awk.JaggedArray):
                    contshape = array.content.shape
                    if len(contshape) == 1:
                        column = pa.array(array)
                    else:
                        # Jagged array whose content rows are themselves
                        # fixed-width: build list-of-list offsets by hand.
                        bitcounts = np.full(contshape[0], contshape[1])
                        inner = awk.JaggedArray.fromcounts(bitcounts, array.content.flatten())
                        outer = awk.JaggedArray.fromcounts(array.counts, inner)
                        column = pa.ListArray.from_arrays(outer.offsets, outer.flatten())
                elif isinstance(array, awk.ObjectArray):
                    # this only deals with one nested layer of jaggedness
                    # (sufficient for now)
                    nested = awk.fromiter(array)
                    column = pa.ListArray.from_arrays(nested.offsets, nested.flatten())
                else:
                    # Previously the name was recorded without any data,
                    # leaving names and arrays misaligned; fail clearly.
                    raise ValueError(
                        "cannot convert branch %r of type %s"
                        % (outname, type(array).__name__)
                    )

                # Name and data are recorded together so they stay aligned.
                names.append(outname)
                used_names.add(outname)
                columns.append(column)
                # Drop the branch data before the next branch is read.
                del array, column

            tbl = pa.Table.from_arrays(arrays=columns, names=names)

            n_processed_files += 1

    return tbl, n_processed_files
	import pyarrow as pa
	import pyarrow.parquet as pq

	def nanoaod2arrowtable(params):
		"""
		Convert the "Events" tree of one root file of ~flat ntuples into a
		single arrow table.

		params is indexed positionally. params[0] is the input file: either a
		single file name or a list/tuple holding exactly one file name.
		params[1] (the output dataset) is accepted but not used here.

		Returns (table, n_processed_files) on success, where table is built by
		pa.Table.from_arrays with one column per key of "Events". A key whose
		decoded name is already taken gets a "_<n>" suffix so that column names
		stay unique.

		NOTE: when the file has no keys or no "Events" entry, only the bare
		integer n_processed_files (0) is returned instead of a tuple. This is
		kept as-is for existing callers, so check the result before unpacking.

		Raises ValueError if params[0] does not name exactly one file, or if a
		branch comes back as a type none of the conversions below handle.

		NOTE(review): besides `pa` this relies on the names `random`,
		`uproot`, `np` and `awk`; only the pyarrow imports are visible next
		to this function -- confirm the others are imported.
		"""

		# Nothing below draws random numbers; the re-seed is kept as a side
		# effect (presumably for worker processes -- TODO confirm).
		random.seed(None)

		source = params[0]
		if isinstance(source, (list, tuple)):
			input_files = list(source)
		else:
			input_files = [source]
		if len(input_files) != 1:
			raise ValueError(
				"expected exactly one input file, got %d" % len(input_files)
			)

		n_processed_files = 0
		tbl = None

		for filename in input_files:
			with uproot.open(filename) as rootf:
				if not list(rootf.keys()) or "Events" not in rootf:
					return n_processed_files
				events = rootf["Events"]

				columns = []
				names = []
				used_names = set()  # O(1) membership test for de-duplication
				dupes = {}
				for keyname in events.keys():
					array = events.arrays([keyname])[keyname]

					base = keyname.decode()
					outname = base
					# Keep bumping the suffix until the name is really free,
					# so a generated "x_1" cannot collide with a branch that
					# is literally called "x_1".
					while outname in used_names:
						dupes[base] = dupes.get(base, 0) + 1
						outname = base + '_' + str(dupes[base])

					if isinstance(array, np.ndarray):
						arrshape = array.shape
						if len(arrshape) == 1:
							column = pa.array(array)
						else:
							# Fixed-width rows: every event gets a list of
							# arrshape[1] values.
							counts = np.full(arrshape[0], arrshape[1])
							jagged = awk.JaggedArray.fromcounts(counts, array.flatten())
							column = pa.array(jagged)
					elif isinstance(array, awk.JaggedArray):
						contshape = array.content.shape
						if len(contshape) == 1:
							column = pa.array(array)
						else:
							# Jagged array whose content rows are themselves
							# fixed-width: build list-of-list offsets by hand.
							bitcounts = np.full(contshape[0], contshape[1])
							inner = awk.JaggedArray.fromcounts(bitcounts, array.content.flatten())
							outer = awk.JaggedArray.fromcounts(array.counts, inner)
							column = pa.ListArray.from_arrays(outer.offsets, outer.flatten())
					elif isinstance(array, awk.ObjectArray):
						# this only deals with one nested layer of jaggedness
						# (sufficient for now)
						nested = awk.fromiter(array)
						column = pa.ListArray.from_arrays(nested.offsets, nested.flatten())
					else:
						# Previously the name was recorded without any data,
						# leaving names and arrays misaligned; fail clearly.
						raise ValueError(
							"cannot convert branch %r of type %s"
							% (outname, type(array).__name__)
						)

					# Name and data are recorded together so they stay aligned.
					names.append(outname)
					used_names.add(outname)
					columns.append(column)
					# Drop the branch data before the next branch is read.
					del array, column

				tbl = pa.Table.from_arrays(arrays=columns, names=names)

				n_processed_files += 1

		return tbl, n_processed_files