Skip to content

Instantly share code, notes, and snippets.

@lgray
Last active January 10, 2020 14:12
Show Gist options
  • Save lgray/dbb545716d97528bbd09933929ff5a56 to your computer and use it in GitHub Desktop.
Save lgray/dbb545716d97528bbd09933929ff5a56 to your computer and use it in GitHub Desktop.
import pyarrow as pa
import pyarrow.parquet as pq
def _unique_column_name(name, used_names, dupe_counts):
    """
    return `name` unchanged if it is not in `used_names`, otherwise
    `name` + '_' + <n>, where <n> counts the collisions seen so far for
    that name. `dupe_counts` is updated in place; `used_names` is only read.
    """
    if name not in used_names:
        return name
    dupe_counts[name] = dupe_counts.get(name, 0) + 1
    return name + '_' + str(dupe_counts[name])


def _branch_to_arrow(array, branch_name):
    """
    convert the in-memory contents of one branch into an arrow array

    handles numpy arrays, awk.JaggedArray and awk.ObjectArray; any other
    type raises ValueError naming the offending branch.
    """
    if isinstance(array, np.ndarray):
        if len(array.shape) == 1:
            return pa.array(array)
        # 2D array: regroup the flattened values into one list of
        # `nbits` entries per event
        nevents = array.shape[0]
        nbits = array.shape[1]
        counts = np.full(nevents, nbits)
        return pa.array(awk.JaggedArray.fromcounts(counts, array.flatten()))

    if isinstance(array, awk.JaggedArray):
        contshape = array.content.shape
        if len(contshape) == 1:
            return pa.array(array)
        # 2D content: first group the flattened values per object, then
        # group the objects per event using the original counts
        nobject = contshape[0]
        nbits = contshape[1]
        bitcounts = np.full(nobject, nbits)
        inner = awk.JaggedArray.fromcounts(bitcounts, array.content.flatten())
        outer = awk.JaggedArray.fromcounts(array.counts, inner)
        return pa.ListArray.from_arrays(outer.offsets, outer.flatten())

    if isinstance(array, awk.ObjectArray):
        # this only deals with one nested layer of jaggedness (sufficient for now)
        # `awk` is used here (the original said `awkward`) to match the
        # alias used everywhere else in this function
        temp = awk.fromiter(array)
        return pa.ListArray.from_arrays(temp.offsets, temp.flatten())

    # Previously such a branch got a column name but no data, leaving the
    # name and data lists out of step until the table was built.
    raise ValueError(
        "branch %r has unsupported array type %s"
        % (branch_name, type(array).__name__)
    )


def nanoaod2arrowtable(params):
    """
    takes as input one root file of ~flat ntuples and converts its
    "Events" tree into a single arrow table

    params: indexable; params[0] is the input file handed to uproot.open.
            Any further entries are ignored (params[1] used to be read as
            an output dataset name but was never used).

    returns (table, 1) on success. If the file has no keys or no "Events"
    entry, returns the bare integer 0 instead.
    NOTE(review): the early exit returns an int while the success path
    returns a tuple, so callers that unpack the result must check first;
    left unchanged because callers may test for it.

    raises ValueError if a branch holds an array type that cannot be
    converted.

    Columns are named after the decoded branch keys (keys must support
    .decode()); a repeated name gets a '_<n>' suffix.

    NOTE(review): relies on the module-level names random, uproot, np and
    awk, none of which are imported in the visible part of this file.
    """
    # Side effect kept from the original: reseed the global RNG.
    # NOTE(review): presumably so worker processes do not share RNG state
    # -- confirm with the caller.
    random.seed(None)

    input_file = params[0]
    n_processed_files = 0

    with uproot.open(input_file) as rootf:
        if not list(rootf.keys()) or "Events" not in rootf:
            return n_processed_files
        events = rootf["Events"]

        the_data = []
        the_names = []
        used_names = set()  # mirrors the_names for constant-time lookups
        dupes = {}
        for keyname in events.keys():
            array = events.arrays([keyname])[keyname]
            outname = _unique_column_name(keyname.decode(), used_names, dupes)
            column = _branch_to_arrow(array, outname)
            # drop the reference before the next branch is read
            del array
            # name and data are appended together so they cannot get out of step
            used_names.add(outname)
            the_names.append(outname)
            the_data.append(column)

        tbl = pa.Table.from_arrays(arrays=the_data, names=the_names)

    n_processed_files += 1
    return tbl, n_processed_files
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment