Last active
January 10, 2020 14:12
-
-
Save lgray/dbb545716d97528bbd09933929ff5a56 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pyarrow as pa | |
import pyarrow.parquet as pq | |
def _branch_to_arrow(array, name):
    """Convert one branch array read from the ``Events`` tree to pyarrow.

    Args:
        array: the array returned by ``events.arrays([key])[key]``.
        name: output column name, used only in the error message.

    Returns:
        A pyarrow array built from ``array``.

    Raises:
        ValueError: if ``array`` is not an ``np.ndarray``, ``awk.JaggedArray``
            or ``awk.ObjectArray``.
    """
    if isinstance(array, np.ndarray):
        shape = array.shape
        if len(shape) == 1:
            return pa.array(array)
        # Two-dimensional case: every row becomes a list of shape[1] entries.
        row_counts = np.full(shape[0], shape[1])
        return pa.array(awk.JaggedArray.fromcounts(row_counts, array.flatten()))

    if isinstance(array, awk.JaggedArray):
        content_shape = array.content.shape
        if len(content_shape) == 1:
            return pa.array(array)
        # The jagged content is itself two-dimensional: wrap each content row
        # in an inner list first, then regroup with the original counts.
        bit_counts = np.full(content_shape[0], content_shape[1])
        inner = awk.JaggedArray.fromcounts(bit_counts, array.content.flatten())
        outer = awk.JaggedArray.fromcounts(array.counts, inner)
        return pa.ListArray.from_arrays(outer.offsets, outer.flatten())

    if isinstance(array, awk.ObjectArray):
        # This only deals with one nested layer of jaggedness (sufficient for
        # now). Uses the same ``awk`` alias as every other awkward reference
        # in this block (the original spelled this one call ``awkward.``).
        jagged = awk.fromiter(array)
        return pa.ListArray.from_arrays(jagged.offsets, jagged.flatten())

    # Without this, the column name would be recorded with no matching data
    # and pa.Table.from_arrays would fail later with a length-mismatch
    # ValueError that does not say which branch caused it.
    raise ValueError(
        "cannot convert branch %r: unsupported array type %s"
        % (name, type(array).__name__)
    )


def nanoaod2arrowtable(params):
    """Read the ``Events`` tree of one ROOT file into a single arrow table.

    Every key of ``Events`` becomes one column. If a decoded key name was
    already used, the column is named ``<name>_<k>`` where ``k`` counts the
    repeats of that name (``_1`` for the second occurrence, and so on).

    NOTE(review): this block relies on the module-level names ``random``,
    ``uproot``, ``np`` and ``awk``; none of them is imported in the visible
    part of the file -- confirm they are in scope.

    Args:
        params: indexable whose first element is the input file, passed
            straight to ``uproot.open``. Further elements (the original read
            ``params[1]`` as an output dataset) are accepted but unused.

    Returns:
        ``(table, 1)`` on success. If the file has no keys or contains no
        ``Events`` entry, the bare integer ``0`` is returned instead of a
        tuple; that shape is kept for backward compatibility.

    Raises:
        ValueError: if a branch holds an array type that cannot be converted.
    """
    # seed(None) reseeds the global generator from system entropy/time.
    # Presumably so that parallel workers do not share RNG state -- TODO
    # confirm; nothing in this function draws random numbers itself.
    random.seed(None)

    input_file = params[0]
    n_processed_files = 0

    with uproot.open(input_file) as rootf:
        if not list(rootf.keys()) or "Events" not in rootf:
            return n_processed_files
        events = rootf["Events"]

        the_data = []
        the_names = []
        used_names = set()  # mirrors the_names for O(1) duplicate checks
        dupes = {}
        for keyname in events.keys():
            # Branches are read one at a time and released after conversion.
            array = events.arrays([keyname])[keyname]
            outname = keyname.decode()
            if outname in used_names:
                dupes[outname] = dupes.get(outname, 0) + 1
                outname = outname + "_" + str(dupes[outname])

            # Convert before recording the name so names and data can never
            # get out of step.
            the_data.append(_branch_to_arrow(array, outname))
            the_names.append(outname)
            used_names.add(outname)
            del array

        tbl = pa.Table.from_arrays(arrays=the_data, names=the_names)

    n_processed_files += 1
    return tbl, n_processed_files
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment