Created
June 3, 2016 16:47
-
-
Save jpata/00887e8f3786877b70f63fc5908c6e9d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import dask | |
import sys, glob | |
import numpy as np | |
def fixPath():
    """Return a filtered copy of ``sys.path``.

    An entry is dropped only when it contains both the substring ``"cvmfs"``
    and the substring ``"pandas"``; all other entries are kept in their
    original order.  ``sys.path`` itself is left untouched -- the caller
    decides whether to assign the result back.

    Returns:
        list: the entries of ``sys.path`` that survive the filter.
    """
    return [
        entry for entry in sys.path
        if not ("cvmfs" in entry and "pandas" in entry)
    ]
# Install the filtered search path *before* importing pandas, so the import
# below cannot resolve through an entry that fixPath() removed.
# NOTE(review): presumably this avoids a pandas build shipped on CVMFS --
# confirm against the target environment.
sys.path = fixPath()

import ROOT
import pandas

# Switch ROOT to batch mode (TROOT::SetBatch).
ROOT.gROOT.SetBatch(True)

import dask.multiprocessing
import root_numpy as rnpy
import dask.dataframe as dd
# Input files: tree_1.root ... tree_10.root, all located in the same
# directory, so the list is generated from a single path template.
flist = [
    (
        "/store/user/jpata/tth/VHBBHeppyV21_tthbbV8_May2/"
        "ttHTobb_M125_13TeV_powheg_pythia8/"
        "VHBBHeppyV21_tthbbV8_May2_ttHTobb_M125_13TeV_powheg_Py8__fall15MAv2-pu25ns15v1_76r2as_v12-v1/"
        "160502_210336/0000/tree_{0}.root"
    ).format(tree_index)
    for tree_index in range(1, 11)
]
def read_root(fn, **kwargs):
    """Load the ``tree`` object of one ROOT file into a pandas DataFrame.

    Args:
        fn: storage path of the file; it is appended verbatim to the XRootD
            prefix hard-coded below.
        **kwargs: only ``columns`` is honoured.  When present it is passed
            to ``root_numpy.tree2rec`` as ``branches``.  Any other keyword
            is ignored.

    Returns:
        pandas.DataFrame: built from the record array produced by
        ``tree2rec``.

    Raises:
        IOError: if the file cannot be opened.
    """
    # ``in`` instead of dict.has_key(), which does not exist on Python 3.
    tree2rec_kwargs = {}
    if "columns" in kwargs:
        tree2rec_kwargs["branches"] = kwargs["columns"]

    # Alternative prefixes that were used with this script:
    #   "root://cms-xrd-global.cern.ch/"
    #   "file://"
    pref = "root://storage01.lcg.cscs.ch/pnfs/lcg.cscs.ch/cms/trivcat"
    url = pref + fn

    tf = ROOT.TFile.Open(url)
    # TFile.Open signals failure through a null (falsy) or zombie object
    # rather than an exception, so both cases are checked explicitly.
    if not tf or tf.IsZombie():
        raise IOError("Could not open {0}".format(url))

    try:
        data = rnpy.tree2rec(tf.Get("tree"), **tree2rec_kwargs)
    finally:
        # Release the file handle even when reading the tree fails.
        tf.Close()
    return pandas.DataFrame(data)
import functools
import time
from multiprocessing.pool import ThreadPool


def _report_mean_njets(frame, **compute_kwargs):
    """Print the mean ``numJets`` of rows with ``nBCSVM >= 4``, then the
    wall-clock seconds the computation took.

    ``compute_kwargs`` are forwarded unchanged to ``.compute()``.
    """
    t0 = time.time()
    print(frame[frame.nBCSVM >= 4].numJets.mean().compute(**compute_kwargs))
    print(time.time() - t0)


# Probe every input with a two-branch read and keep only the files that can
# actually be loaded.
good_flist = []
for f in flist:
    try:
        read_root(f, columns=['numJets', 'jets_pt'])
    except Exception as exc:
        # ``Exception`` (not a bare except) so Ctrl-C still aborts the scan;
        # the reason is printed so failures are diagnosable.
        print("Could not read {0}: {1}".format(f, exc))
    else:
        good_flist.append(f)
print(len(good_flist))

name = "mydf"
columns = ['numJets', 'jets_pt', 'jets_eta', 'nBCSVM']

# A dask task tuple only carries positional arguments, so the column
# selection is bound with functools.partial.  Each partition then holds
# exactly the declared columns instead of every branch of the tree.
load_partition = functools.partial(read_root, columns=columns)
dsk = {(name, i): (load_partition, fi) for i, fi in enumerate(good_flist)}

# One more division boundary than partitions.  The partitions come from
# ``good_flist``, so the count must be taken from it and not from ``flist``.
# NOTE(review): these integer boundaries do not describe the real index of
# each partition; consider unknown divisions (all None) instead -- confirm
# against the dask version in use.
divisions = list(range(len(good_flist) + 1))
df = dd.DataFrame(dsk, name, columns, divisions)

print(df.columns)

# First pass: default scheduler settings.
_report_mean_njets(df)

# Second pass: explicit pool of 10 threads.
# NOTE(review): dask.set_options belongs to older dask releases -- confirm
# it exists in the installed version.
pool = ThreadPool(10)
try:
    dask.set_options(pool=pool)
    _report_mean_njets(df, num_workers=10)
finally:
    pool.close()
    pool.join()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment