Skip to content

Instantly share code, notes, and snippets.

@vkuznet
Last active February 27, 2024 17:51
Show Gist options
  • Save vkuznet/cce4cb3caf27a96e4f23d937cde4f114 to your computer and use it in GitHub Desktop.
Save vkuznet/cce4cb3caf27a96e4f23d937cde4f114 to your computer and use it in GitHub Desktop.
Prototype of the DBS blocks function
import json
import os
import resource
import sys
import time
import urllib
import urllib.parse

from WMCore.Services.pycurl_manager import getdata as multi_getdata
def getBlockInfo(blockNames, ckey, cert):
    """
    Fetch block information details, file list and number of events, from DBS
    server.

    One DBS ``files`` request is built per block and all of them are handed to
    ``multi_getdata`` in a single call.

    :param blockNames: list of block names
    :param ckey: user key, passed through unchanged to ``multi_getdata``
        (the script entry point reads it from ``X509_USER_KEY``)
    :param cert: user certificate, passed through unchanged to ``multi_getdata``
        (the script entry point reads it from ``X509_USER_CERT``)
    :return dict: dictionary of {block: {"files": file_list, "nevents": number_events}, ...}
    """
    baseUrl = "https://cmsweb-prod.cern.ch/dbs/prod/global/DBSReader/files"
    # block names must be URL-encoded before they are placed in the query string
    urls = [
        f"{baseUrl}?detail=true&block_name={urllib.parse.quote_plus(blk)}"
        for blk in blockNames
    ]

    # place concurrent calls to DBS and report resource usage around them
    print(using("before getBlockInfo:multi_getdata"))
    results = multi_getdata(urls, ckey, cert)
    print(using("after getBlockInfo:multi_getdata"))

    blockInfo = {}
    for row in results:
        # the block name is recovered from the request URL carried in each row,
        # so results can be matched to blocks regardless of arrival order
        encodedName = row['url'].split('block_name=')[-1]
        block = urllib.parse.unquote_plus(encodedName)
        data = json.loads(row['data'])
        blockInfo[block] = {
            'files': [r['logical_file_name'] for r in data],
            'nevents': sum(r['event_count'] for r in data),
        }
    return blockInfo
def using(point=""):
    """
    Return a one-line summary of the resource usage of the current process.

    :param point: label placed at the start of the line
    :return str: "<point>: usertime=<sec> systime=<sec> mem=<peak RSS> (MB)"
    """
    usage = resource.getrusage(resource.RUSAGE_SELF)
    # ru_maxrss is reported in kilobytes on Linux but in bytes on macOS, so the
    # divisor needed to obtain megabytes depends on the platform
    divisor = 1024.0 * 1024.0 if sys.platform == 'darwin' else 1024.0
    memMB = usage.ru_maxrss / divisor
    return f"{point}: usertime={usage.ru_utime} systime={usage.ru_stime} mem={memMB} (MB)"
if __name__ == '__main__':
    # credentials are taken from the environment
    ckey = os.getenv('X509_USER_KEY')
    cert = os.getenv('X509_USER_CERT')

    # step 1: ask DBS for the blocks of a single dataset
    dataset = "/Neutrino_E-10_gun/RunIISummer20ULPrePremix-UL16_106X_mcRun2_asymptotic_v13-v1/PREMIX"
    blocksUrl = "https://cmsweb-prod.cern.ch/dbs/prod/global/DBSReader/blocks?dataset=" + dataset
    print(blocksUrl)
    print(using("before"))
    response = multi_getdata([blocksUrl], ckey, cert)
    print(using("after"))

    # each reply carries a JSON list of records; keep only the block names
    blocks = []
    for reply in response:
        blocks = [entry['block_name'] for entry in json.loads(reply['data'])]
    print(f"for {dataset} get {len(blocks)}")
    print(using("after gen iteration"))

    # step 2: fetch per-block details and time the whole operation
    started = time.time()
    print(using("before getBlockInfo"))
    blockDetails = getBlockInfo(blocks, ckey, cert)
    print(using("after getBlockInfo"))
    elapsed = time.time() - started
    print("Elapsed time", elapsed)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment