# vkuznet/dbs_blocks.py Secret

## dbs_blocks.py
import os
import json
import time
import urllib
import resource
from WMCore.Services.pycurl_manager import getdata as multi_getdata

def getBlockInfo(blockNames, ckey, cert):
    """
    Fetch block information details, file list and number of events, from DBS
    server.

    One DBS `files?detail=true` URL is built per block name and the whole list
    of URLs is handed to `multi_getdata` in a single call. Resource usage is
    printed right before and right after that call.

    :param blockNames: list of block names
    :param ckey: user key, forwarded unchanged to multi_getdata
    :param cert: user certificate, forwarded unchanged to multi_getdata
    :return dict: dictionary of {block: {"files": file_list, "nevents": number_events}, ...}
    """
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` has been loaded.
    from urllib.parse import quote_plus, unquote_plus

    # block names contain characters such as '/' and '#', so they have to be
    # encoded before being placed into the query string
    urls = [
        f"https://cmsweb-prod.cern.ch/dbs/prod/global/DBSReader/files?detail=true&block_name={quote_plus(blk)}"
        for blk in blockNames
    ]
    # place concurrent calls to DBS
    print(using("before getBlockInfo:multi_getdata"))
    results = multi_getdata(urls, ckey, cert)
    print(using("after getBlockInfo:multi_getdata"))
    # parse output of getdata
    blockInfo = {}
    for row in results:
        # every row carries the URL it was fetched from; the block name is
        # recovered from its `block_name=` query parameter
        blk = row['url'].split('block_name=')[-1]
        block = unquote_plus(blk)
        data = json.loads(row['data'])
        files = [r['logical_file_name'] for r in data]
        nevents = sum(r['event_count'] for r in data)
        blockInfo[block] = {'files': files, 'nevents': nevents}
    return blockInfo

def using(point=""):
    """
    Report resource usage of the current process at a named checkpoint.

    :param point: label placed at the beginning of the report
    :return str: "<point>: usertime=<ru_utime> systime=<ru_stime> mem=<peak RSS> (MB)"
    """
    import sys  # local import: sys is not imported at the top of this file

    usage = resource.getrusage(resource.RUSAGE_SELF)
    # ru_maxrss is reported in kilobytes on Linux but in bytes on macOS,
    # so the divisor needed to obtain megabytes depends on the platform
    if sys.platform == 'darwin':
        memMB = usage.ru_maxrss / 1024.0 / 1024.0
    else:
        memMB = usage.ru_maxrss / 1024.0
    return '''%s: usertime=%s systime=%s mem=%s (MB)''' % (
        point, usage.ru_utime, usage.ru_stime, memMB)

if __name__ == '__main__':
    # Script mode: list the blocks of one hard-coded dataset via the DBS
    # `blocks` API, then fetch per-block details with getBlockInfo, printing
    # resource usage at every stage and the elapsed time of the second step.
    dataset = "/Neutrino_E-10_gun/RunIISummer20ULPrePremix-UL16_106X_mcRun2_asymptotic_v13-v1/PREMIX"
    url = f"https://cmsweb-prod.cern.ch/dbs/prod/global/DBSReader/blocks?dataset={dataset}"
    # credentials are taken from the environment
    ckey, cert = os.getenv('X509_USER_KEY'), os.getenv('X509_USER_CERT')

    print(url)
    print(using("before"))
    results = multi_getdata([url], ckey, cert)
    print(using("after"))

    # a single URL was requested; if several rows come back the last one wins
    blocks = []
    for row in results:
        blocks = [rec['block_name'] for rec in json.loads(row['data'])]
    print(f"for {dataset} get {len(blocks)}")
    print(using("after gen iteration"))

    # time the detailed per-block lookup
    time0 = time.time()
    print(using("before getBlockInfo"))
    results = getBlockInfo(blocks, ckey, cert)
    print(using("after getBlockInfo"))
    elapsed = time.time() - time0
    print("Elapsed time", elapsed)
	import os
	import json
	import time
	import urllib
	import resource
	from WMCore.Services.pycurl_manager import getdata as multi_getdata

	def getBlockInfo(blockNames, ckey, cert):
	    """
	    Fetch block information details, file list and number of events, from DBS
	    server.

	    One DBS `files?detail=true` URL is built per block name and the whole list
	    of URLs is handed to `multi_getdata` in a single call. Resource usage is
	    printed right before and right after that call.

	    :param blockNames: list of block names
	    :param ckey: user key, forwarded unchanged to multi_getdata
	    :param cert: user certificate, forwarded unchanged to multi_getdata
	    :return dict: dictionary of {block: {"files": file_list, "nevents": number_events}, ...}
	    """
	    # Import the submodule explicitly: a bare `import urllib` does not
	    # guarantee that `urllib.parse` has been loaded.
	    from urllib.parse import quote_plus, unquote_plus

	    urls = []
	    for blk in blockNames:
	        # need to encode block name properly
	        block = quote_plus(blk)
	        url = f"https://cmsweb-prod.cern.ch/dbs/prod/global/DBSReader/files?detail=true&block_name={block}"
	        urls.append(url)
	    # place concurrent calls to DBS
	    print(using("before getBlockInfo:multi_getdata"))
	    results = multi_getdata(urls, ckey, cert)
	    print(using("after getBlockInfo:multi_getdata"))
	    # parse output of getdata
	    blockInfo = {}
	    for row in results:
	        # the block name is recovered from the `block_name=` query
	        # parameter of the URL the row was fetched from
	        blk = row['url'].split('block_name=')[-1]
	        block = unquote_plus(blk)
	        data = json.loads(row['data'])
	        files = [r['logical_file_name'] for r in data]
	        nevents = sum(r['event_count'] for r in data)
	        blockInfo[block] = {'files': files, 'nevents': nevents}
	    return blockInfo

	def using(point=""):
	    """
	    Report resource usage of the current process at a named checkpoint.

	    :param point: label placed at the beginning of the report
	    :return str: "<point>: usertime=<ru_utime> systime=<ru_stime> mem=<peak RSS> (MB)"
	    """
	    import sys  # local import: sys is not imported at the top of this file

	    usage = resource.getrusage(resource.RUSAGE_SELF)
	    # ru_maxrss is reported in kilobytes on Linux but in bytes on macOS,
	    # so the divisor needed to obtain megabytes depends on the platform
	    if sys.platform == 'darwin':
	        memMB = usage.ru_maxrss / 1024.0 / 1024.0
	    else:
	        memMB = usage.ru_maxrss / 1024.0
	    return '''%s: usertime=%s systime=%s mem=%s (MB)''' % (
	        point, usage.ru_utime, usage.ru_stime, memMB)

	if __name__ == '__main__':
	    # Script mode: list the blocks of one hard-coded dataset via the DBS
	    # `blocks` API, then fetch per-block details with getBlockInfo, printing
	    # resource usage at every stage and the elapsed time of the second step.
	    ckey = os.getenv('X509_USER_KEY')
	    cert = os.getenv('X509_USER_CERT')
	    dataset = "/Neutrino_E-10_gun/RunIISummer20ULPrePremix-UL16_106X_mcRun2_asymptotic_v13-v1/PREMIX"
	    url = f"https://cmsweb-prod.cern.ch/dbs/prod/global/DBSReader/blocks?dataset={dataset}"
	    print(url)
	    print(using("before"))
	    results = multi_getdata([url], ckey, cert)
	    print(using("after"))
	    blocks = []
	    for row in results:
	        # a single URL was requested; if several rows come back the last one wins
	        data = json.loads(row['data'])
	        blocks = [r['block_name'] for r in data]
	    print(f"for {dataset} get {len(blocks)}")
	    print(using("after gen iteration"))

	    # time the detailed per-block lookup
	    time0 = time.time()
	    print(using("before getBlockInfo"))
	    results = getBlockInfo(blocks, ckey, cert)
	    print(using("after getBlockInfo"))
	    print("Elapsed time", time.time()-time0)