-
-
Save vkuznet/cce4cb3caf27a96e4f23d937cde4f114 to your computer and use it in GitHub Desktop.
Prototype of DBS blocks function
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import json | |
import time | |
import urllib | |
import resource | |
from WMCore.Services.pycurl_manager import getdata as multi_getdata | |
def getBlockInfo(blockNames, ckey, cert):
    """
    Fetch block information details, file list and number of events, from DBS
    server.

    One DBS ``files`` request is built per block name and the whole list of
    URLs is handed to ``multi_getdata`` in a single call.

    :param blockNames: list of block names
    :param ckey: user key, passed through unchanged to multi_getdata
    :param cert: user certificate, passed through unchanged to multi_getdata
    :return dict: dictionary of {block: {"files": file_list, "nevents": number_events}, ...}
    """
    # nothing to look up: avoid a pointless remote call
    if not blockNames:
        return {}

    # Import the submodule explicitly: a bare "import urllib" does not
    # guarantee that urllib.parse has been loaded.
    from urllib.parse import quote_plus, unquote_plus

    baseUrl = "https://cmsweb-prod.cern.ch/dbs/prod/global/DBSReader/files?detail=true&block_name="
    # need to encode block name properly before placing it in the query string
    urls = [f"{baseUrl}{quote_plus(blk)}" for blk in blockNames]

    # place calls to DBS (multi_getdata is expected to issue them concurrently)
    print(using("before getBlockInfo:multi_getdata"))
    results = multi_getdata(urls, ckey, cert)
    print(using("after getBlockInfo:multi_getdata"))

    # parse output of getdata: every row carries the requested url and the
    # JSON payload returned for it
    blockInfo = {}
    for row in results:
        # the block name is recovered from the url that produced this row
        encodedName = row['url'].split('block_name=')[-1]
        block = unquote_plus(encodedName)
        data = json.loads(row['data'])
        files = [rec['logical_file_name'] for rec in data]
        nevents = sum(rec['event_count'] for rec in data)
        blockInfo[block] = {'files': files, 'nevents': nevents}
    return blockInfo
def using(point=""):
    """
    Report resource usage of the current process.

    :param point: label placed at the beginning of the message
    :return str: message with user time, system time and maximum resident set size in MB
    """
    import sys

    usage = resource.getrusage(resource.RUSAGE_SELF)
    # ru_maxrss is reported in kilobytes on Linux but in bytes on macOS,
    # so the divisor required to obtain megabytes depends on the platform.
    toMB = 1024.0 * 1024.0 if sys.platform == "darwin" else 1024.0
    return '%s: usertime=%s systime=%s mem=%s (MB)' % (
        point, usage.ru_utime, usage.ru_stime, usage.ru_maxrss / toMB)
if __name__ == '__main__':
    # Driver: list the blocks of one dataset, then fetch per-block details,
    # printing resource usage around every step.
    ckey = os.getenv('X509_USER_KEY')
    cert = os.getenv('X509_USER_CERT')
    dataset = "/Neutrino_E-10_gun/RunIISummer20ULPrePremix-UL16_106X_mcRun2_asymptotic_v13-v1/PREMIX"
    url = f"https://cmsweb-prod.cern.ch/dbs/prod/global/DBSReader/blocks?dataset={dataset}"
    print(url)
    print(using("before"))
    results = multi_getdata([url], ckey, cert)
    print(using("after"))
    blocks = []
    for row in results:
        data = json.loads(row['data'])
        # accumulate instead of overwriting so that no row's blocks are lost
        blocks.extend(r['block_name'] for r in data)
    print(f"for {dataset} get {len(blocks)}")
    print(using("after gen iteration"))
    time0 = time.time()
    print(using("before getBlockInfo"))
    results = getBlockInfo(blocks, ckey, cert)
    print(using("after getBlockInfo"))
    print("Elapsed time", time.time()-time0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment