Created
July 31, 2020 14:27
-
-
Save roymartinezblanco/c1b30439191b32eddd42788cfe8f2a71 to your computer and use it in GitHub Desktop.
Extracts Data and cleans it from HAR
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Flatten every entry of the loaded HAR (`har`) into one row of request,
# CDN-header and timing fields, collapse identical rows into a single row
# with a 'Count' column, and write the result to <directory>Output/output.csv.

# Column order of the cleaned output; also used as the group-by key below.
colmms = [
    'url', 'host', 'host-type', 'method', 'status', 'ext', 'cpcode', 'ttl',
    'server', 'cdn-cache', 'cdn-cache-parent', 'cdn-cache-key', 'cdn-req-id',
    'vary', 'appOrigin', 'content-length', 'content-length-origin',
    'blocked', 'dns', 'ssl', 'connect', 'send', 'ttfb', 'receive',
    'edgeTime', 'originTime',
]

# Compiled once instead of on every loop iteration.
# Host = everything between "://" and the next "/" (or the end of the URL).
_HOST_RE = re.compile(r'://([^/]+)')
# Extension = trailing ".alnum" at the very end of the query-less URL.
_EXT_RE = re.compile(r'(\.[A-Za-z0-9]+$)', re.IGNORECASE)


def _is_missing(value):
    """Return True if a header lookup yielded no value (None or the "None" placeholder)."""
    return value is None or value == "None"


def _header_as_int(entry, name):
    """Return response header *name* of *entry* as an int, or 0 when it is absent."""
    value = findHeader(entry, 'response', name, 'eq')
    return 0 if _is_missing(value) else int(value)


# Rows are collected in a plain list and turned into a DataFrame once;
# DataFrame.append copied the whole frame per call and no longer exists in
# pandas >= 2.0.
rows = []
for r in har['log']['entries']:
    # URL without its query string.
    u = str(r['request']['url']).split('?')[0]

    # URLs without a "://host" part (e.g. data: URIs) get the same "None"
    # placeholder used for other missing fields instead of raising.
    # An explicit port is kept as "host:port".
    host_match = _HOST_RE.search(u)
    host = host_match.group(1) if host_match else "None"

    # Fields derived from the x-cache-key header; "None" when unavailable.
    cpcode = ttl = cdnCache = cdnCacheParent = origin = "None"
    cachekey = str(findHeader(r, 'response', 'x-cache-key', 'eq'))
    if cachekey != 'None':
        key_parts = cachekey.split('/')
        # Only the first space-separated token of each cache header is kept.
        cdnCache = str(findHeader(r, 'response', 'x-cache', 'eq')).split(' ')[0]
        cdnCacheParent = str(findHeader(r, 'response', 'x-cache-remote', 'eq')).split(' ')[0]
        try:
            # Segments 3, 4 and 5 of the "/"-split key are read as
            # cpcode, ttl and origin respectively.
            cpcode, ttl, origin = int(key_parts[3]), key_parts[4], key_parts[5]
        except (IndexError, ValueError):
            # Key is shorter than expected or its cpcode segment is not an
            # integer: keep the "None" placeholders for this entry rather
            # than aborting the whole export.
            cpcode = ttl = origin = "None"

    ext_match = _EXT_RE.search(u)
    ext = "None" if ext_match is None else ext_match.group(0).replace('.', '')

    # A host is first party when any FirstParty item is a substring of it.
    if any(tld in host for tld in FirstParty):
        hostType = 'First Party'
    else:
        hostType = 'Third Party'

    ct = _header_as_int(r, 'content-length')
    # jpg/png responses take their origin size from x-im-original-size,
    # everything else from x-akamai-ro-origin-size.
    if ext in ['jpg', 'png']:
        ct_origin = _header_as_int(r, 'x-im-original-size')
    else:
        ct_origin = _header_as_int(r, 'x-akamai-ro-origin-size')

    timings = r['timings']
    rows.append({
        'url': u,
        'host': host,
        'host-type': hostType,
        'method': r['request']['method'],
        'status': r['response']['status'],
        'ext': ext,
        'cpcode': cpcode,
        'ttl': ttl,
        'server': str(findHeader(r, 'response', 'server', 'eq')),
        'cdn-cache': cdnCache,
        'cdn-cache-parent': cdnCacheParent,
        'cdn-cache-key': str(findHeader(r, 'response', 'x-true-cache-key', 'eq')),
        'cdn-req-id': str(findHeader(r, 'response', 'x-akamai-request-id', 'eq')),
        'vary': str(findHeader(r, 'response', 'vary', 'eq')),
        'appOrigin': origin,
        'content-length': ct,
        'content-length-origin': ct_origin,
        # blocked/dns/ssl/connect are optional in HAR 1.2, which uses -1 for
        # "does not apply"; default to that instead of raising KeyError.
        'blocked': timings.get('blocked', -1),
        'dns': timings.get('dns', -1),
        'ssl': timings.get('ssl', -1),
        'connect': timings.get('connect', -1),
        'send': timings['send'],
        'ttfb': timings['wait'],
        'receive': timings['receive'],
        'edgeTime': findHeader(r, 'cdn-timing', 'edge', 'eq'),
        'originTime': findHeader(r, 'cdn-timing', 'origin', 'eq'),
    })

dat_clean = pd.DataFrame(rows, columns=colmms)
# Collapse identical rows, recording how many times each occurred.
dat_clean = dat_clean.groupby(colmms).size().reset_index(name='Count')
dat_clean.to_csv(directory+'Output/output.csv', index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment