Skip to content

Instantly share code, notes, and snippets.

@roymartinezblanco
Created July 31, 2020 14:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save roymartinezblanco/c1b30439191b32eddd42788cfe8f2a71 to your computer and use it in GitHub Desktop.
Save roymartinezblanco/c1b30439191b32eddd42788cfe8f2a71 to your computer and use it in GitHub Desktop.
Extracts data from a HAR file and cleans it
# Flatten every entry of a parsed HAR capture (`har`) into one row of request,
# CDN-header and timing fields, collapse identical rows with a count, and
# write the result to <directory>Output/output.csv.
#
# Relies on names defined elsewhere in this file: `har`, `findHeader`,
# `FirstParty`, `directory`, plus the `re` and `pd` (pandas) imports.
colmms = [
    'url', 'host', 'host-type', 'method', 'status', 'ext', 'cpcode', 'ttl',
    'server', 'cdn-cache', 'cdn-cache-parent', 'cdn-cache-key', 'cdn-req-id',
    'vary', 'appOrigin', 'content-length', 'content-length-origin',
    'blocked', 'dns', 'ssl', 'connect', 'send', 'ttfb', 'receive',
    'edgeTime', 'originTime',
]

# Loop-invariant patterns, compiled once.
# Host = everything between "://" and the next "/" (or end of string).
_host_pattern = re.compile(r'://([^/]+)')
# Extension = trailing ".<alnum>" of the query-less URL.
_ext_pattern = re.compile(r'(\.[A-Za-z0-9]+$)', re.IGNORECASE)

# Rows are accumulated in a plain list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame
# on every iteration.
rows = []
for r in har['log']['entries']:
    # Drop the query string so identical resources group together.
    u = str(r['request']['url']).split('?')[0]

    # URLs without "://" (e.g. data: URLs) have no host; use the same
    # "None" placeholder the rest of the script uses for missing values.
    host_match = _host_pattern.search(u)
    host = host_match.group(1) if host_match else "None"

    # Fields derived from the x-cache-key header default to "None" and are
    # only filled in when the header is present.
    cpcode = "None"
    ttl = "None"
    cdnCache = "None"
    cdnCacheParent = "None"
    origin = "None"
    cachekey = str(findHeader(r, 'response', 'x-cache-key', 'eq'))
    if cachekey != 'None':
        cdnCache = str(findHeader(r, 'response', 'x-cache', 'eq')).split(' ')[0]
        cdnCacheParent = str(findHeader(r, 'response', 'x-cache-remote', 'eq')).split(' ')[0]
        key_parts = cachekey.split('/')
        # Segments 3, 4 and 5 are read as cpcode, ttl and origin; skip keys
        # that are too short instead of raising IndexError.
        if len(key_parts) > 5:
            cpcode = int(key_parts[3])
            ttl = key_parts[4]
            origin = key_parts[5]

    if any(tld in host for tld in FirstParty):
        hostType = 'First Party'
    else:
        hostType = 'Third Party'

    ext_match = _ext_pattern.search(u)
    ext = "None" if ext_match is None else ext_match.group(0).replace('.', '')

    # str() makes the "missing" check work whether findHeader reports a
    # missing header as None or as the string "None".
    ct = findHeader(r, 'response', 'content-length', 'eq')
    ct = 0 if str(ct) == "None" else int(ct)

    # The pre-optimisation size is read from a different header for
    # jpg/png than for every other extension.
    if ext in ['jpg', 'png']:
        ct_origin = findHeader(r, 'response', 'x-im-original-size', 'eq')
    else:
        ct_origin = findHeader(r, 'response', 'x-akamai-ro-origin-size', 'eq')
    ct_origin = 0 if str(ct_origin) == "None" else int(ct_origin)

    timings = r['timings']
    rows.append({
        'url': u,
        'host': host,
        'host-type': hostType,
        'method': r['request']['method'],
        'status': r['response']['status'],
        'ext': ext,
        'cpcode': cpcode,
        'ttl': ttl,
        'server': str(findHeader(r, 'response', 'server', 'eq')),
        'cdn-cache': cdnCache,
        'cdn-cache-parent': cdnCacheParent,
        'cdn-cache-key': str(findHeader(r, 'response', 'x-true-cache-key', 'eq')),
        'cdn-req-id': str(findHeader(r, 'response', 'x-akamai-request-id', 'eq')),
        'vary': str(findHeader(r, 'response', 'vary', 'eq')),
        'appOrigin': origin,
        'content-length': ct,
        'content-length-origin': ct_origin,
        # blocked/dns/ssl/connect are optional in HAR 1.2; -1 is the
        # spec's "does not apply" value.
        'blocked': timings.get('blocked', -1),
        'dns': timings.get('dns', -1),
        'ssl': timings.get('ssl', -1),
        'connect': timings.get('connect', -1),
        'send': timings['send'],
        'ttfb': timings['wait'],
        'receive': timings['receive'],
        'edgeTime': findHeader(r, 'cdn-timing', 'edge', 'eq'),
        'originTime': findHeader(r, 'cdn-timing', 'origin', 'eq'),
    })

dat_clean = pd.DataFrame(rows, columns=colmms)

# Collapse identical rows and record how often each one occurred.
# NOTE(review): groupby drops rows whose key columns contain NaN/None by
# default - confirm findHeader never returns a real None for
# edgeTime/originTime, otherwise those rows vanish from the output.
dat_clean = dat_clean.groupby(colmms).size().reset_index(name='Count')
dat_clean.to_csv(directory+'Output/output.csv', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment