Created
September 9, 2019 20:01
-
-
Save karenc/8f335da24607c84ec7ac50af03397ca8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# https://github.com/openstax/cnx/issues/669
# Oops error when navigating to the Psychology book through the CNX homepage
# Get the varnish files here:
#   scp homedirs1.cnx.org:/home/pumazi/p0{1,2}_905_varn.out .
# Run this script by doing:
#   ./parse_varnish_logs.py p01_905_varn.out p02_905_varn.out
# Example request:
#
# * << BeReq >> 4292886 | |
# - Begin bereq 4292885 fetch | |
# - Timestamp Start: 1567685158.431862 0.000000 0.000000 | |
# - BereqMethod GET | |
# - BereqURL /extras/8d04a686-d5e8-4798-a27d-c608e4d0e187@26.19:72169e9d-cba1-4d43-923c-41ae4d01b3cc@11 | |
# - BereqProtocol HTTP/1.1 | |
# - BereqHeader Host: archive.cnx.org | |
# - BereqHeader Accept: application/json, text/javascript, */*; q=0.01 | |
# - BereqHeader User-Agent: Mozilla/5.0 (Linux; Android 8.1.0; SM-T580) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36 | |
# - BereqHeader Sec-Fetch-Mode: cors | |
# - BereqHeader Origin: https://cnx.org | |
# - BereqHeader Sec-Fetch-Site: same-site | |
# - BereqHeader Referer: https://cnx.org/contents/jQSmhtXo@26.19:chaencuh@11/2-1-Displacement | |
# - BereqHeader Accept-Language: en-US,en;q=0.9,lt;q=0.8 | |
# - BereqHeader X-Secure: true | |
# - BereqHeader X-Forwarded-Proto: https | |
# - BereqHeader X-Forwarded-For: 69.67.85.194, 128.42.169.27 | |
# - BereqHeader Accept-Encoding: gzip | |
# - BereqHeader If-None-Match: "hIazt05ahVXndnutCKlE9Q" | |
# - BereqHeader X-Varnish: 4292886 | |
# - VCL_call BACKEND_FETCH | |
# - VCL_return fetch | |
# - BackendOpen 52 15f4883d-5d93-4101-b6b7-25ad0022d42e.prod08_archive0 128.42.169.78 6500 128.42.169.89 56142 | |
# - Timestamp Bereq: 1567685158.434152 0.002290 0.002290 | |
# - Timestamp Beresp: 1567685159.549903 1.118041 1.115751 | |
# - BerespProtocol HTTP/1.1 | |
# - BerespStatus 500 | |
# - BerespReason Internal Server Error | |
# - BerespHeader Content-Length: 110 | |
# - BerespHeader Content-Type: text/plain | |
# - BerespHeader Date: Thu, 05 Sep 2019 11:38:35 GMT | |
# - BerespHeader Server: waitress | |
# - TTL RFC -1 10 -1 1567685160 1567685160 1567683515 0 0 | |
# - VCL_call BACKEND_RESPONSE | |
# - TTL VCL 0 10 0 1567685160 | |
# - BerespHeader X-Varnish-Status: uncacheable - status code >= 500 | |
# - BerespHeader X-Varnish-Backend: prod08_archive0 | |
# - BerespHeader X-Varnish-Ttl: 0.000 | |
# - VCL_return deliver | |
# - Storage malloc Transient | |
# - ObjProtocol HTTP/1.1 | |
# - ObjStatus 500 | |
# - ObjReason Internal Server Error | |
# - ObjHeader Content-Length: 110 | |
# - ObjHeader Content-Type: text/plain | |
# - ObjHeader Date: Thu, 05 Sep 2019 11:38:35 GMT | |
# - ObjHeader Server: waitress | |
# - ObjHeader X-Varnish-Status: uncacheable - status code >= 500 | |
# - ObjHeader X-Varnish-Backend: prod08_archive0 | |
# - ObjHeader X-Varnish-Ttl: 0.000 | |
# - Fetch_Body 3 length stream | |
# - BackendReuse 52 15f4883d-5d93-4101-b6b7-25ad0022d42e.prod08_archive0 | |
# - Timestamp BerespBody: 1567685159.550290 1.118428 0.000387 | |
# - Length 110 | |
# - BereqAcct 684 0 684 140 110 250 | |
# - End | |
import json
import re
import sys
class Request:
    """One varnishlog record parsed into a mapping of tag -> value(s).

    ``lines`` is the list of text lines making up a single record: the
    first line is the ``* << BeReq >> <id>`` marker, the last line is
    dropped (in the sample log it is ``- End``), and every line in between
    is read as ``<prefix> <tag> <value>``.

    The parsed result lives in ``self.fields``:

    * tags whose name contains ``Header`` (``BereqHeader``, ``ObjHeader``,
      ``BerespHeader`` ...) map to a dict holding each header under both
      its original and its lower-cased name;
    * any other tag seen once maps to its value string;
    * any other tag seen several times (e.g. ``Timestamp``) maps to a list
      of value strings in the order they appeared.

    Fields can also be read as attributes, e.g. ``req.BerespStatus``.
    """

    def __init__(self, lines):
        # The record id is whatever follows the last '>>' on the marker line.
        self.bereq = lines[0].split('>>')[-1].strip()
        self.fields = {}
        for line in lines[1:-1]:
            # Split into at most: prefix, tag, rest-of-line.  str.split with
            # no separator also tolerates leading whitespace on the line.
            parts = line.split(None, 2)
            if len(parts) < 2:
                # No tag on this line (blank or just the prefix): nothing
                # to record.
                continue
            header = parts[1]
            # A tag with no value is recorded as an empty string instead of
            # raising on unpacking.
            value = parts[2] if len(parts) > 2 else ''
            if 'Header' in header:
                # partition() copes with a header line that has no colon;
                # the header value is then the empty string.
                k, _, v = value.partition(':')
                v = v.strip()
                value = {k: v, k.lower(): v}
            if header not in self.fields:
                self.fields[header] = value
            elif isinstance(self.fields[header], dict):
                self.fields[header].update(value)
            elif isinstance(self.fields[header], list):
                self.fields[header].append(value)
            else:
                # Second occurrence of a plain tag: promote it to a list.
                self.fields[header] = [self.fields[header], value]

    def __getattr__(self, attr):
        """Expose parsed fields as attributes.

        Only called when normal attribute lookup fails.  ``fields`` is read
        through ``__dict__`` so that an instance without it (created via
        ``__new__``, ``copy`` or ``pickle``) raises AttributeError instead
        of recursing forever.
        """
        fields = self.__dict__.get('fields', {})
        try:
            return fields[attr]
        except KeyError:
            raise AttributeError(
                f'{type(self).__name__!r} object has no attribute {attr!r}'
            ) from None

    def __str__(self):
        # .get() keeps str() usable for records missing either field
        # (shown as None).
        url = self.fields.get('BereqURL')
        status = self.fields.get('BerespStatus')
        return f'<Request BeReq={self.bereq} BereqURL={url} BerespStatus={status}>'
def split_batches(lines):
    """Yield lists of stripped, non-blank lines, one list per record.

    Records in the input are separated by one or more blank
    (whitespace-only) lines.  Leading and repeated blank lines are
    ignored, and a final record that is not followed by a blank line is
    still yielded.

    ``lines`` may be any iterable of strings, including an open text file.
    """
    batch = []
    for line in lines:
        stripped = line.strip()
        if stripped:
            batch.append(stripped)
        elif batch:
            # A blank line closes the record collected so far.
            yield batch
            batch = []
    if batch:
        # End of input without a trailing blank line: flush the last record.
        yield batch


if __name__ == '__main__':
    requests = []
    for filename in sys.argv[1:]:
        with open(filename) as f:
            # Iterate the file lazily rather than loading it with readlines().
            requests.extend(Request(batch) for batch in split_batches(f))
    for req in requests:
        # For header fields like BereqHeader, ObjHeader, BerespHeader, you can do:
        #     req.BereqHeader['user-agent']  (<- lower case header for consistency)
        # For fields that appear multiple times like Timestamp
        #     req.Timestamp[0]
        # For other fields, just the field name should work
        #     req.BerespStatus
        # getattr with a default so that one record without a backend
        # response status prints None instead of aborting the whole run.
        print(getattr(req, 'BerespStatus', None))
    with open('prod_varnish_requests.json', 'w') as f:
        json.dump([r.fields for r in requests], f)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment