karenc/parse_varnish_logs.py

## parse_varnish_logs.py
#!/usr/bin/env python3

# https://github.com/openstax/cnx/issues/669
# Oops error when navigating to the Psychology book through the CNX homepage

# Get the varnish files here:
#    scp homedirs1.cnx.org:/home/pumazi/p0{1,2}_905_varn.out .

# Run this script by doing:
#    ./parse_varnish_logs.py p01_905_varn.out p02_905_varn.out

# example request:
#
# *   << BeReq    >> 4292886
# -   Begin          bereq 4292885 fetch
# -   Timestamp      Start: 1567685158.431862 0.000000 0.000000
# -   BereqMethod    GET
# -   BereqURL       /extras/8d04a686-d5e8-4798-a27d-c608e4d0e187@26.19:72169e9d-cba1-4d43-923c-41ae4d01b3cc@11
# -   BereqProtocol  HTTP/1.1
# -   BereqHeader    Host: archive.cnx.org
# -   BereqHeader    Accept: application/json, text/javascript, */*; q=0.01
# -   BereqHeader    User-Agent: Mozilla/5.0 (Linux; Android 8.1.0; SM-T580) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36
# -   BereqHeader    Sec-Fetch-Mode: cors
# -   BereqHeader    Origin: https://cnx.org
# -   BereqHeader    Sec-Fetch-Site: same-site
# -   BereqHeader    Referer: https://cnx.org/contents/jQSmhtXo@26.19:chaencuh@11/2-1-Displacement
# -   BereqHeader    Accept-Language: en-US,en;q=0.9,lt;q=0.8
# -   BereqHeader    X-Secure: true
# -   BereqHeader    X-Forwarded-Proto: https
# -   BereqHeader    X-Forwarded-For: 69.67.85.194, 128.42.169.27
# -   BereqHeader    Accept-Encoding: gzip
# -   BereqHeader    If-None-Match: "hIazt05ahVXndnutCKlE9Q"
# -   BereqHeader    X-Varnish: 4292886
# -   VCL_call       BACKEND_FETCH
# -   VCL_return     fetch
# -   BackendOpen    52 15f4883d-5d93-4101-b6b7-25ad0022d42e.prod08_archive0 128.42.169.78 6500 128.42.169.89 56142
# -   Timestamp      Bereq: 1567685158.434152 0.002290 0.002290
# -   Timestamp      Beresp: 1567685159.549903 1.118041 1.115751
# -   BerespProtocol HTTP/1.1
# -   BerespStatus   500
# -   BerespReason   Internal Server Error
# -   BerespHeader   Content-Length: 110
# -   BerespHeader   Content-Type: text/plain
# -   BerespHeader   Date: Thu, 05 Sep 2019 11:38:35 GMT
# -   BerespHeader   Server: waitress
# -   TTL            RFC -1 10 -1 1567685160 1567685160 1567683515 0 0
# -   VCL_call       BACKEND_RESPONSE
# -   TTL            VCL 0 10 0 1567685160
# -   BerespHeader   X-Varnish-Status: uncacheable - status code >= 500
# -   BerespHeader   X-Varnish-Backend: prod08_archive0
# -   BerespHeader   X-Varnish-Ttl: 0.000
# -   VCL_return     deliver
# -   Storage        malloc Transient
# -   ObjProtocol    HTTP/1.1
# -   ObjStatus      500
# -   ObjReason      Internal Server Error
# -   ObjHeader      Content-Length: 110
# -   ObjHeader      Content-Type: text/plain
# -   ObjHeader      Date: Thu, 05 Sep 2019 11:38:35 GMT
# -   ObjHeader      Server: waitress
# -   ObjHeader      X-Varnish-Status: uncacheable - status code >= 500
# -   ObjHeader      X-Varnish-Backend: prod08_archive0
# -   ObjHeader      X-Varnish-Ttl: 0.000
# -   Fetch_Body     3 length stream
# -   BackendReuse   52 15f4883d-5d93-4101-b6b7-25ad0022d42e.prod08_archive0
# -   Timestamp      BerespBody: 1567685159.550290 1.118428 0.000387
# -   Length         110
# -   BereqAcct      684 0 684 140 110 250
# -   End


import json
import re
import sys


class Request:
    """One varnishlog transaction parsed into a dictionary of fields.

    Attributes:
        bereq: text following the last ``>>`` on the first line of the
            transaction (the transaction id in the example log above).
        fields: maps each tag (``BereqURL``, ``Timestamp``, ...) to its value:

            * tags whose name contains ``Header`` map to a dict holding every
              header under both its original and its lower-cased name (a
              repeated header name overwrites the earlier value);
            * other tags that occur more than once map to a list of values in
              order of appearance;
            * all remaining tags map to a single string.

    Every tag is also readable as an attribute, e.g. ``req.BerespStatus``.
    """

    def __init__(self, lines):
        """Parse the lines of a single transaction.

        Args:
            lines: list of strings; the first is the ``* << BeReq >> <id>``
                line and the last is skipped unconditionally (it is the
                ``- End`` line in the example log).

        Raises:
            IndexError: if ``lines`` is empty.
        """
        self.bereq = lines[0].split('>>')[-1].strip()
        self.fields = {}
        for line in lines[1:-1]:
            # "<marker> <tag> <value...>"; the value may itself contain spaces.
            parts = re.split(r'\s+', line.strip(), maxsplit=2)
            if len(parts) < 2:
                # Blank line or a lone marker: there is no tag to record.
                continue
            header = parts[1]
            # A tag may legitimately carry no value; keep it as ''.
            value = parts[2] if len(parts) > 2 else ''
            if 'Header' in header:
                # partition() tolerates a header line that has no colon.
                k, _, v = value.partition(':')
                v = v.strip()
                value = {k: v, k.lower(): v}
            if header not in self.fields:
                self.fields[header] = value
            elif isinstance(self.fields[header], dict):
                self.fields[header].update(value)
            elif isinstance(self.fields[header], list):
                self.fields[header].append(value)
            else:
                # Second occurrence of a plain tag: promote it to a list.
                self.fields[header] = [self.fields[header], value]

    def __getattr__(self, attr):
        """Expose parsed tags as attributes.

        Raises:
            AttributeError: if ``attr`` is not a parsed tag.
        """
        # __getattr__ only runs after normal lookup failed.  Fetch ``fields``
        # through __dict__ so that an instance which has no ``fields`` yet
        # (bare __new__, copy, unpickle) cannot recurse back into this method.
        fields = self.__dict__.get('fields', {})
        try:
            return fields[attr]
        except KeyError:
            raise AttributeError(
                f'{type(self).__name__!r} object has no attribute {attr!r}'
            ) from None

    def __str__(self):
        """Return a one-line summary; absent tags are rendered as ``None``."""
        url = self.fields.get('BereqURL')
        status = self.fields.get('BerespStatus')
        return f'<Request BeReq={self.bereq} BereqURL={url} BerespStatus={status}>'


def split_batches(lines):
    """Group log lines into one list of stripped lines per transaction.

    Transactions are separated by blank (whitespace-only) lines.  Any number
    of consecutive blank lines counts as a single separator, leading blank
    lines are ignored, and a final transaction that is not followed by a
    blank line is still returned.

    Args:
        lines: iterable of strings, e.g. an open text file.

    Returns:
        A list of batches; each batch is a non-empty list of stripped lines.
    """
    batches = []
    batch = []
    for line in lines:
        stripped = line.strip()
        if stripped:
            batch.append(stripped)
        elif batch:
            batches.append(batch)
            batch = []
    if batch:
        # The input ended without a trailing blank line.
        batches.append(batch)
    return batches


if __name__ == '__main__':
    requests = []
    for filename in sys.argv[1:]:
        # errors='replace': header values may hold bytes that are not valid in
        # the default encoding; keep parsing instead of aborting the run.
        with open(filename, errors='replace') as f:
            requests.extend(Request(batch) for batch in split_batches(f))

    for req in requests:
        # For header fields like BereqHeader, ObjHeader, BerespHeader, you can do:
        #     req.BereqHeader['user-agent'] (<- lower case header for consistency)
        # For fields that appear multiple times like Timestamp
        #     req.Timestamp[0]
        # For other fields, just the field name should work
        #     req.BerespStatus
        # getattr with a default: a transaction without a BerespStatus line
        # must not abort the run before the JSON dump below is written.
        print(getattr(req, 'BerespStatus', None))

    with open('prod_varnish_requests.json', 'w') as f:
        json.dump([r.fields for r in requests], f)
	#!/usr/bin/env python3

	# https://github.com/openstax/cnx/issues/669
	# Oops error when navigating to the Psychology book through the CNX homepage

	# Get the varnish files here:
	# scp homedirs1.cnx.org:/home/pumazi/p0{1,2}_905_varn.out .

	# Run this script by doing:
	# ./parse_varnish_logs.py p01_905_varn.out p02_905_varn.out

	# example request:
	#
	# * << BeReq >> 4292886
	# - Begin bereq 4292885 fetch
	# - Timestamp Start: 1567685158.431862 0.000000 0.000000
	# - BereqMethod GET
	# - BereqURL /extras/8d04a686-d5e8-4798-a27d-c608e4d0e187@26.19:72169e9d-cba1-4d43-923c-41ae4d01b3cc@11
	# - BereqProtocol HTTP/1.1
	# - BereqHeader Host: archive.cnx.org
	# - BereqHeader Accept: application/json, text/javascript, */*; q=0.01
	# - BereqHeader User-Agent: Mozilla/5.0 (Linux; Android 8.1.0; SM-T580) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36
	# - BereqHeader Sec-Fetch-Mode: cors
	# - BereqHeader Origin: https://cnx.org
	# - BereqHeader Sec-Fetch-Site: same-site
	# - BereqHeader Referer: https://cnx.org/contents/jQSmhtXo@26.19:chaencuh@11/2-1-Displacement
	# - BereqHeader Accept-Language: en-US,en;q=0.9,lt;q=0.8
	# - BereqHeader X-Secure: true
	# - BereqHeader X-Forwarded-Proto: https
	# - BereqHeader X-Forwarded-For: 69.67.85.194, 128.42.169.27
	# - BereqHeader Accept-Encoding: gzip
	# - BereqHeader If-None-Match: "hIazt05ahVXndnutCKlE9Q"
	# - BereqHeader X-Varnish: 4292886
	# - VCL_call BACKEND_FETCH
	# - VCL_return fetch
	# - BackendOpen 52 15f4883d-5d93-4101-b6b7-25ad0022d42e.prod08_archive0 128.42.169.78 6500 128.42.169.89 56142
	# - Timestamp Bereq: 1567685158.434152 0.002290 0.002290
	# - Timestamp Beresp: 1567685159.549903 1.118041 1.115751
	# - BerespProtocol HTTP/1.1
	# - BerespStatus 500
	# - BerespReason Internal Server Error
	# - BerespHeader Content-Length: 110
	# - BerespHeader Content-Type: text/plain
	# - BerespHeader Date: Thu, 05 Sep 2019 11:38:35 GMT
	# - BerespHeader Server: waitress
	# - TTL RFC -1 10 -1 1567685160 1567685160 1567683515 0 0
	# - VCL_call BACKEND_RESPONSE
	# - TTL VCL 0 10 0 1567685160
	# - BerespHeader X-Varnish-Status: uncacheable - status code >= 500
	# - BerespHeader X-Varnish-Backend: prod08_archive0
	# - BerespHeader X-Varnish-Ttl: 0.000
	# - VCL_return deliver
	# - Storage malloc Transient
	# - ObjProtocol HTTP/1.1
	# - ObjStatus 500
	# - ObjReason Internal Server Error
	# - ObjHeader Content-Length: 110
	# - ObjHeader Content-Type: text/plain
	# - ObjHeader Date: Thu, 05 Sep 2019 11:38:35 GMT
	# - ObjHeader Server: waitress
	# - ObjHeader X-Varnish-Status: uncacheable - status code >= 500
	# - ObjHeader X-Varnish-Backend: prod08_archive0
	# - ObjHeader X-Varnish-Ttl: 0.000
	# - Fetch_Body 3 length stream
	# - BackendReuse 52 15f4883d-5d93-4101-b6b7-25ad0022d42e.prod08_archive0
	# - Timestamp BerespBody: 1567685159.550290 1.118428 0.000387
	# - Length 110
	# - BereqAcct 684 0 684 140 110 250
	# - End


	import json
	import re
	import sys


	class Request:
		"""One varnishlog transaction parsed into a dictionary of fields.

		Attributes:
			bereq: text following the last ``>>`` on the first line of the
				transaction (the transaction id in the example log above).
			fields: maps each tag (``BereqURL``, ``Timestamp``, ...) to its value:

				* tags whose name contains ``Header`` map to a dict holding every
				  header under both its original and its lower-cased name (a
				  repeated header name overwrites the earlier value);
				* other tags that occur more than once map to a list of values in
				  order of appearance;
				* all remaining tags map to a single string.

		Every tag is also readable as an attribute, e.g. ``req.BerespStatus``.
		"""

		def __init__(self, lines):
			"""Parse the lines of a single transaction.

			Args:
				lines: list of strings; the first is the ``* << BeReq >> <id>``
					line and the last is skipped unconditionally (it is the
					``- End`` line in the example log).

			Raises:
				IndexError: if ``lines`` is empty.
			"""
			self.bereq = lines[0].split('>>')[-1].strip()
			self.fields = {}
			for line in lines[1:-1]:
				# "<marker> <tag> <value...>"; the value may itself contain spaces.
				parts = re.split(r'\s+', line.strip(), maxsplit=2)
				if len(parts) < 2:
					# Blank line or a lone marker: there is no tag to record.
					continue
				header = parts[1]
				# A tag may legitimately carry no value; keep it as ''.
				value = parts[2] if len(parts) > 2 else ''
				if 'Header' in header:
					# partition() tolerates a header line that has no colon.
					k, _, v = value.partition(':')
					v = v.strip()
					value = {k: v, k.lower(): v}
				if header not in self.fields:
					self.fields[header] = value
				elif isinstance(self.fields[header], dict):
					self.fields[header].update(value)
				elif isinstance(self.fields[header], list):
					self.fields[header].append(value)
				else:
					# Second occurrence of a plain tag: promote it to a list.
					self.fields[header] = [self.fields[header], value]

		def __getattr__(self, attr):
			"""Expose parsed tags as attributes.

			Raises:
				AttributeError: if ``attr`` is not a parsed tag.
			"""
			# __getattr__ only runs after normal lookup failed.  Fetch ``fields``
			# through __dict__ so that an instance which has no ``fields`` yet
			# (bare __new__, copy, unpickle) cannot recurse back into this method.
			fields = self.__dict__.get('fields', {})
			try:
				return fields[attr]
			except KeyError:
				raise AttributeError(
					f'{type(self).__name__!r} object has no attribute {attr!r}'
				) from None

		def __str__(self):
			"""Return a one-line summary; absent tags are rendered as ``None``."""
			url = self.fields.get('BereqURL')
			status = self.fields.get('BerespStatus')
			return f'<Request BeReq={self.bereq} BereqURL={url} BerespStatus={status}>'


	def split_batches(lines):
		"""Group log lines into one list of stripped lines per transaction.

		Transactions are separated by blank (whitespace-only) lines.  Any number
		of consecutive blank lines counts as a single separator, leading blank
		lines are ignored, and a final transaction that is not followed by a
		blank line is still returned.

		Args:
			lines: iterable of strings, e.g. an open text file.

		Returns:
			A list of batches; each batch is a non-empty list of stripped lines.
		"""
		batches = []
		batch = []
		for line in lines:
			stripped = line.strip()
			if stripped:
				batch.append(stripped)
			elif batch:
				batches.append(batch)
				batch = []
		if batch:
			# The input ended without a trailing blank line.
			batches.append(batch)
		return batches


	if __name__ == '__main__':
		requests = []
		for filename in sys.argv[1:]:
			# errors='replace': header values may hold bytes that are not valid in
			# the default encoding; keep parsing instead of aborting the run.
			with open(filename, errors='replace') as f:
				requests.extend(Request(batch) for batch in split_batches(f))

		for req in requests:
			# For header fields like BereqHeader, ObjHeader, BerespHeader, you can do:
			#     req.BereqHeader['user-agent'] (<- lower case header for consistency)
			# For fields that appear multiple times like Timestamp
			#     req.Timestamp[0]
			# For other fields, just the field name should work
			#     req.BerespStatus
			# getattr with a default: a transaction without a BerespStatus line
			# must not abort the run before the JSON dump below is written.
			print(getattr(req, 'BerespStatus', None))

		with open('prod_varnish_requests.json', 'w') as f:
			json.dump([r.fields for r in requests], f)