michael-lazar/parse_log.py

## parse_log.py
#!/usr/bin/env python3
"""
Write a script that, given a web server log file, returns the 10 most
frequently requested objects and their cumulative bytes transferred.
Only include GET requests with Successful (HTTP 2xx) responses.
Resolve ties however you’d like.

Log format:
- request date, time, and time zone
- request line from the client
- HTTP status code returned to the client
- size (in bytes) of the returned object

Given this input data:
```
[01/Aug/1995:00:54:59 -0400] "GET /images/opf-logo.gif HTTP/1.0" 200 32511
[01/Aug/1995:00:55:04 -0400] "GET /images/ksclogosmall.gif HTTP/1.0" 200 3635
[01/Aug/1995:00:55:06 -0400] "GET /images/ksclogosmall.gif HTTP/1.0" 403 298
[01/Aug/1995:00:55:09 -0400] "GET /images/ksclogosmall.gif HTTP/1.0" 200 3635
[01/Aug/1995:00:55:18 -0400] "GET /images/opf-logo.gif HTTP/1.0" 200 32511
[01/Aug/1995:00:56:52 -0400] "GET /images/ksclogosmall.gif HTTP/1.0" 200 3635
```

The result should be:
```
/images/ksclogosmall.gif 10905
/images/opf-logo.gif 65022
```
"""
import re
import sys
from collections import Counter


__author__ = 'Michael Lazar'
__email__ = 'lazar.michael22@gmail.com'
__license__ = 'The MIT License (MIT)'


# This regex is borrowed from another project that I recently worked on
# https://github.com/michael-lazar/Akita/blob/master/akita/parser.py
#
# Each entry matches one whitespace-separated field of a log line and exposes
# it through a named group (time, request, status, size).
_LOG_PARTS = [
    r'\[(?P<time>.+)\]',  # time %t
    r'\"(?P<request>.*)"',  # request "%r"
    r'(?P<status>[0-9]+)',  # status %>s
    r'(?P<size>\S+)',  # size %b (careful, can be '-')
]
# The field separator has to be a raw string: '\s' inside a plain literal is
# an invalid escape sequence that recent Python versions warn about.
# Trailing whitespace (e.g. the newline) is tolerated before end of input.
LOG_PATTERN = re.compile(r'\s+'.join(_LOG_PARTS) + r'\s*\Z')


def main(logfile):
    """
    Print the 10 most frequently requested paths and their cumulative bytes.

    Only successful (status starting with '2') GET requests are counted.
    Lines that do not match LOG_PATTERN, or whose request line is not of
    the form "METHOD PATH [VERSION]", are skipped.

    Args:
        logfile: Path of the log file to read.

    Each output line is "<path> <total bytes>", ordered from the most to
    the least frequently requested path.
    """
    # Popularity and transfer volume are tracked separately so that the
    # ranking is driven by the number of requests, not by the byte total.
    hits = Counter()
    transferred = Counter()

    with open(logfile) as fp:
        for line in fp:
            match = LOG_PATTERN.match(line)
            if not match:
                # Invalid format
                continue

            data = match.groupdict()
            if not data['status'].startswith('2'):
                # We only care about 2XX responses
                continue

            parts = data['request'].split()
            if len(parts) not in (2, 3):
                # Malformed request line; the protocol version is optional
                continue

            method, path = parts[0], parts[1]
            if method != 'GET':
                # We only care about GET requests
                continue

            hits[path] += 1

            # The size field may be '-' instead of a number; such entries
            # still count as a request but add nothing to the byte total.
            size = data['size']
            if size.isdigit():
                transferred[path] += int(size)

    for path, _count in hits.most_common(10):
        print(path, transferred[path])


if __name__ == '__main__':
    # Exactly one command line argument is expected: the log file to parse.
    cli_args = sys.argv[1:]
    if len(cli_args) == 1:
        main(cli_args[0])
    else:
        print('Usage: {0} FILE'.format(__file__))
        sys.exit(-1)
	#!/usr/bin/env python3
	"""
	Write a script that, given a web server log file, returns the 10 most
	frequently requested objects and their cumulative bytes transferred.
	Only include GET requests with Successful (HTTP 2xx) responses.
	Resolve ties however you’d like.

	Log format:
	- request date, time, and time zone
	- request line from the client
	- HTTP status code returned to the client
	- size (in bytes) of the returned object

	Given this input data:
	```
	[01/Aug/1995:00:54:59 -0400] "GET /images/opf-logo.gif HTTP/1.0" 200 32511
	[01/Aug/1995:00:55:04 -0400] "GET /images/ksclogosmall.gif HTTP/1.0" 200 3635
	[01/Aug/1995:00:55:06 -0400] "GET /images/ksclogosmall.gif HTTP/1.0" 403 298
	[01/Aug/1995:00:55:09 -0400] "GET /images/ksclogosmall.gif HTTP/1.0" 200 3635
	[01/Aug/1995:00:55:18 -0400] "GET /images/opf-logo.gif HTTP/1.0" 200 32511
	[01/Aug/1995:00:56:52 -0400] "GET /images/ksclogosmall.gif HTTP/1.0" 200 3635
	```

	The result should be:
	```
	/images/ksclogosmall.gif 10905
	/images/opf-logo.gif 65022
	```
	"""
	import re
	import sys
	from collections import Counter


	__author__ = 'Michael Lazar'
	__email__ = 'lazar.michael22@gmail.com'
	__license__ = 'The MIT License (MIT)'


	# This regex is borrowed from another project that I recently worked on
	# https://github.com/michael-lazar/Akita/blob/master/akita/parser.py
	#
	# Each entry matches one whitespace-separated field of a log line and exposes
	# it through a named group (time, request, status, size).
	_LOG_PARTS = [
	    r'\[(?P<time>.+)\]',  # time %t
	    r'\"(?P<request>.*)"',  # request "%r"
	    r'(?P<status>[0-9]+)',  # status %>s
	    r'(?P<size>\S+)',  # size %b (careful, can be '-')
	]
	# The field separator has to be a raw string: '\s' inside a plain literal is
	# an invalid escape sequence that recent Python versions warn about.
	# Trailing whitespace (e.g. the newline) is tolerated before end of input.
	LOG_PATTERN = re.compile(r'\s+'.join(_LOG_PARTS) + r'\s*\Z')


	def main(logfile):
	    """
	    Print the 10 most frequently requested paths and their cumulative bytes.

	    Only successful (status starting with '2') GET requests are counted.
	    Lines that do not match LOG_PATTERN, or whose request line is not of
	    the form "METHOD PATH [VERSION]", are skipped.

	    Args:
	        logfile: Path of the log file to read.

	    Each output line is "<path> <total bytes>", ordered from the most to
	    the least frequently requested path.
	    """
	    # Popularity and transfer volume are tracked separately so that the
	    # ranking is driven by the number of requests, not by the byte total.
	    hits = Counter()
	    transferred = Counter()

	    with open(logfile) as fp:
	        for line in fp:
	            match = LOG_PATTERN.match(line)
	            if not match:
	                # Invalid format
	                continue

	            data = match.groupdict()
	            if not data['status'].startswith('2'):
	                # We only care about 2XX responses
	                continue

	            parts = data['request'].split()
	            if len(parts) not in (2, 3):
	                # Malformed request line; the protocol version is optional
	                continue

	            method, path = parts[0], parts[1]
	            if method != 'GET':
	                # We only care about GET requests
	                continue

	            hits[path] += 1

	            # The size field may be '-' instead of a number; such entries
	            # still count as a request but add nothing to the byte total.
	            size = data['size']
	            if size.isdigit():
	                transferred[path] += int(size)

	    for path, _count in hits.most_common(10):
	        print(path, transferred[path])


	if __name__ == '__main__':

	    # Exactly one command line argument is expected: the log file to parse.
	    if len(sys.argv) != 2:
	        print('Usage: {0} FILE'.format(__file__))
	        sys.exit(-1)

	    main(sys.argv[1])