Skip to content

Instantly share code, notes, and snippets.

@siddhesh
Last active October 11, 2019 19:08
Show Gist options
  • Save siddhesh/3db65dc06f2fe003e115ca4dcd873dc0 to your computer and use it in GitHub Desktop.
Save siddhesh/3db65dc06f2fe003e115ca4dcd873dc0 to your computer and use it in GitHub Desktop.
# Read one or more httpd access.log files and print statistics I care about.
# Simple script to read httpd access.log and print a couple of statistics I care
# about. Usage:
#
# python3 process_accesslog.py <one or more log files>
#
# Copyright (c) 2019 Siddhesh Poyarekar
#
# This code is released under the MIT license:
# http://www.opensource.org/licenses/mit-license.php
import re
import sys
import pandas as pd
import argparse

# Section titles, in the same order as the fields of a parsed record
# (date, page, referrer).
SECTION_TITLES = ('Total Hits', 'Top Pages', 'Top Referrers')

# Matches the opening of the timestamp field, e.g. "[11/Oct/2019:19:08:00".
_DATE_RE = re.compile(r'\[(\d+)/(\w+)/(\d+):.*')


def parse_line(line):
    """Extract (date, page, referrer) from one access.log line.

    The line is split on single spaces, so field 3 is the opening of the
    timestamp, field 6 is the requested path and field 10 (when present) is
    the quoted referrer.

    Returns None when the line has fewer than seven fields (blank or
    truncated lines) or when the requested path contains neither '/posts/'
    nor '.pdf'.  A line without a referrer field is reported with the
    referrer '-'.
    """
    fields = line.rstrip('\r\n').split(' ')
    if len(fields) < 7:
        return None

    page = fields[6]
    if '/posts/' not in page and '.pdf' not in page:
        return None

    # "[11/Oct/2019:19:08:00" -> "11-Oct-2019"; a field that does not match
    # is kept unchanged.
    date = _DATE_RE.sub(r'\1-\2-\3', fields[3])

    # Referrers are capped at 100 characters to keep the table readable.
    referrer = fields[10].strip('"')[:100] if len(fields) > 10 else '-'
    if referrer == '_REFERRER_':
        referrer = '-'
    return (date, page, referrer)


def read_records(paths):
    """Return the parsed records of every matching line in the given files.

    Files are streamed line by line.  Bytes that cannot be decoded are
    replaced instead of aborting the whole run.
    """
    records = []
    for path in paths:
        with open(path, errors='replace') as fd:
            for line in fd:
                record = parse_line(line)
                if record is not None:
                    records.append(record)
    return records


def format_report(records, top=10):
    """Build the text report for a list of (date, page, referrer) records.

    The report starts with the total number of records and a separator line,
    followed by one section per field listing its `top` most frequent values
    with their counts.  When there are no records only the total and the
    separator are returned.
    """
    lines = ['Total Hits: %s' % '{:,}'.format(len(records)), '+' * 80]
    if not records:
        return '\n'.join(lines)

    # zip(*records) regroups the records into one tuple of values per field.
    for title, values in zip(SECTION_TITLES, zip(*records)):
        # Truncate before formatting so only the displayed rows are converted.
        counts = pd.Series(values).value_counts().head(top)
        lines.append('\n%s' % title)
        lines.append('-' * len(title))
        lines.append(counts.apply('{:,}'.format).to_string())
    return '\n'.join(lines)


def main(argv=None):
    """Parse command-line arguments and print the report.

    argv defaults to the process arguments when None.
    """
    parser = argparse.ArgumentParser(
        description='Parse httpd access.log files.')
    parser.add_argument('files', nargs='+',
                        help='One or more files to parse')
    parser.add_argument('--top', type=int, default=10,
                        help='Limit results to this number')
    args = parser.parse_args(argv)
    print(format_report(read_records(args.files), args.top))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment