Skip to content

Instantly share code, notes, and snippets.



Last active Nov 25, 2016
What would you like to do?
#!/usr/bin/env python
# coding=utf-8
import argparse
import re
import logging
import datetime
import itertools
import pandas as pd
# One access-log line in the layout
#   ip - - [stamp] "request" status bytes "referer" "user-agent"
# The two "-" placeholders are matched literally and `resp_bytes` must be
# digits, so lines carrying an ident/user or a "-" size will not match.
RE_LINE = re.compile(
    r'^(?P<ip>.*) - - '
    r'\[(?P<stamp>.*)\] '
    r'"(?P<request>.*)" '
    r'(?P<http_code>\d+) '
    r'(?P<resp_bytes>\d+) '
    r'"(?P<referer>.*)" '
    r'"(?P<ua>.*)"$')
log = logging.getLogger(__name__)


def parse_file(fpath, drop_fields=None):
    """Lazily parse one access-log file into dicts, one per matching line.

    Each yielded dict holds the named groups of ``RE_LINE``. ``stamp`` is
    cut at the first space (dropping whatever follows, e.g. a UTC offset)
    and ``http_code`` is converted to ``int``; every other value stays a
    string.

    Blank lines are skipped silently. Lines that contain no double quote or
    do not match ``RE_LINE`` are skipped with a warning instead of aborting
    the whole file.

    :param fpath: path of the log file to read.
    :param drop_fields: iterable of group names to remove from every record;
        ``None`` keeps all fields.
    :raises KeyError: if ``drop_fields`` names a field ``RE_LINE`` does not
        capture.
    """
    drop_fields = drop_fields or []
    log.info('Parsing %s', fpath)
    with open(fpath) as f:
        for lineno, line in enumerate(f, 1):
            if not line.strip():
                continue
            # Everything after the last double quote (the newline and any
            # trailing extras) is cut off so the `$` anchor can match.
            last_quote = line.rfind('"')
            match = RE_LINE.match(line[:last_quote + 1]) if last_quote != -1 else None
            if match is None:
                log.warning('Skipping unparseable line %s:%d', fpath, lineno)
                continue
            parsed = match.groupdict()
            parsed['stamp'] = parsed['stamp'].split(' ')[0]
            parsed['http_code'] = int(parsed['http_code'])
            for field in drop_fields:
                del parsed[field]
            yield parsed
def parse_files(files, drop_fields=None):
    """Parse several log files and collect every record into one DataFrame.

    :param files: iterable of log file paths, each handed to ``parse_file``.
    :param drop_fields: forwarded unchanged to ``parse_file`` for every file.
    :return: a ``pandas.DataFrame`` built from the chained records of all
        files, in the order the files were given.
    """
    # One lazy record generator per file; nothing is read until pandas
    # starts consuming the chained stream.
    per_file_records = (parse_file(path, drop_fields) for path in files)
    all_records = itertools.chain.from_iterable(per_file_records)
    return pd.DataFrame.from_records(all_records)
if __name__ == '__main__':
    # Script entry point: parse the given log files into a DataFrame,
    # optionally narrow it by age and by client IP, then group the hits.
    parser = argparse.ArgumentParser(description='parses common access log format to pandas df')
    parser.add_argument('logfiles', metavar='N', nargs='+', help='logfile(s)')
    parser.add_argument('--ips', metavar='N', nargs='*', help='just this IPs')
    parser.add_argument('-d', '--days-back', dest='days_back', type=int, default=1, help='lines newer then n days')
    args = parser.parse_args()

    df = parse_files(args.logfiles, drop_fields=('request', 'resp_bytes', 'referer'))
    if df.empty:
        # Without any record there is no 'stamp' column to convert below.
        parser.exit(message='no parseable log lines found\n')
    df['stamp'] = pd.to_datetime(df['stamp'], format='%d/%b/%Y:%H:%M:%S')

    # `-d 0` is falsy and therefore disables the date filter altogether.
    if args.days_back:
        log.info('Filtering by date')
        # Cutoff is "now minus N days". The stamps had their offset suffix
        # removed during parsing, so this is a naive-vs-naive comparison.
        dfrom = datetime.datetime.now() - datetime.timedelta(days=args.days_back)
        df = df.loc[(df['stamp'] > dfrom)]
    if args.ips:
        log.info('Filtering IPs')
        df = df[df['ip'].isin(args.ips)]

    by_ip_code = df.groupby(['ip', 'http_code'])
    by_ip_ua = df.groupby(['ip', 'ua'])
    # e.g. by_ip_ua.size() gives the number of hits per (ip, ua) pair
    # NOTE(review): only the import is visible here; presumably a debugger
    # breakpoint followed so the groupings could be inspected - confirm.
    import ipdb
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.