Skip to content

Instantly share code, notes, and snippets.

@starenka
Last active November 25, 2016 20:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save starenka/650925f51a119e270a986d8e9fc352fc to your computer and use it in GitHub Desktop.
Save starenka/650925f51a119e270a986d8e9fc352fc to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# coding=utf-8
import argparse
import re
import logging
import datetime
import itertools
import pandas as pd
# Pattern for one access-log line in "combined" format:
#   ip - - [stamp] "request" http_code resp_bytes "referer" "ua"
# The ident and user fields are expected to be literal dashes.
# resp_bytes accepts either digits or a single '-', because Common Log
# Format writes '-' instead of 0 when no response body was sent (e.g. 304).
RE_LINE = re.compile(
    r'^(?P<ip>.*) - - \[(?P<stamp>.*)\] "(?P<request>.*)" '
    r'(?P<http_code>\d+) (?P<resp_bytes>\d+|-) '
    r'"(?P<referer>.*)" "(?P<ua>.*)"$'
)

# Module-wide logging: INFO and above go to the root handler.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)
def parse_file(fpath, drop_fields=None):
    """Lazily parse one access-log file, yielding a dict per matching line.

    Each yielded dict holds the named groups of ``RE_LINE``, with two
    adjustments: ``stamp`` keeps only the part before its first space, and
    ``http_code`` is converted to ``int``. Keys listed in *drop_fields* are
    removed before the dict is yielded.

    Blank lines are skipped silently. Lines that contain no double quote or
    do not match ``RE_LINE`` are skipped with a warning instead of aborting
    the whole parse.

    :param fpath: path of the log file to read
    :param drop_fields: iterable of group names to drop from each record
    :raises KeyError: if *drop_fields* names a key that is not a group of
        ``RE_LINE``
    """
    drop_fields = drop_fields or []
    log.info('Parsing %s', fpath)
    with open(fpath) as f:
        for lineno, line in enumerate(f, 1):
            if not line.strip():
                continue

            # Keep only the text up to the last double quote, so the trailing
            # newline (and anything logged after the user agent) is ignored.
            last_quote = line.rfind('"')
            match = RE_LINE.match(line[:last_quote + 1]) if last_quote != -1 else None
            if match is None:
                log.warning('Skipping unparseable line %d in %s', lineno, fpath)
                continue

            parsed = match.groupdict()
            parsed['stamp'] = parsed['stamp'].split(' ')[0]
            parsed['http_code'] = int(parsed['http_code'])
            for field in drop_fields:
                del parsed[field]
            yield parsed
def parse_files(files, drop_fields=None):
    """Parse several log files into a single pandas DataFrame.

    Every path in *files* is passed to ``parse_file`` together with
    *drop_fields*; the resulting records are streamed, in file order, into
    ``pd.DataFrame.from_records``.
    """
    records = (
        record
        for lfile in files
        for record in parse_file(lfile, drop_fields)
    )
    return pd.DataFrame.from_records(records)
if __name__ == '__main__':
    # Command-line entry point: load the given log files into a DataFrame,
    # optionally filter by age and client IP, build two groupings and then
    # drop into an interactive debugger so the data can be explored by hand.
    parser = argparse.ArgumentParser(description='parses common access log format to pandas df')
    parser.add_argument('logfiles', metavar='N', nargs='+', help='logfile(s)')
    parser.add_argument('--ips', metavar='N', nargs='*', help='just this IPs')
    parser.add_argument('-d', '--days-back', dest='days_back', type=int, default=1, help='lines newer then n days')
    args = parser.parse_args()

    df = parse_files(args.logfiles, drop_fields=('request', 'resp_bytes', 'referer'))
    if df.empty:
        # An empty frame has no 'stamp' column; stop with a clear message
        # instead of failing with a KeyError below.
        parser.exit(1, 'No parseable log lines found\n')
    df['stamp'] = pd.to_datetime(df['stamp'], format='%d/%b/%Y:%H:%M:%S')

    # A days_back of 0 disables the date filter.
    if args.days_back:
        log.info('Filtering by date')
        # Use a Timestamp (midnight today minus n days) as the cutoff:
        # comparing a datetime64 column with a plain datetime.date is not
        # supported by current pandas.
        dfrom = pd.Timestamp(datetime.date.today()) - datetime.timedelta(days=args.days_back)
        df = df.loc[df['stamp'] > dfrom]

    if args.ips:
        log.info('Filtering IPs')
        df = df[df['ip'].isin(args.ips)]

    # Groupings prepared for use inside the interactive session below.
    by_ip_code = df.groupby(['ip', 'http_code'])
    by_ip_ua = df.groupby(['ip', 'ua'])
    # by_ip_ua.sizes()['ip.ad.d.r']

    # Prefer ipdb when installed, otherwise fall back to the stdlib debugger.
    try:
        import ipdb as debugger
    except ImportError:
        import pdb as debugger
    debugger.set_trace()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment