#!/usr/bin/env python | |
# coding=utf-8 | |
import argparse | |
import re | |
import logging | |
import datetime | |
import itertools | |
import pandas as pd | |
# Pattern for one access-log line, already trimmed so that it ends at the
# closing quote of the user agent.  Named groups, in order: client address,
# bracketed timestamp, quoted request line, status code, response size,
# quoted referer and quoted user agent.  The ident and user fields are
# expected to be the literal "- -".
RE_LINE = re.compile(
    r'^(?P<ip>.*) - - '
    r'\[(?P<stamp>.*)\] '
    r'"(?P<request>.*)" '
    r'(?P<http_code>\d+) '
    # Apache's %b writes "-" instead of 0 when no body bytes were sent, so
    # accept that as well as a plain number.
    r'(?P<resp_bytes>\d+|-) '
    r'"(?P<referer>.*)" '
    r'"(?P<ua>.*)"$'
)

# Logging is configured once at import time; everything in this module logs
# through `log` at INFO level or above.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)
def parse_file(fpath, drop_fields=None):
    """Lazily parse one access-log file into per-line dictionaries.

    Each yielded dict holds the named groups of ``RE_LINE`` for one line,
    with two adjustments: ``stamp`` is cut at its first space (so only the
    part before it, e.g. the date and time, is kept) and ``http_code`` is
    converted to ``int``.

    Blank lines are skipped silently; any other line that does not match
    ``RE_LINE`` is skipped with a warning instead of aborting the whole run.

    :param fpath: path of the log file to read.
    :param drop_fields: iterable of group names to remove from every
        record; ``None`` keeps all fields.
    :raises KeyError: if ``drop_fields`` names a field the pattern does
        not produce.
    """
    drop_fields = drop_fields or []
    log.info('Parsing %s', fpath)
    with open(fpath) as f:
        for lineno, line in enumerate(f, 1):
            # Keep the line only up to its last double quote: that removes
            # the newline (and anything else trailing) so the pattern's `$`
            # lines up with the closing quote of the user agent.  rfind
            # returns -1 when there is no quote at all, making `end` 0.
            end = line.rfind('"') + 1
            match = RE_LINE.match(line[:end]) if end else None
            if match is None:
                if line.strip():
                    log.warning('Skipping unparsable line %d in %s', lineno, fpath)
                continue
            parsed = match.groupdict()
            parsed['stamp'] = parsed['stamp'].split(' ')[0]
            parsed['http_code'] = int(parsed['http_code'])
            for field in drop_fields:
                del parsed[field]
            yield parsed
def parse_files(files, drop_fields=None):
    """Parse several log files into a single DataFrame.

    Every path in ``files`` is handed to ``parse_file`` together with
    ``drop_fields``; the resulting record streams are concatenated in the
    given order and loaded with ``DataFrame.from_records``.

    :param files: iterable of log file paths.
    :param drop_fields: forwarded unchanged to ``parse_file``.
    :return: a ``pandas.DataFrame`` with one row per parsed record.
    """
    per_file_records = (parse_file(path, drop_fields) for path in files)
    all_records = itertools.chain.from_iterable(per_file_records)
    return pd.DataFrame.from_records(all_records)
if __name__ == '__main__':
    # Command-line entry point: parse the given log files, optionally filter
    # by age and client IP, build two groupings and then drop into an
    # interactive debugger so the data can be explored by hand.
    parser = argparse.ArgumentParser(description='parses common access log format to pandas df')
    parser.add_argument('logfiles', metavar='N', nargs='+', help='logfile(s)')
    parser.add_argument('--ips', metavar='N', nargs='*', help='just this IPs')
    parser.add_argument('-d', '--days-back', dest='days_back', type=int, default=1, help='lines newer then n days')
    args = parser.parse_args()

    df = parse_files(args.logfiles, drop_fields=('request', 'resp_bytes', 'referer'))
    if df.empty:
        # Without any rows there is no 'stamp' column to convert; stop with
        # a readable message rather than a KeyError.
        raise SystemExit('No log lines could be parsed from the given file(s)')
    df['stamp'] = pd.to_datetime(df['stamp'], format='%d/%b/%Y:%H:%M:%S')

    # A value of 0 for --days-back disables the date filter.
    if args.days_back:
        log.info('Filtering by date')
        # Midnight at the start of the day `days_back` days ago.  The cutoff
        # is a Timestamp because current pandas refuses to order-compare a
        # datetime64 column with a plain datetime.date.
        dfrom = pd.Timestamp(datetime.date.today() - datetime.timedelta(days=args.days_back))
        df = df.loc[df['stamp'] > dfrom]

    if args.ips:
        log.info('Filtering IPs')
        df = df[df['ip'].isin(args.ips)]

    by_ip_code = df.groupby(['ip', 'http_code'])
    by_ip_ua = df.groupby(['ip', 'ua'])
    # Example to try at the prompt: by_ip_ua.size()['ip.ad.d.r']

    # Prefer ipdb when it is installed, otherwise use the standard-library
    # debugger so the script still works without the extra package.
    try:
        import ipdb as debugger
    except ImportError:
        import pdb as debugger
    debugger.set_trace()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment