Skip to content

Instantly share code, notes, and snippets.

@ivanistheone
Created December 10, 2012 08:35
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ivanistheone/4249353 to your computer and use it in GitHub Desktop.
Save ivanistheone/4249353 to your computer and use it in GitHub Desktop.
Parsing access log and collecting sessions
#!/usr/bin/env python
import re
from collections import defaultdict, namedtuple
import datetime
# Read the whole access log into memory.  The context manager closes the
# file handle as soon as the lines have been read instead of leaving it
# open for the lifetime of the process; `f` stays bound (closed) afterwards.
with open("access.log") as f:
    # One inner list of raw lines per log file (a single file here).
    logfiles_list = [f.readlines()]
# STEP 1 parse logs
########################################################################
# Pattern for one access-log line: nine whitespace-separated fields, with
# the timestamp in [brackets] and request/referer/user-agent in "quotes".
format_pat = re.compile(
    # Client address.  Any non-blank token is accepted so that IPv6
    # addresses and resolved hostnames are not silently dropped; for plain
    # IPv4 addresses the captured text is the same as with a digits-and-dots
    # pattern.
    r"(?P<host>\S+)\s"
    r"(?P<identity>\S*)\s"
    r"(?P<user>\S*)\s"
    r"\[(?P<time>.*?)\]\s"
    r'"(?P<request>.*?)"\s'
    r"(?P<status>\d+)\s"
    r"(?P<bytes>\S*)\s"
    r'"(?P<referer>.*?)"\s'  # [SIC]
    r'"(?P<user_agent>.*?)"\s*'
)

# Record holding the nine raw (still string-valued) fields captured by
# format_pat, in the order they appear on the log line.
Access = namedtuple(
    'Access',
    ['host', 'identity', 'user', 'time', 'request',
     'status', 'bytes', 'referer', 'user_agent'],
)


def access_iter(source_iter):
    """Yield an Access record for every parseable line of every log.

    source_iter -- iterable of logs, each log being an iterable of line
                   strings (for example the result of ``readlines()``).

    Trailing whitespace is stripped from each line before matching.  Lines
    that do not match ``format_pat`` are skipped without raising.  This is
    a generator: nothing is parsed until the result is iterated.
    """
    for log in source_iter:
        for line in log:
            match = format_pat.match(line.rstrip())
            if match:
                yield Access(**match.groupdict())
# access_iter is a generator function, so this only creates the iterator;
# no log line is actually parsed until parsed_entries is consumed.
parsed_entries = access_iter( logfiles_list )
# STEP 2 normalize data
########################################################################
# Three-letter month abbreviation as written in the log timestamp -> month
# number.
month_map = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
             'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}


def cleanup_time(s):
    """Convert an apache log timestamp to a datetime object.

    s -- timestamp text such as "21/Nov/2012:08:07:15 -0800".

    The fields are read from fixed character positions (dd/Mon/yyyy:HH:MM:SS).
    Anything after the seconds, including the UTC offset, is ignored, so the
    returned datetime is naive.

    Raises KeyError for an unknown month abbreviation and ValueError for a
    non-numeric or out-of-range field.
    """
    return datetime.datetime(
        int(s[7:11]),        # year
        month_map[s[3:6]],   # month
        int(s[0:2]),         # day
        int(s[12:14]),       # hour
        int(s[15:17]),       # minute
        int(s[18:20]),       # second
    )


def cleanup_entry(res):
    """Get data out of the namedtuple, stuff into dict and normalize properties.

    res -- object exposing the string attributes host, identity, user, time,
           request, status, bytes, referer and user_agent.

    Returns a dict with those nine keys where:
      * "user" and "referer" are None when the log shows "-"
      * "status" is an int
      * "bytes" is an int, with "-" counted as 0
      * "time" is the datetime produced by cleanup_time
      * the remaining values are passed through unchanged
    """
    res_out = {}
    # Always populate "user".  Previously the key was only set for the "-"
    # placeholder, so entries for authenticated requests had no "user" key.
    res_out["user"] = None if res.user == "-" else res.user
    res_out["status"] = int(res.status)
    # "-" in the size field is treated as zero bytes.
    res_out["bytes"] = 0 if res.bytes == "-" else int(res.bytes)
    res_out["host"] = res.host
    res_out["identity"] = res.identity
    res_out["time"] = cleanup_time(res.time)
    res_out["request"] = res.request
    res_out["referer"] = None if res.referer == "-" else res.referer
    res_out["user_agent"] = res.user_agent
    return res_out
# Drain the parsed records and normalize each one into a plain dict.
entries = list(map(cleanup_entry, parsed_entries))
#### EXAMPLE INPUT
In [29]: f.readlines()[10]
Out[29]: '66.249.76.98 - - [21/Nov/2012:08:07:15 -0800] "GET /_media/indexmenu/math HTTP/1.1" 404 40 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"\n'
#### EXAMPLE OUTPUT
In [22]: entries[10]
Out[22]:
{'bytes': 40,
'host': '66.249.76.98',
'identity': '-',
'referer': None,
'request': 'GET /_media/indexmenu/math HTTP/1.1',
'status': 404,
'time': datetime.datetime(2012, 11, 21, 8, 7, 15),
'user': None,
'user_agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment