Skip to content

Instantly share code, notes, and snippets.

@ivanistheone
Created December 10, 2012 09:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ivanistheone/4249678 to your computer and use it in GitHub Desktop.
Save ivanistheone/4249678 to your computer and use it in GitHub Desktop.
Semi-finished version of visitor script
#!/usr/bin/env python
from collections import defaultdict, namedtuple
import datetime
from operator import itemgetter
import re
import requests
import json
import sys
# SERVER SIDE
########################################################################
#mv /var/log/nginx/mr.access*
#to /home/ivan/logs/miniref/ ---> adjust filenames, skip dups & set perms to ivan-readable
# STEP 0 get logs
########################################################################
#rsync to local dir
# and get the list of logs from the last N days (default N=5)
# Log file to analyse: the first command-line argument when one is given (and
# is non-empty), otherwise the default local copy.
filename = sys.argv[1] if len(sys.argv) > 1 else None
log_path = filename if filename else "logs/access.log"

# Read the whole file up front. The context manager guarantees the handle is
# closed even if reading fails; previously the file object was left open for
# the lifetime of the script.
# The result is a list of logs, each log being a list of raw lines, which is
# the shape the parsing step iterates over.
with open(log_path) as f:
    logfiles_list = [f.readlines()]
# STEP 1 parse logs
########################################################################
# Matches one access-log line of the form
#   host identity user [time] "request" status bytes "referer" "user_agent"
# and exposes each field as a named group.
#
# The host group accepts any run of non-whitespace characters rather than only
# digits and dots, so IPv6 addresses and resolved hostnames are parsed instead
# of being silently dropped. Every line the digits-and-dots form matched is
# matched identically by this one.
format_pat = re.compile(
    r"(?P<host>\S+)\s"
    r"(?P<identity>\S*)\s"
    r"(?P<user>\S*)\s"
    r"\[(?P<time>.*?)\]\s"
    r'"(?P<request>.*?)"\s'
    r"(?P<status>\d+)\s"
    r"(?P<bytes>\S*)\s"
    r'"(?P<referer>.*?)"\s'  # "referer" [sic]: the HTTP header's own spelling
    r'"(?P<user_agent>.*?)"\s*'
)
# Field names of one parsed log line, in the order they occur on the line.
# They coincide with the named groups of format_pat, which lets a match's
# groupdict() be passed straight to the Access constructor.
_ACCESS_FIELDS = (
    'host',
    'identity',
    'user',
    'time',
    'request',
    'status',
    'bytes',
    'referer',
    'user_agent',
)

# Immutable record holding the raw (string) fields of a single log line.
Access = namedtuple('Access', _ACCESS_FIELDS)
def access_iter( source_iter ):
    """Lazily yield an Access record for each parseable line.

    source_iter is an iterable of logs, each log being an iterable of raw
    line strings. Trailing whitespace is stripped from a line before it is
    matched against format_pat; lines that do not match are skipped without
    any error.
    """
    for log_lines in source_iter:
        for raw_line in log_lines:
            parsed = format_pat.match(raw_line.rstrip())
            if parsed is None:
                continue
            yield Access(**parsed.groupdict())
# access_iter is a generator function, so no line is parsed here; parsing
# happens when parsed_entries is consumed (and it can be consumed only once).
parsed_entries = access_iter( logfiles_list )
# STEP 2 normalize data
########################################################################
# Three-letter month abbreviation (as used in log timestamps) -> month number.
month_map = {
    abbrev: number
    for number, abbrev in enumerate(
        ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'),
        1,
    )
}
def cleanup_time(s):
    """Turn a log timestamp string into a naive datetime.

    The fields are read from fixed character positions, i.e. the layout
    DD/Mon/YYYY:HH:MM:SS is expected at the start of *s*. Anything from
    index 20 onwards is ignored, so a trailing UTC offset is not applied.
    Raises KeyError for an unknown month abbreviation and ValueError for
    non-numeric fields.
    """
    year = int(s[7:11])
    month = month_map[s[3:6]]
    day = int(s[0:2])
    hour = int(s[12:14])
    minute = int(s[15:17])
    second = int(s[18:20])
    return datetime.datetime(year, month, day, hour, minute, second)
def cleanup_entry( res ):
    """Convert a parsed Access record into a dict with normalized values.

    res is a record exposing the attributes host, identity, user, time,
    request, status, bytes, referer and user_agent, all as strings.

    Returns a dict with those same nine keys, where:
      - "user" and "referer" are None when the log shows "-";
      - "status" is an int;
      - "bytes" is an int, with "-" mapped to 0;
      - "time" is the datetime produced by cleanup_time;
      - the remaining values are passed through unchanged.

    The "user" key is always present. Earlier it was only set when the log
    showed "-", so records of authenticated requests had no "user" key.
    """
    return {
        "user": None if res.user == "-" else res.user,
        "status": int(res.status),
        "bytes": 0 if res.bytes == "-" else int(res.bytes),
        "host": res.host,
        "identity": res.identity,
        "time": cleanup_time(res.time),
        "request": res.request,
        "referer": None if res.referer == "-" else res.referer,
        "user_agent": res.user_agent,
    }
# Normalize every parsed record; this is the point where the lazy
# parsed_entries generator is actually consumed.
entries = list(map(cleanup_entry, parsed_entries))

# Chronological copy of the entries; `entries` itself keeps file order.
sorted_entries = list(entries)
sorted_entries.sort(key=itemgetter("time"))
# STEP 3 Organize into sessions
########################################################################
# Substrings of the User-Agent header that identify crawlers; requests from
# these agents are left out of the visitor sessions.
BOT_MARKERS = ("bingbot", "Baiduspider", "WBSearchBot", "Googlebot")

# host -> list of that host's requests. Because sorted_entries is already in
# chronological order, each per-host list is chronological too.
visitors = defaultdict(list)
for req in sorted_entries:
    ua = req["user_agent"]
    if any(marker in ua for marker in BOT_MARKERS):
        continue
    visitors[req["host"]].append(req)

# (host, requests) pairs ordered by the time of each host's first request.
# .items() is used instead of the Python-2-only .iteritems() so this step
# runs unchanged on both Python 2 and Python 3.
sorted_visitors = sorted(visitors.items(), key=lambda p: p[1][0]["time"])
# STEP 4 Print each request in session
########################################################################
def _lookup_location(ip):
    """Return (city, country_name) for *ip* as reported by hostip.info.

    Falls back to ("?", "?") when the lookup fails or the reply is not valid
    JSON, and to "?" for an individual field that is missing or empty, so a
    single bad lookup cannot abort the whole report.
    """
    try:
        # The address must be sent as the `ip` query parameter; without the
        # parameter name the service does not receive the visitor's address.
        # The timeout keeps one unresponsive lookup from hanging the script.
        resp = requests.get("http://api.hostip.info/get_json.php",
                            params={"ip": ip}, timeout=10)
        resp.raise_for_status()
        # requests responses expose the body as .text (there is no .read()).
        info = json.loads(resp.text)
    except (requests.RequestException, ValueError):
        return "?", "?"
    return info.get("city") or "?", info.get("country_name") or "?"


# Each print call below receives one pre-built string, which produces the
# same output under Python 2 and Python 3.
for vis_ip, vis_list in sorted_visitors:
    first_req = vis_list[0]

    # collect geolocation info
    city, country = _lookup_location(vis_ip)

    # session header: who, when, from where, with which browser
    print(vis_ip + " " + str(first_req["time"]))
    print("from " + city + ", " + country + " " + first_req["user_agent"])

    # every request is printed together with the time that elapsed until the
    # visitor's next request; the last request has no successor, hence "end"
    old_time = first_req["time"]
    current_req = first_req["request"]
    for req in vis_list[1:]:
        print("  %s %s" % (current_req, req["time"] - old_time))
        old_time = req["time"]
        current_req = req["request"]
    print("  %s  end \n" % current_req)
# TODO, skip media links & blog links in general
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment