Skip to content

Instantly share code, notes, and snippets.

@dchaplinsky
Created February 27, 2012 23:36
Show Gist options
  • Select an option

  • Save dchaplinsky/1927921 to your computer and use it in GitHub Desktop.

Select an option

Save dchaplinsky/1927921 to your computer and use it in GitHub Desktop.
A sysinfo plugin for PyMunin that parses the most recent records in nginx/apache access logs and gathers stats about page load time and the number of visits generated by search engine robots.
"""Implements HttpdLog Class for gathering useful info from nginx/apache access logs.
"""
import re
from datetime import timedelta, datetime
import subprocess
__author__ = "Dmitry Chaplinsky"
__copyright__ = "Copyright 2012, Dmitry Chaplinsky"
__credits__ = []
__license__ = "GPL"
__version__ = "0.1"
__maintainer__ = "Dmitry Chaplinsky"
__email__ = "chaplinsky dot dmitry at gmail.com"
__status__ = "Development"
# Defaults
# Name of the executable used to read the last records of the log file.
# HttpdLog.getData() launches it through `env`; it is a module-level name so
# it can be overridden before getData() is called.
tailCmd = 'tail'
class HttpdLog(object):
    """Class to retrieve stats from access logs.

    Only the last ``tail_records`` lines of the log are read (through the
    external ``tail`` command) and, of those, only the records not older than
    ``last_minutes`` are taken into account.
    """

    # strptime layout of a log timestamp once its trailing " +HHMM" is removed.
    _TIMESTAMP_FORMAT = "%d/%b/%Y:%H:%M:%S"

    def __init__(self, log=None, ua_to_catch=None,
                 render_time_regex=None,
                 record_time_regex=r"\[([^\]]*)\]",
                 tail_records=1000,
                 last_minutes=5):
        """Init HttpdLog

        @param log: Log file to read
        @param ua_to_catch: Dict mapping a counter name to the user-agent
                            string chunk that identifies a bot
        @param render_time_regex: Regex whose first group captures the page
                                  render time in a log record
        @param record_time_regex: Regex whose first group captures the time
                                  stamp of a log record
        @param tail_records: Parse only last N records to gather info
        @param last_minutes: Gather stats only for last N minutes
        """
        # TODO: add some validation
        self._log = log
        self._ua_to_catch = ua_to_catch
        self._render_time_regex = render_time_regex
        self._record_time_regex = record_time_regex
        self._tail_records = tail_records
        self._last_minutes = timedelta(minutes=last_minutes)

    def _parseDateTimeWithTZ(self, d):
        """Convert a log time stamp with a numeric UTC offset to naive UTC.

        @param d: Time stamp such as "27/Feb/2012:23:36:00 +0200"
        @return: Naive datetime object expressed in UTC
        @raise ValueError: If the offset or the time stamp is malformed
        """
        tz = d[-5:]
        if len(tz) != 5 or tz[0] not in "+-" or not tz[1:].isdigit():
            raise ValueError("Malformed timezone offset in timestamp: %r" % (d,))
        # The offset is +HHMM / -HHMM: hours and minutes have to be handled
        # separately, otherwise offsets such as +0530 come out wrong.
        delta = timedelta(hours=int(tz[1:3]), minutes=int(tz[3:5]))
        if tz[0] == "-":
            delta = -delta
        # d[:-6] also drops the space that separates time stamp and offset.
        return datetime.strptime(d[:-6], self._TIMESTAMP_FORMAT) - delta

    def _readTail(self):
        """Return the last ``tail_records`` lines of the log as text.

        @return: Output of the tail command as a str
        @raise Exception: If the tail command could not be executed
        """
        try:
            out = subprocess.Popen(["env",
                                    tailCmd,
                                    "-n",
                                    "%d" % self._tail_records,
                                    self._log],
                                   stdout=subprocess.PIPE).communicate()[0]
        except OSError:
            raise Exception('Execution of command %s failed.' % tailCmd)
        # Python 3 hands back bytes; the str regexes used later need text.
        if not isinstance(out, str):
            out = out.decode("utf-8", "replace")
        return out

    def _collectStats(self, lines, current_time):
        """Aggregate bot visits and average render time over log lines.

        Lines are expected in chronological order: everything up to the first
        record that falls inside the time window is skipped, everything from
        that record on is counted without looking at its time stamp again.

        @param lines: Iterable of log records (str)
        @param current_time: Naive UTC datetime the time window ends at
        @return: Dict with optional keys "useragents" and "page_load"
        """
        useragents = {}
        ua_tokens = []
        if isinstance(self._ua_to_catch, dict):
            useragents = dict.fromkeys(self._ua_to_catch, 0)
            ua_tokens = list(self._ua_to_catch.items())

        # Compile once instead of looking the patterns up for every line.
        record_time_re = re.compile(self._record_time_regex)
        render_time_re = None
        if self._render_time_regex:
            render_time_re = re.compile(self._render_time_regex)

        render_time_sum = 0.0
        render_time_count = 0
        in_window = False

        for line in lines:
            if not in_window:
                matches = record_time_re.search(line)
                if not matches:
                    continue
                try:
                    record_datetime = self._parseDateTimeWithTZ(
                        matches.group(1))
                except ValueError:
                    # Unparseable time stamp: treat like a line without one.
                    continue
                if (current_time - record_datetime) > self._last_minutes:
                    continue
                in_window = True

            for ua, ua_token in ua_tokens:
                if ua_token in line:
                    useragents[ua] += 1

            if render_time_re is not None:
                matches = render_time_re.search(line)
                if matches:
                    try:
                        render_time = float(matches.group(1))
                    except ValueError:
                        # Placeholders such as "-" carry no timing info.
                        continue
                    render_time_count += 1
                    render_time_sum += render_time

        data = {}
        if useragents:
            data["useragents"] = useragents
        if render_time_count:
            data["page_load"] = render_time_sum / render_time_count
        return data

    def getData(self):
        """Returns stats gathered from the most recent log records.

        @return: Dict that may contain "useragents" (dict of visit counters
                 per configured bot) and "page_load" (average render time of
                 the records a render time could be parsed from).
        """
        out = self._readTail()
        return self._collectStats(out.splitlines(), datetime.utcnow())
@dchaplinsky
Copy link
Author

usage example:

log = httpdlogs.HttpdLog("/foobar/access.log", tail_records = 10000, render_time_regex = 'rt\:\s(.*)$', ua_to_catch={"googlebot": "compatible; Googlebot", "baidu": "compatible; Baiduspider"}, last_minutes=120);
log.getData()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment