dchaplinsky/httpdlogs.py

## httpdlogs.py
"""Implements HttpdLog Class for gathering useful info from nginx/apache access logs.

"""
import re
from datetime import timedelta, datetime
import subprocess

# Module metadata: authorship, licensing, version and contact details.
__author__ = "Dmitry Chaplinsky"
__copyright__ = "Copyright 2012, Dmitry Chaplinsky"
__credits__ = []
__license__ = "GPL"
__version__ = "0.1"
__maintainer__ = "Dmitry Chaplinsky"
__email__ = "chaplinsky dot dmitry at gmail.com"
__status__ = "Development"


# Defaults
# Name of the executable that HttpdLog.getData runs (through "env") to read
# the last records of the log file; it also appears in the error message
# raised when the command cannot be started.
tailCmd = 'tail'

class HttpdLog:
    """Class to retrieve stats from access logs.

    The last ``tail_records`` lines of the log are read with the external
    ``tail`` command and only the records written during the last
    ``last_minutes`` minutes are taken into account.
    """

    def __init__(self, log=None, ua_to_catch=None,
                 render_time_regex=None,
                 record_time_regex=r"\[([^\]]*)\]",
                 tail_records=1000,
                 last_minutes=5):
        """Init HttpdLog
        @param log:                 Log file to read
        @param ua_to_catch:         Dict with useragent string chunks to
                                    calculate visits from various bots
                                    (result key -> substring to look for)
        @param render_time_regex    Regex to parse page render time from log
                                    record; group 1 must hold the number
        @param record_time_regex    Regex to parse time stamp in the log
                                    record; group 1 must hold the time stamp
        @param tail_records         Parse only last N records to gather info
        @param last_minutes         Gather stats only for last N minutes
        """

        # TODO: add some validation
        self._log = log
        self._ua_to_catch = ua_to_catch
        self._render_time_regex = render_time_regex
        self._record_time_regex = record_time_regex
        self._tail_records = tail_records
        self._last_minutes = timedelta(minutes=last_minutes)

    def _parseDateTimeWithTZ(self, d):
        """Convert a log time stamp with UTC offset to a naive UTC datetime.

        @param d: Time stamp such as "10/Oct/2012:13:55:36 +0530"; the last
                  five characters are the signed HHMM offset
        @return: Naive datetime expressed in UTC
        @raise ValueError: If the offset or the date part cannot be parsed
        """
        try:
            offset = int(d[-5:])
        except ValueError:
            raise ValueError("Malformed timezone offset in time stamp %r" % d)

        # The offset is encoded as +/-HHMM. Split the absolute value so that
        # the minutes are kept and negative offsets are not rounded the
        # wrong way by floor division.
        hours, minutes = divmod(abs(offset), 100)
        delta = timedelta(hours=hours, minutes=minutes)
        if offset < 0:
            delta = -delta

        # NOTE: %b matches month names of the current locale.
        return datetime.strptime(d[:-6], "%d/%b/%Y:%H:%M:%S") - delta

    def _collectStats(self, lines, current_time):
        """Gather statistics from already loaded log records.

        Records are expected in chronological order: everything before the
        first record that falls into the time window is skipped, everything
        after it is counted without checking its time stamp again.

        @param lines:        Iterable of log records (strings)
        @param current_time: Naive UTC datetime the time window ends at
        @return: Dict with optional keys "useragents" (hits per configured
                 useragent) and "page_load" (average render time)
        """
        useragents = {}
        if isinstance(self._ua_to_catch, dict):
            useragents = dict.fromkeys(self._ua_to_catch, 0)

        # Resolve the patterns once instead of on every record.
        record_time_re = re.compile(self._record_time_regex)
        render_time_re = None
        if self._render_time_regex:
            render_time_re = re.compile(self._render_time_regex)

        render_time_sum = 0.0
        render_time_count = 0
        skip_datetime_check = False

        for line in lines:
            if not skip_datetime_check:
                matches = record_time_re.search(line)

                if not matches:
                    continue

                try:
                    record_datetime = self._parseDateTimeWithTZ(
                        matches.group(1))
                except ValueError:
                    # A single broken record must not abort the whole run.
                    continue

                if (current_time - record_datetime) > self._last_minutes:
                    continue

                skip_datetime_check = True

            if useragents:
                for ua, ua_token in self._ua_to_catch.items():
                    if ua_token in line:
                        useragents[ua] += 1

            if render_time_re is not None:
                matches = render_time_re.search(line)

                if matches:
                    try:
                        render_time = float(matches.group(1))
                    except ValueError:
                        # Captured text is not a number: ignore this record.
                        continue

                    render_time_count += 1
                    render_time_sum += render_time

        data = {}

        if useragents:
            data["useragents"] = useragents

        if render_time_count:
            data["page_load"] = render_time_sum / render_time_count

        return data

    def getData(self):
        """Returns stats gathered from the tail of the access log.

        @return: Dict with optional keys "useragents" (hits per configured
                 useragent) and "page_load" (average render time) for the
                 records of the last N minutes.
        @raise Exception: If the tail command cannot be executed.

        """
        try:
            out = subprocess.Popen(["env",
                                    tailCmd,
                                    "-n",
                                    "%d" % self._tail_records,
                                    self._log],
                                   stdout=subprocess.PIPE).communicate()[0]
        except OSError:
            raise Exception('Execution of command %s failed.' % tailCmd)

        # Python 3 returns bytes from the pipe while the patterns are text;
        # on Python 2 the output is already a str and is left untouched.
        if not isinstance(out, str):
            out = out.decode("utf-8", "replace")

        return self._collectStats(out.splitlines(), datetime.utcnow())
	"""Implements HttpdLog Class for gathering useful info from nginx/apache access logs.

	"""
	import re
	from datetime import timedelta, datetime
	import subprocess

	__author__ = "Dmitry Chaplinsky"
	__copyright__ = "Copyright 2012, Dmitry Chaplinsky"
	__credits__ = []
	__license__ = "GPL"
	__version__ = "0.1"
	__maintainer__ = "Dmitry Chaplinsky"
	__email__ = "chaplinsky dot dmitry at gmail.com"
	__status__ = "Development"


	# Defaults
	tailCmd = 'tail'

	class HttpdLog:
	"""Class to retrieve stats from access logs."""

	def __init__(self, log=None, ua_to_catch=None,
	render_time_regex=None,
	record_time_regex="\[([^\]]*)\]",
	tail_records=1000,
	last_minutes=5):
	"""Init HttpdLog
	@param log: Log file to read
	@param ua_to_catch: Dict with useragent string chunks to
	calculate visits from various bots
	@param render_time_regex Regex to parse page render time from log record
	@param record_time_regex Regex to parse time stamp in the log record
	@param tail_records Parse only last N records to gather info
	@param last_minutes Gather stats only for last N minutes
	"""

	# TODO: add some validation
	self._log = log
	self._ua_to_catch = ua_to_catch
	self._render_time_regex = render_time_regex
	self._record_time_regex = record_time_regex
	self._tail_records = tail_records
	self._last_minutes = timedelta(minutes = last_minutes)

	def _parseDateTimeWithTZ(self, d):

	try:
	offset = int(d[-5:])
	except:
	print "Error"

	delta = timedelta(hours = offset / 100)

	return datetime.strptime(d[:-6], "%d/%b/%Y:%H:%M:%S") - delta

	def getData(self):
	"""Returns list of filesystems.

	@return: List of filesystems.

	"""
	try:
	out = subprocess.Popen(["env",
	tailCmd,
	"-%d" % self._tail_records,
	self._log],
	stdout=subprocess.PIPE).communicate()[0]
	except OSError:
	raise Exception('Execution of command %s failed.' % tailCmd)

	render_time_sum = 0.0
	render_time_count = 0
	useragents = {}
	skip_datetime_check = False
	current_time = datetime.utcnow()


	if isinstance(self._ua_to_catch, dict):
	for ua in self._ua_to_catch.keys():
	useragents[ua] = 0

	for line in out.splitlines():
	if not skip_datetime_check:
	matches = re.search(self._record_time_regex, line)

	if not matches:
	continue

	record_datetime = self._parseDateTimeWithTZ(matches.group(1))

	if (current_time - record_datetime) <= self._last_minutes:
	skip_datetime_check = True
	else:
	continue

	if useragents:
	for ua, ua_token in self._ua_to_catch.iteritems():
	if ua_token in line:
	useragents[ua] += 1

	if self._render_time_regex:
	matches = re.search(self._render_time_regex, line)

	if matches:
	render_time_count += 1
	render_time_sum += float(matches.group(1))

	data = {}

	if useragents:
	data["useragents"] = useragents

	if render_time_count:
	data["page_load"] = render_time_sum / render_time_count

	return data
No results found