sysinfo plugin for PyMunin that parses the last records of nginx/apache access logs and gathers stats about page load time and the number of visits generated by search engine robots
| """Implements HttpdLog Class for gathering useful info from nginx/apache access logs. | |
| """ | |
| import re | |
| from datetime import timedelta, datetime | |
| import subprocess | |
| __author__ = "Dmitry Chaplinsky" | |
| __copyright__ = "Copyright 2012, Dmitry Chaplinsky" | |
| __credits__ = [] | |
| __license__ = "GPL" | |
| __version__ = "0.1" | |
| __maintainer__ = "Dmitry Chaplinsky" | |
| __email__ = "chaplinsky dot dmitry at gmail.com" | |
| __status__ = "Development" | |
| # Defaults | |
| tailCmd = 'tail' | |
| class HttpdLog: | |
| """Class to retrieve stats from access logs.""" | |
| def __init__(self, log=None, ua_to_catch=None, | |
| render_time_regex=None, | |
| record_time_regex="\[([^\]]*)\]", | |
| tail_records=1000, | |
| last_minutes=5): | |
| """Init HttpdLog | |
| @param log: Log file to read | |
| @param ua_to_catch: Dict with useragent string chunks to | |
| calculate visits from various bots | |
| @param render_time_regex Regex to parse page render time from log record | |
| @param record_time_regex Regex to parse time stamp in the log record | |
| @param tail_records Parse only last N records to gather info | |
| @param last_minutes Gather stats only for last N minutes | |
| """ | |
| # TODO: add some validation | |
| self._log = log | |
| self._ua_to_catch = ua_to_catch | |
| self._render_time_regex = render_time_regex | |
| self._record_time_regex = record_time_regex | |
| self._tail_records = tail_records | |
| self._last_minutes = timedelta(minutes = last_minutes) | |
| def _parseDateTimeWithTZ(self, d): | |
| try: | |
| offset = int(d[-5:]) | |
| except: | |
| print "Error" | |
| delta = timedelta(hours = offset / 100) | |
| return datetime.strptime(d[:-6], "%d/%b/%Y:%H:%M:%S") - delta | |
| def getData(self): | |
| """Returns list of filesystems. | |
| @return: List of filesystems. | |
| """ | |
| try: | |
| out = subprocess.Popen(["env", | |
| tailCmd, | |
| "-%d" % self._tail_records, | |
| self._log], | |
| stdout=subprocess.PIPE).communicate()[0] | |
| except OSError: | |
| raise Exception('Execution of command %s failed.' % tailCmd) | |
| render_time_sum = 0.0 | |
| render_time_count = 0 | |
| useragents = {} | |
| skip_datetime_check = False | |
| current_time = datetime.utcnow() | |
| if isinstance(self._ua_to_catch, dict): | |
| for ua in self._ua_to_catch.keys(): | |
| useragents[ua] = 0 | |
| for line in out.splitlines(): | |
| if not skip_datetime_check: | |
| matches = re.search(self._record_time_regex, line) | |
| if not matches: | |
| continue | |
| record_datetime = self._parseDateTimeWithTZ(matches.group(1)) | |
| if (current_time - record_datetime) <= self._last_minutes: | |
| skip_datetime_check = True | |
| else: | |
| continue | |
| if useragents: | |
| for ua, ua_token in self._ua_to_catch.iteritems(): | |
| if ua_token in line: | |
| useragents[ua] += 1 | |
| if self._render_time_regex: | |
| matches = re.search(self._render_time_regex, line) | |
| if matches: | |
| render_time_count += 1 | |
| render_time_sum += float(matches.group(1)) | |
| data = {} | |
| if useragents: | |
| data["useragents"] = useragents | |
| if render_time_count: | |
| data["page_load"] = render_time_sum / render_time_count | |
| return data |
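For reference, a quick sanity check of the two regexes against a made-up access-log line (combined format with $request_time appended as the last field; the IP, timestamp and timing below are purely illustrative):

import re

sample = ('66.249.66.1 - - [27/Feb/2012:23:36:00 +0200] "GET / HTTP/1.1" '
          '200 612 "-" "Googlebot/2.1 (+http://www.google.com/bot.html)" 0.042')

# default record_time_regex captures the bracketed timestamp
print re.search(r"\[([^\]]*)\]", sample).group(1)   # 27/Feb/2012:23:36:00 +0200
# a possible render_time_regex capturing the trailing request time
print re.search(r"([0-9.]+)$", sample).group(1)     # 0.042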
Usage example:
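A minimal sketch of wiring the class up. The module name, log path, bot tokens and render_time_regex below are assumptions for illustration, not part of the gist; adjust them to your own log_format.

# assuming the gist is saved as httpdlog.py next to this script
from httpdlog import HttpdLog

if __name__ == "__main__":
    httpd_log = HttpdLog(
        log="/var/log/nginx/access.log",
        ua_to_catch={"google": "Googlebot",
                     "yandex": "YandexBot",
                     "bing": "bingbot"},
        # assumes $request_time is logged as the last field of each line
        render_time_regex=r"([0-9.]+)$",
        tail_records=1000,
        last_minutes=5)

    stats = httpd_log.getData()
    print stats.get("useragents")   # per-bot hit counters for the last 5 minutes
    print stats.get("page_load")    # average page render time in seconds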