Skip to content

Instantly share code, notes, and snippets.

@daniyalzade
Created November 13, 2012 17:39
Show Gist options
  • Save daniyalzade/4067207 to your computer and use it in GitHub Desktop.
Save daniyalzade/4067207 to your computer and use it in GitHub Desktop.
Nagios Plugin For Checking Uptime in a Cluster (over multiple hosts)
#!/usr/bin/env python
# Script to ensure that the cluster defined by the hostgroup, or list of
# hosts, has enough healthy nodes as defined by the 'check_ping' plugin.
# If it has N or more unhealthy nodes, the plugin returns
# the appropriate error.
#
# To get the list of hosts from a hostgroup, this script parses nagios'
# hosts.cfg file which has the following format:
#
# ...
# define host {
# host_name host01.chartbeat.com
# address 192.168.0.1
# use hostgroupname
# }
# ...
# requirements:
# envoy
# tornado - for option parsing
from collections import defaultdict
import envoy
import logging
import re
from tornado.options import define, options, parse_command_line
# Status codes returned by this plugin (used as the process exit code).
OK, WARNING, CRITICAL, UNKNOWN = range(4)

# Matchers for the hosts.cfg directives; group(1) captures the first
# whitespace-delimited token that follows the directive name.
HOST_PATTERN = re.compile(r'host_name\s*(\S+)\s*')
USE_PATTERN = re.compile(r'use\s*(\S+)\s*')
def _check_host(check_ping, host, ping_warning, ping_critical):
    """
    Run the check_ping binary against a single host and report its result.

    @param check_ping: str, path to check_ping bin
    @param host: str, host handed to check_ping
    @param ping_warning: str, value passed to check_ping's -w flag
    @param ping_critical: str, value passed to check_ping's -c flag
    @return: (status_code, output) of the command; (CRITICAL, error
        message) when the command could not be run at all
    """
    command = "%s %s -w %s -c %s" % (check_ping, host, ping_warning, ping_critical)
    logging.info("running command '%s'", command)
    try:
        outcome = envoy.run(command)
        logging.info("status_code: '%s', std_out: '%s'",
                     outcome.status_code, outcome.std_out)
        return (outcome.status_code, outcome.std_out)
    except Exception:
        # Any failure to execute the plugin is reported as a failed host
        # rather than aborting the whole cluster check.
        failure = "could not run command '%s'" % command
        logging.warning(failure, exc_info=True)
        return (CRITICAL, failure)
def _parse_hosts_file(content):
    """
    Parse the Nagios hosts.cfg file and return a hostgroup -> hosts lookup
    dict.

    A line only counts as a directive when its first whitespace-delimited
    token is exactly 'host_name' or 'use'; the second token is the value.
    This keeps values that merely contain those words (a host called
    'user01', an address like 'house.example.com') and comment lines from
    being mistaken for directives.

    A host is recorded once both its 'host_name' and 'use' have been seen,
    in either order, inside the same definition.

    @param content: str, contents of the hosts.cfg file
    @return: defaultdict(list) mapping str(hostgroup) -> list(str(host))
    """
    hostgroups = defaultdict(list)
    hostname = None
    use = None
    for raw_line in content.splitlines():
        tokens = raw_line.split()
        if not tokens:
            continue
        directive = tokens[0]
        if directive == 'define' or directive.startswith('}'):
            # Definition boundary: drop any half-collected pair so that a
            # definition lacking host_name (e.g. a template) or lacking
            # use cannot leak its value into the next definition.
            hostname = None
            use = None
            continue
        if len(tokens) < 2:
            continue
        if directive == 'host_name':
            hostname = tokens[1]
        elif directive == 'use':
            use = tokens[1]
        else:
            continue
        if hostname and use:
            hostgroups[use].append(hostname)
            hostname = None
            use = None
    return hostgroups
def _main_helper(hosts_cfg, warning, critical, ping_warning, ping_critical,
                 check_ping=None,
                 hosts=None,
                 hostgroup=None,
                 ):
    """
    Ping every host of the cluster and turn the number of failing hosts
    into a single status, printing a one-line summary.

    The hosts to check are `hosts` when given; otherwise they are looked
    up in `hosts_cfg` for each hostgroup named in `hostgroup`.

    @param hosts_cfg: path to the hosts cfg file (only read when `hosts`
        is empty)
    @param warning: int, number of failing hosts at which WARNING is
        returned
    @param critical: int, number of failing hosts at which CRITICAL is
        returned
    @param ping_warning: str, forwarded to _check_host
    @param ping_critical: str, forwarded to _check_host
    @param check_ping: str, path to the check_ping binary
    @param hosts: list of hosts to check directly
    @param hostgroup: str, one hostgroup name or a CSV of several
    @return: int, one of OK / WARNING / CRITICAL
    """
    if not hosts:
        # 'with' closes the cfg file even if parsing raises.
        with open(hosts_cfg) as cfg_file:
            hostgroups = _parse_hosts_file(cfg_file.read())
        # A distinct loop name keeps the `hostgroup` argument intact, so
        # the messages below report everything that was requested and not
        # just the last CSV entry.
        group_names = [name.strip() for name in (hostgroup or '').split(',')]
        hosts = []
        for group_name in group_names:
            # .get() avoids inserting unknown names into the defaultdict,
            # which would otherwise list them under "Known hostgroups".
            hosts.extend(hostgroups.get(group_name, []))
        if not hosts:
            msg = ("WARNING - no hostgroup '%s'. Known hostgroups: %s" %
                   (hostgroup, sorted(hostgroups)))
            logging.info(msg)
            print(msg)
            return WARNING

    logging.info("running checks on hosts %s, warning %s, critical %s" %
                 (hosts, warning, critical))

    # Collect (host, status_code, output) for every host whose check did
    # not return 0.
    errors = []
    for host in hosts:
        status_code, std_out = _check_host(
            check_ping,
            host,
            ping_warning,
            ping_critical,
        )
        if status_code:
            errors.append((host, status_code, std_out))
    err_msg = '|| '.join([str(e) for e in errors])
    num_errors = len(errors)

    # The critical threshold is tested first so it wins when both apply.
    if num_errors >= critical:
        msg = ("ERROR - hostgroup '%s' [hosts '%s'] failing. Errors: %s, Num Errors %s/%s" %
               (hostgroup, hosts, err_msg, num_errors, critical))
        logging.info(msg)
        print(msg)
        return CRITICAL
    if num_errors >= warning:
        msg = ("WARNING - hostgroup '%s' [hosts '%s'] failing. Errors: %s, Num Errors %s/%s" %
               (hostgroup, hosts, err_msg, num_errors, warning))
        logging.info(msg)
        print(msg)
        return WARNING
    msg = ("OK - hostgroup '%s' [hosts '%s']. Errors: %s, Num Errors %s" %
           (hostgroup, hosts, err_msg, num_errors))
    logging.info(msg)
    print(msg)
    return OK
def main():
    """
    Declare the command line options, parse them and run the cluster check.

    @return: int status code; CRITICAL when neither the hostgroup nor the
        hosts option was supplied, otherwise the result of _main_helper
    """
    # (option name, keyword arguments for define), registered in this order.
    option_specs = (
        ('hosts_cfg', dict(
            help='absolute path to hosts cfg file',
            default='/etc/nagios3/conf.d/servers/hosts.cfg',
        )),
        ('check_ping', dict(
            help='absolute path to check_ping binary',
            default='/usr/lib/nagios/plugins/check_ping',
        )),
        ('warning', dict(
            help='threshold for number of hosts to be down before warning',
            default=1,
            type=int,
        )),
        ('critical', dict(
            help='threshold for number of hosts to be down before critical',
            default=2,
            type=int,
        )),
        ('ping_warning', dict(
            default='100.0,20%',
        )),
        ('ping_critical', dict(
            default='500.0,60%',
        )),
        ('hosts', dict(
            multiple=True,
            help='List of hosts. Mainly to be used for debugging',
        )),
        ('hostgroup', dict(
            help='hostgroup to check for. If multiple hostgroups are to be added, make it a CSV',
        )),
    )
    for option_name, option_kwargs in option_specs:
        define(option_name, **option_kwargs)
    parse_command_line()

    # At least one source of hosts is required.
    if not (options.hostgroup or options.hosts):
        msg = 'ERROR - either hostgroup or hosts should be passed'
        logging.info(msg)
        print(msg)
        return CRITICAL

    return _main_helper(
        options.hosts_cfg,
        options.warning,
        options.critical,
        options.ping_warning,
        options.ping_critical,
        check_ping=options.check_ping,
        hosts=options.hosts,
        hostgroup=options.hostgroup,
    )
if __name__ == '__main__':
    # sys.exit is used rather than the exit() helper, which is injected by
    # the site module and is not guaranteed to exist in every interpreter
    # configuration.
    import sys
    try:
        exit_code = main()
    except (Exception, KeyboardInterrupt):
        # KeyboardInterrupt does not derive from Exception, so it is named
        # explicitly: an interrupted run must report UNKNOWN as well
        # instead of ending with a traceback and an unrelated exit status.
        # SystemExit is deliberately left alone so that exits requested
        # inside main() propagate unchanged.
        msg = 'interrupted/failed'
        logging.exception(msg)
        print(msg)
        exit_code = UNKNOWN
    sys.exit(exit_code)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment