Skip to content

Instantly share code, notes, and snippets.

@arrdem
Created June 1, 2019 04:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save arrdem/27aa6eadefb6d0538c189755546c5ccc to your computer and use it in GitHub Desktop.
Save arrdem/27aa6eadefb6d0538c189755546c5ccc to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""Evil monitoring.
Ping hosts, syslogging at INFO while they're up and happy; otherwise use Telnet scripting against
the PDU to force-reset them, syslogging at ALERT with what the uptime was prior to the forced reboot.
Hosts are debounced, so that they have a chance to return before monitoring resumes.
No effort is made to detect network conditions or poweroffs.
"""
from os import kill, getpid
import multiprocessing
import subprocess
import syslog
from datetime import datetime, timedelta
from time import sleep
import signal
from sys import exit
from telnetlib import Telnet
# Static settings for the monitor: PDU login, timing knobs and the host -> outlet map.
CONFIG = {
    # Login used for the APC PDU's Telnet interface.
    "pdu_username": "REDACTED",
    "pdu_password": "REDACTED",

    # Seconds to leave a host alone at the start of every monitoring round.
    # Hosts take about 40s to recover, although they only stop answering
    # pings for about 6-8s.
    "debounce": 40,

    # "Bad" score at which a host is considered down; once a host is up,
    # 5s of no ping is indicative.
    "threshold": 5,

    # hostname -> PDU port
    "hosts": {
        "logos": "2",
        "ethos": "3",
        "pathos": "4",
        # "ketos": "5",
    },
}
def log(level, msg):
    """Report a message on stdout and through syslog.

    The stdout line is prefixed with the current process id and the numeric
    level so that output from the different monitor processes can be told
    apart; syslog receives the bare message at the given priority.

    Args:
        level: syslog priority, e.g. ``syslog.LOG_INFO``.
        msg: Text to report.
    """
    print("{} @ {}] {}".format(getpid(), level, msg))
    syslog.syslog(level, msg)
def zdec(i: int) -> int:
    """Decrement, stopping at 0.

    Returns ``i - 1`` when ``i`` is greater than 1 and ``0`` otherwise, so the
    result is never negative.
    """
    return 0 if i <= 1 else i - 1
def ping(hostname: str,
         count: int = 2,
         timeout: int = 1) -> bool:
    """Ping a host and report whether the ``ping`` command succeeded.

    Runs the system ``ping`` quietly (``-q``) with ``count`` passed as ``-c``
    and ``timeout`` passed as ``-W``; all of its output is discarded.

    Args:
        hostname: Host to ping.
        count: Number of packets to send.
        timeout: Value handed to ping's ``-W`` option.

    Returns:
        True if ``ping`` exited with status 0, False for any other status.
    """
    command = ["ping", "-q", "-c", str(count), "-W", str(timeout), hostname]
    # A non-zero exit is an expected outcome here, so read the status directly
    # rather than having check_call raise and catching the exception.
    status = subprocess.call(command,
                             stderr=subprocess.DEVNULL,
                             stdout=subprocess.DEVNULL)
    return status == 0
# Serializes PDU logins between monitor processes. This relies on the children
# inheriting this very object from the parent (as happens when they are forked).
__reboot_lock__ = multiprocessing.Lock()


def do_reboot(port: str):
    """Get a shared lock, telnet to sucker, reset the port and log out.

    Opens a Telnet session to the PDU host ``sucker`` on port 23, logs in with
    the credentials from ``CONFIG``, sends ``reboot <port>`` followed by
    ``quit`` and closes the connection. The connection is closed even when the
    login or a command fails part-way through.

    Prompt reads have no timeout, so an unresponsive PDU blocks this call (and
    every other process waiting on the lock) indefinitely.

    NOTE: ``telnetlib`` is deprecated since Python 3.11 and was removed in 3.13.

    Args:
        port: PDU port to reset; appended verbatim to the ``reboot`` command.
    """
    def _line(text):
        # The PDU console expects carriage-return terminated input.
        return (text + "\r").encode("utf-8")

    def apc_login(conn):
        conn.read_until(b"User Name")
        conn.write(_line(CONFIG['pdu_username']))
        conn.read_until(b"Password")
        conn.write(_line(CONFIG['pdu_password']))

    def apc_command(conn, cmd):
        # Wait for the shell prompt before sending the next command.
        conn.read_until(b"APC>")
        conn.write(_line(cmd))

    # To ensure only one process logs into the PDU at once
    with __reboot_lock__:
        conn = Telnet('sucker', 23)
        try:
            apc_login(conn)
            apc_command(conn, "reboot " + port)
            apc_command(conn, "quit")
        finally:
            # Never leave a half-finished session open on the PDU.
            conn.close()
def monitor(hostname: str, port: str):
    """Monitor one host forever, power-cycling it when it stops answering pings.

    Every monitoring round begins with ``CONFIG["debounce"]`` seconds during
    which the host is left alone. After that the host is pinged once per loop
    iteration, with a one second sleep between iterations: a failed ping
    raises a "bad" score by one and a successful ping lowers it by one, never
    below zero. When the score reaches ``CONFIG["threshold"]`` the host's PDU
    port is reset through ``do_reboot`` and a fresh round starts.

    While the host answers, a "healthy" message is logged at INFO roughly
    every five minutes of the current round.

    A SIGINT handler is installed that prints a shutdown message and exits the
    process with status 0. The function never returns normally.

    Args:
        hostname: Host to ping.
        port: PDU port handed to ``do_reboot`` when the host must be reset.
    """
    # Function-scope import: a monotonic clock keeps the elapsed-time maths
    # immune to wall-clock adjustments.
    from time import monotonic

    # Set a signal handler for shutdown
    def _sigint(_signum, _frame):
        print("monitor for {hostname} shutting down...".format(hostname=hostname))
        exit(0)

    signal.signal(signal.SIGINT, _sigint)

    # Do the work
    log(syslog.LOG_INFO, "Monitoring {hostname}".format(hostname=hostname))

    threshold = CONFIG["threshold"]
    debounce = CONFIG["debounce"]  # seconds
    heartbeat_every = 60 * 5  # seconds between "healthy" log lines

    # Outer loop - never exits just restores state
    while True:
        round_start = monotonic()
        counter = 0
        next_heartbeat = heartbeat_every

        # Inner loop - a single monitoring round terminated in a restart
        while True:
            elapsed = monotonic() - round_start

            if elapsed < debounce:
                # Still giving the host time to come (back) up.
                pass

            elif counter >= threshold:
                # Bounce the box, wait for it to become healthy again
                uptime = elapsed - counter
                log(syslog.LOG_ALERT,
                    "{hostname} detected unhealthy for {counter}s after {uptime}s up, forcing reboot!".format(
                        hostname=hostname, counter=counter, uptime=uptime))
                do_reboot(port)
                # Break into the outer loop, resetting state
                break

            elif not ping(hostname):
                # If the hostname is unhealthy, we increment its "bad" score
                log(syslog.LOG_WARNING,
                    "{hostname} detected unhealthy ({counter} of {threshold})".format(
                        hostname=hostname, counter=counter, threshold=threshold))
                counter += 1

            else:
                # Otherwise we zdec the score.
                counter = zdec(counter)

                # elapsed >= debounce implied by if ordering. Compare against a
                # deadline instead of testing for an exact multiple of the
                # interval: iterations are more than a second apart, so an
                # exact-second test is regularly stepped over.
                if elapsed >= next_heartbeat:
                    log(syslog.LOG_INFO, "{} healthy for {}s".format(hostname, elapsed))
                    next_heartbeat = (elapsed // heartbeat_every + 1) * heartbeat_every

            sleep(1)
if __name__ == "__main__":
    # Monitor processes that have actually been started.
    processes = []

    def stop_processes(_signum: int, _frame):
        """SIGINT handler: forward the interrupt to every monitor, then wait for them."""
        for p in processes:
            try:
                # Skip children that are already gone so that a stale pid is
                # never signalled.
                if p.is_alive():
                    kill(p.pid, signal.SIGINT)
            except ProcessLookupError:
                # The child exited between the liveness check and the kill.
                pass
        for p in processes:
            p.join()

    signal.signal(signal.SIGINT, stop_processes)

    for hostname, port in CONFIG["hosts"].items():
        p = multiprocessing.Process(target=monitor, args=(hostname, port))
        p.start()
        # Register only after start(): an unstarted Process has no pid to
        # signal and cannot be joined.
        processes.append(p)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment