Last active
April 20, 2016 16:40
-
-
Save socalal/2fb27ff099bf338565a6ae4d2109fa73 to your computer and use it in GitHub Desktop.
Check 3 minute load average and email alert if threshold exceeded.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# Created by Alan M. Orther - alan.orther@gmail.com
# This script monitors a server's load and emails an alert if the
# threshold is met or exceeded over a 3 minute average.
import os | |
import sys | |
import getopt | |
import multiprocessing | |
from time import sleep | |
# The CPU core count doubles as the default load threshold when the user
# does not supply one.
cores = multiprocessing.cpu_count()

# Script defaults, kept separately so they can be reported to the user.
monitor_time_default = 15
load_threshold_default = cores

# Working values; the command-line options overwrite these later on.
monitor_time = monitor_time_default
load_threshold = load_threshold_default
emailaddress = ""

# Counter for the monitoring loop's elapsed sampling intervals.
script_time = 0
# The general error message and then exit script. | |
def error_message():
    """Print the usage summary and end the script with exit status 1.

    The summary names the script as it was invoked (sys.argv[0]) and reports
    the default monitoring time taken from the module-level
    monitor_time_default.

    Raises:
        SystemExit: always, with code 1, because this is only called after
            invalid input.
    """
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("\nUsage: %s -e <email address to send alert to> -t <# in minutes> "
          "-l <load alert threshold - numeric value> \n" % sys.argv[0])
    print("AN EMAIL ADDRESS IS THE ONLY REQUIRED INPUT. Default "
          "monitoring time is %s minutes and default load threshold is the "
          "number of CPU cores on the system.\n" % monitor_time_default)
    # sys.exit instead of quit(): quit() is an interactive helper installed
    # by the site module, and an error path should not report success.
    sys.exit(1)
# The help section. | |
def help_me():
    """Print the help section and end the script with exit status 0.

    Raises:
        SystemExit: always, with code 0.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("""\n
HELP SECTION
Version 0.003
USAGE: check_load.py -e test@email -t 30 -l 2.5
<EMAIL IS THE ONLY REQUIRED ARGUMENT>
This script is designed to check the load average every 15 seconds until
the time designated by the user. The only required argument is the email
address to send the alert if the load is equal to or higher than the
script's default load threshold (system's number of cores) or the user's
defined threshold.
-e Email address to receive alert. ONLY REQUIRED ARGUMENT
-t Time to run script in minutes. Whole number (Default is 15 minutes)
-l Load threshold average after 3 minutes to send alert.
Load can be a floating point number. (Default is # of cores)
""")
    # sys.exit instead of quit(): quit() is an interactive helper installed
    # by the site module and is not guaranteed to exist in every run mode.
    sys.exit(0)
# Flags that open the help section when given as the first argument.
HELP_FLAGS = ('-h', '-help', '--help', '-halp', '-howdoesthiswork')


def parse_arguments(argv, default_minutes, default_threshold):
    """Turn the command-line arguments into validated monitoring settings.

    Args:
        argv: the arguments after the script name (sys.argv[1:]).
        default_minutes: monitoring time used when -t is not given.
        default_threshold: load threshold used when -l is not given.

    Returns:
        A (minutes, threshold, email_address) tuple.

    On any invalid input this calls help_me() or error_message(), both of
    which print guidance and end the script instead of returning.
    """
    # No arguments at all, or a help flag first: show the help section.
    if not argv or argv[0] in HELP_FLAGS:
        help_me()

    # Sets the usable arguments and prints errors if used incorrectly.
    try:
        options, _ = getopt.getopt(argv, "t:l:e:")
    except getopt.GetoptError as error:
        print("\n -= " + str(error) + " =-")
        error_message()

    minutes = default_minutes
    threshold = default_threshold
    address = ""

    # Only the int()/float() conversions can fail here, so catch ValueError
    # alone; a bare except would also swallow KeyboardInterrupt/SystemExit.
    try:
        for flag, value in options:
            if flag == '-t':
                minutes = int(value)
            elif flag == '-l':
                threshold = float(value)
            elif flag == '-e':
                address = value
    except ValueError:
        print("\n -= ERROR with your numeric values =-")
        print(" -t must be a whole number and -l can be a floating point "
              "number")
        help_me()

    # Confirm that the email argument is not blank.
    if address == "":
        error_message()

    # Confirm that the email argument has some email elements.
    if "@" not in address or "." not in address:
        print("\n NOT A VALID EMAIL ADDRESS")
        error_message()

    # At least 3 minutes are needed to collect one full 3 minute average.
    if minutes < 3 or threshold <= 0:
        print("\nLoad threshold must not be negative or zero and monitoring "
              "time must not be a value below 3 minutes.")
        error_message()

    return minutes, threshold, address


if __name__ == "__main__":
    # The module-level values still hold the script defaults at this point.
    monitor_time, load_threshold, emailaddress = parse_arguments(
        sys.argv[1:], monitor_time, load_threshold)

    # Show the user what options will be used.
    print("\nMonitoring time: %s minutes (%s minutes is the script default)"
          % (monitor_time, monitor_time_default))
    print("Load threshold: %s (%s is the script default because there are "
          "%s cores on this system)"
          % (load_threshold, load_threshold_default, load_threshold_default))
    print("Alert email address: %s\n" % emailaddress)
# Sampling cadence: one reading every 15 seconds, so 12 readings span the
# 3 minute averaging window.
SAMPLE_INTERVAL_SECONDS = 15
SAMPLES_PER_WINDOW = 12


def one_minute_load():
    """Return the system's 1 minute load average as a float."""
    return float(os.getloadavg()[0])


def recent_average(samples, window=SAMPLES_PER_WINDOW):
    """Return the mean of the last `window` samples.

    Returns None while fewer than `window` samples have been collected.
    """
    if len(samples) < window:
        return None
    return sum(samples[-window:]) / float(window)


def send_alert(message, recipient):
    """Mail `message` to `recipient` through /bin/mail, subject "LOAD ALERT".

    The command is started with an argument list and the message is written
    to its standard input, so no shell ever parses the message or address.

    Returns:
        True when the mail command exits with status 0, otherwise False.
    """
    # Local import: the file-level import list does not include subprocess.
    import subprocess

    try:
        mailer = subprocess.Popen(
            ["/bin/mail", "-s", "LOAD ALERT", recipient],
            stdin=subprocess.PIPE)
        # Trailing newline matches what `echo` used to append.
        mailer.communicate((message + "\n").encode("utf-8"))
    except OSError as error:
        print("Could not run /bin/mail: %s" % error)
        return False
    return mailer.returncode == 0


def monitor_load(minutes, threshold, read_load=one_minute_load, pause=sleep):
    """Sample the load every 15 seconds for `minutes` minutes.

    Before each new sample the average of the last 12 samples (3 minutes) is
    compared with `threshold`.

    Args:
        minutes: how long to keep monitoring, in minutes.
        threshold: 3 minute average at or above which monitoring stops.
        read_load: callable returning the current load reading.
        pause: callable taking the number of seconds to wait between samples.

    Returns:
        The first 3 minute average that reached `threshold`, or None when the
        monitoring time ran out without reaching it.
    """
    load_list = []
    total_ticks = minutes * 60 // SAMPLE_INTERVAL_SECONDS

    for tick in range(total_ticks + 1):
        # Progress output: the samples collected so far.
        print(load_list)
        average = recent_average(load_list)
        if average is None:
            # Not enough measurements yet; show how many there are.
            print(len(load_list))
        else:
            print(average)
            print(threshold)
            if average >= threshold:
                return average

        # A sample taken after the last check would never be evaluated, so
        # stop here instead of sampling and sleeping once more.
        if tick == total_ticks:
            break

        load_list.append(read_load())
        # Wait a full interval; the window maths assumes 15 second spacing.
        pause(SAMPLE_INTERVAL_SECONDS)

    return None


if __name__ == "__main__":
    breach = monitor_load(monitor_time, load_threshold)

    if breach is None:
        # Output if the load doesn't exceed the load threshold.
        print("The load looks good. The 3 minute average was less than the "
              "defined threshold that was set to %s.\n" % load_threshold)
        sys.exit(0)

    email_message = "Your server has had a load count of %4.2f " \
                    "for an average of 3 minutes! Your threshold is %s" \
                    % (breach, load_threshold)
    if send_alert(email_message, emailaddress):
        print("LOAD ALARM SENT")
        sys.exit(0)

    # Do not claim success when the mail command failed.
    print("LOAD ALARM COULD NOT BE SENT")
    sys.exit(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment