Skip to content

Instantly share code, notes, and snippets.

@socalal
Last active April 20, 2016 16:40
Show Gist options
  • Save socalal/2fb27ff099bf338565a6ae4d2109fa73 to your computer and use it in GitHub Desktop.
Save socalal/2fb27ff099bf338565a6ae4d2109fa73 to your computer and use it in GitHub Desktop.
Check 3 minute load average and email alert if threshold exceeded.
#!/usr/bin/python
# Created by Alan M. Orther - alan.orther@gmail.com
# This script is used to monitor a server's load and email an alert if the
# threshold is met or exceeded over a 3 minute average.
import getopt
import multiprocessing
import os
import subprocess
import sys
from time import sleep
# Script defaults: monitor for 15 minutes and alert at one unit of load per
# CPU core.
cores = multiprocessing.cpu_count()
monitor_time_default = 15
load_threshold_default = cores
# Working values; the command-line options parsed further down may replace
# them. The email address has no default and must be supplied by the user.
monitor_time = monitor_time_default
load_threshold = load_threshold_default
emailaddress = ""
# Counter advanced by the monitoring loop on every pass.
script_time = 0
# Print the general usage/error message and then exit the script.
def error_message():
    """Print the usage summary and terminate the script.

    Reads the module-level ``monitor_time_default`` to report the default
    monitoring time and ``sys.argv[0]`` for the script name.

    Raises:
        SystemExit: always, with status 2, so callers and wrappers can tell
            a usage error apart from a successful run.
    """
    print("\nUsage: %s -e <email address to send alert to> -t <# in minutes> "
          "-l <load alert threshold - numeric value> \n" % sys.argv[0])
    print("AN EMAIL ADDRESS IS THE ONLY REQUIRED INPUT. Default "
          "monitoring time is %s minutes and default load threshold is the "
          "number of CPU cores on the system.\n" % monitor_time_default)
    # sys.exit is used instead of quit(): quit() is injected by the site
    # module for interactive use and is not guaranteed to exist.
    sys.exit(2)
# The help section.
def help_me():
    """Print the full help text and terminate the script.

    Raises:
        SystemExit: always, with status 0.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("""\n
HELP SECTION
Version 0.003
USAGE: check_load.py -e test@email -t 30 -l 2.5
<EMAIL IS THE ONLY REQUIRED ARGUMENT>
This script is designed to check the load average every 15 seconds until
the time designated by the user. The only required argument is the email
address to send the alert if the load is equal to or higher than the
script's default load threshold (system's number of cores) or the user's
defined threshold.
-e Email address to receive alert. ONLY REQUIRED ARGUMENT
-t Time to run script in minutes. Whole number (Default is 15 minutes)
-l Load threshold average after 3 minutes to send alert.
Load can be a floating point number. (Default is # of cores)
""")
    # sys.exit is used instead of quit(): quit() is injected by the site
    # module for interactive use and is not guaranteed to exist.
    sys.exit(0)
# Show the help section when the script is run without arguments or when the
# first argument is one of the recognised help flags. The length test comes
# first so sys.argv[1] is only read when it exists.
if len(sys.argv) <= 1 or sys.argv[1] in ('-h', '-help', '--help', '-halp',
                                         '-howdoesthiswork'):
    help_me()
# Parse the command line. -t, -l and -e each take a value; an unknown option
# or a missing value makes getopt raise, in which case the getopt message is
# shown followed by the usage summary (error_message exits the script).
try:
    opt, args = getopt.getopt(sys.argv[1:], "t:l:e:")
except getopt.GetoptError as error:
    print("\n -= " + str(error) + " =-")
    error_message()
# Apply the parsed options to the working values. -t must be a whole number
# of minutes and -l may be a floating point load value.
try:
    for o, a in opt:
        if o == '-t':
            monitor_time = int(a)
        elif o == '-l':
            load_threshold = float(a)
        elif o == '-e':
            emailaddress = a
except ValueError:
    # int() and float() raise ValueError for non-numeric text. Only that is
    # caught here so Ctrl-C and SystemExit are not mistaken for bad input.
    print("\n -= ERROR with your numeric values =-")
    print(" -t must be a whole number and -l can be a floating point number")
    help_me()
# Confirm that the email argument is not blank.
if not emailaddress:
    error_message()
# Confirm that the email argument has some email elements. This is only a
# plausibility check, not full address validation.
if "@" not in emailaddress or "." not in emailaddress:
    print("\n NOT A VALID EMAIL ADDRESS")
    error_message()
# Confirm that the load threshold and monitoring time are usable. The
# threshold test is written as "not > 0" rather than "<= 0" so that a NaN
# threshold (e.g. "-l nan") is rejected too; NaN compares false against
# everything and would otherwise silently disable the alert.
if monitor_time < 3 or not load_threshold > 0:
    print("\nLoad threshold must not be negative or zero and monitoring time "
          "must not be a value below 3 minutes.")
    error_message()
# Show the user which settings are in effect, alongside the script defaults.
print("\nMonitoring time: %s minutes (%s minutes is the script default)"
      % (monitor_time, monitor_time_default))
print("Load threshold: %s (%s is the script default because there are %s "
      "cores on this system)"
      % (load_threshold, load_threshold_default, cores))
print("Alert email address: %s\n" % emailaddress)
# Sampling cadence: one reading every 15 seconds, judged over a 3 minute
# window (12 readings).
sample_interval_seconds = 15
window_samples = (3 * 60) // sample_interval_seconds
# Every 1 minute load average sampled so far, oldest first.
load_list = []
# Number of samples needed to cover the requested monitoring time.
monitor_time_modified = monitor_time * (60 // sample_interval_seconds)
# Take one sample per pass until the monitoring time is used up.
while script_time < monitor_time_modified:
    # Sample before evaluating so the newest reading is always judged.
    load_list.append(float(os.getloadavg()[0]))
    script_time += 1
    # Nothing is evaluated until a full 3 minute window has been collected.
    if len(load_list) >= window_samples:
        last_values = load_list[-window_samples:]
        average = sum(last_values) / len(last_values)
        # If the 3 minute average reaches the threshold, send the email
        # alert and stop monitoring.
        if average >= load_threshold:
            email_message = ("Your server has had a load count of %4.2f "
                             "for an average of 3 minutes! Your threshold "
                             "is %s" % (average, load_threshold))
            # The address comes straight from the command line, so the mail
            # command is run from an argument list (no shell) and the
            # message is fed on stdin instead of being interpolated into a
            # shell string.
            try:
                mailer = subprocess.Popen(
                    ["/bin/mail", "-s", "LOAD ALERT", emailaddress],
                    stdin=subprocess.PIPE)
                mailer.communicate((email_message + "\n").encode("utf-8"))
            except OSError as error:
                print("LOAD ALARM NOT SENT: could not run /bin/mail (%s)"
                      % error)
                sys.exit(1)
            if mailer.returncode != 0:
                print("LOAD ALARM NOT SENT: /bin/mail exited with status %s"
                      % mailer.returncode)
                sys.exit(1)
            print("LOAD ALARM SENT")
            sys.exit(0)
    # Wait for the next sampling slot; no wait is needed after the last one.
    if script_time < monitor_time_modified:
        sleep(sample_interval_seconds)
# Output if the load never reached the load threshold.
print("The load looks good. The 3 minute average was less than the defined "
      "threshold that was set to %s.\n" % load_threshold)
sys.exit(0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment