Skip to content

Instantly share code, notes, and snippets.

@joemiller
Created August 26, 2013 21:54
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joemiller/6347137 to your computer and use it in GitHub Desktop.
Save joemiller/6347137 to your computer and use it in GitHub Desktop.
check the outbound bandwidth on a host and compare against a table of rackspace cloud quotas. alert if near the limit
#!/usr/bin/env ruby
#
# check-rackspace-bandwidth-limits
# ================================
#
# Check the bandwidth usage of a network device on a Rackspace Cloud server
# and alert if it is nearing the threshold allowed for the image size.
#
# Requires sar(1) and sadf(1) to be installed (sysstat package, usually)
#
# Uses the last 60 minutes of sar data to calculate an average usage over that time
# and alerts if that average is within a certain percentage of hitting the rackspace
# bandwidth limits.
#
# examples:
#
# check if eth1 is within 90% of its limits over the last hour:
#
# ./check-rackspace-bandwidth-limits.rb --crit 90 -i eth1
# CheckBandwidth CRITICAL: eth1 average TX rate is 91.16666667 % of max allowed (550 / 600 mbps)
#
require 'rubygems' if RUBY_VERSION < '1.9.0'
require 'sensu-plugin/check/cli'
class CheckBandwidth < Sensu::Plugin::Check::CLI
# Command-line options, declared through the `option` DSL inherited from
# Sensu::Plugin::Check::CLI.
#
# --warn / --crit are percentages of the interface's bandwidth quota. The
# short forms -w / -c are provided so the usage shown in the file header
# (`-c 90 -i eth1`) actually parses; the long forms keep working as before.
option :verbose,
       :short => '-v',
       :boolean => true,
       :default => false,
       :description => 'Print diagnostic details while running'
option :iface,
       :short => '-i IFACE',
       :default => 'eth0',
       :description => 'Network interface to check'
option :warn,
       :short => '-w PERCENT',
       :long => "--warn PERCENT",
       :default => 90,
       :proc => Proc.new { |w| w.to_f },
       :description => 'Warn when average TX rate reaches this percent of the limit'
option :crit,
       :short => '-c PERCENT',
       :long => "--crit PERCENT",
       :default => 95,
       :proc => Proc.new { |w| w.to_f },
       :description => 'Go critical when average TX rate reaches this percent of the limit'
# Rackspace Cloud bandwidth quotas for each instance size, taken from:
# http://www.rackspace.com/knowledge_center/product-faq/cloud-servers
#
# Maps image size label => { interface name => limit in mbits/sec }.
# Frozen (including the per-size hashes) so the lookup table cannot be
# modified by accident at runtime.
RACKSPACE_CLOUD_BW_LIMITS = {
  '512MB'   => { 'eth0' => 20,  'eth1' => 40 }.freeze,
  '1024MB'  => { 'eth0' => 30,  'eth1' => 60 }.freeze,
  '2048MB'  => { 'eth0' => 60,  'eth1' => 120 }.freeze,
  '4096MB'  => { 'eth0' => 100, 'eth1' => 200 }.freeze,
  '8192MB'  => { 'eth0' => 150, 'eth1' => 300 }.freeze,
  '15872MB' => { 'eth0' => 200, 'eth1' => 400 }.freeze,
  '30720MB' => { 'eth0' => 300, 'eth1' => 600 }.freeze,
}.freeze
# Try to figure out the size of this Rackspace cloud server instance.
#
# The exact amount of RAM reported varies between generations of their
# software stack, so each image size is matched against a range of plausible
# MemTotal values rather than an exact number.
#
# mem_mb - total memory in MB; when nil (the default) it is read from
#          /proc/meminfo.
#
# Returns the image size label (a key of RACKSPACE_CLOUD_BW_LIMITS),
# e.g. '4096MB'.
# Raises RuntimeError when the memory size matches no known image size.
def rackspace_image_size(mem_mb = nil)
  mem_mb ||= total_memory_mb
  case mem_mb
  when 400..600 then '512MB'
  when 900..1200 then '1024MB'
  when 1800..2200 then '2048MB'
  # 4 GB instances were previously unmatched even though the quota table
  # has an entry for them.
  when 3700..4200 then '4096MB'
  when 7800..8400 then '8192MB'
  when 14000..16200 then '15872MB'
  when 28000..33000 then '30720MB'
  else raise "Could not determine rackspace image type from mem MB: #{mem_mb}"
  end
end

# Returns MemTotal from /proc/meminfo in whole MB, or 0 when the file or the
# MemTotal line is unavailable (the caller then raises its usual error).
def total_memory_mb
  line = File.readlines('/proc/meminfo').grep(/MemTotal/).first
  # Line looks like "MemTotal:  4046640 kB"; the second field is the value in kB.
  line.to_s.split[1].to_i / 1024
rescue SystemCallError
  0
end
private :total_memory_mb
# Uses sar(1) data (via sadf) to return statistics about the network devices
# of the system from start_time until now.
#
# start_time - string handed to date(1)'s -d option; only the resulting
#              time of day (HH:MM:SS) is passed to sadf.
#
# Only today's sar file is read, so a window that begins before midnight
# yields no samples right after the daily rollover; that case is reported
# as OK with an explanatory message.
#
# Example:
#
# get_netstats('60 minutes ago') =>
#   { 'eth0' => { 'num_samples'        => 6,
#                 'txpkts_sec_average' => 1200,
#                 'txpkts_sec_max'     => 5000,
#                 'rxpkts_sec_average' => 1200,
#                 'rxpkts_sec_max'     => 5000,
#                 'txKB_sec_average'   => 32033.2,
#                 'txKB_sec_max'       => 52123.2,
#                 'rxKB_sec_average'   => 32033.2,
#                 'rxKB_sec_max'       => 52123.2
#               },
#     'eth1' => ...
#   }
#
# Each device hash also carries the running '<metric>_sec_sum' values used to
# compute the averages. Missing keys read as 0.
def get_netstats(start_time='60 minutes ago')
  # sar data files are named saDD with a zero-padded day of month (sa05, not sa5).
  sa_file = "/var/log/sa/sa#{Time.now.strftime('%d')}"
  # date(1) output ends with a newline; strip it before building the next command.
  start = `date +%H:%M:%S -d '#{start_time}'`.strip

  # auto-vivification - http://alisdair.mcdiarmid.org/2012/09/01/auto-vivifying-hash.html
  stats = Hash.new { |h, k| h[k] = Hash.new(0) }

  out = `sadf -d #{sa_file} -- -n DEV -s #{start}`
  if out.strip.empty?
    ok "no data returned by sadf. It's possible that the sar data just rolled over to a new day and needs some time to collect data"
  end

  out.each_line do |line|
    line = line.strip
    next if line.empty? || line =~ /\A#/

    # host;interval;timestamp;dev;rxpkts;txpkts;rxkb;txkb;...
    fields = line.split(';')
    # Shorter records (e.g. LINUX-RESTART markers) carry no counters; skip them
    # instead of recording a bogus device with all-zero samples.
    next if fields.size < 8

    dev_stats = stats[fields[3]]
    dev_stats['num_samples'] += 1

    samples = [
      ['rxpkts', fields[4].to_f],
      ['txpkts', fields[5].to_f],
      ['rxKB',   fields[6].to_f],
      ['txKB',   fields[7].to_f],
    ]
    samples.each do |metric, value|
      max_key = "#{metric}_sec_max"
      sum_key = "#{metric}_sec_sum"

      dev_stats[max_key] = value if value > dev_stats[max_key]
      # cumulative average: running sum divided by samples seen so far
      dev_stats[sum_key] += value
      dev_stats["#{metric}_sec_average"] = dev_stats[sum_key] / dev_stats['num_samples']
    end
  end
  stats
end
# Check entry point: compares the average TX rate of the configured interface
# (from get_netstats) against the bandwidth quota for the detected image size
# and reports ok / warning / critical based on the --warn and --crit
# percentages. Reports unknown when no quota is defined for the interface.
def run
  dev = config[:iface]
  image_type = rackspace_image_size
  puts "Detected Rackspace image size: #{image_type}" if config[:verbose]

  stats = get_netstats
  puts "Checking stats on interface: #{dev}" if config[:verbose]

  max_mbps = RACKSPACE_CLOUD_BW_LIMITS[image_type][dev]
  unless max_mbps
    # Without a quota the percentage below cannot be computed (it used to
    # fail with a TypeError on the division).
    unknown "No bandwidth limit is defined for interface #{dev} on image size #{image_type}"
    return
  end

  # sar reports kB/sec: * 8 gives kbits/sec, / 1000 gives mbits/sec
  tx_rate_mbps = (stats[dev]['txKB_sec_average'] * 8) / 1000
  puts "Interface #{dev} tx rate: #{tx_rate_mbps} mbps" if config[:verbose]

  usage_percent = (tx_rate_mbps / max_mbps) * 100
  puts "Interface #{dev} tx rate is #{usage_percent}%" if config[:verbose]

  msg = "#{dev} average TX rate is #{usage_percent} % of max allowed (#{tx_rate_mbps.to_i} / #{max_mbps} mbps)"
  if usage_percent >= config[:crit]
    critical msg
  elsif usage_percent >= config[:warn]
    warning msg
  else
    ok msg
  end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment