Skip to content

Instantly share code, notes, and snippets.

@zanloy
Created November 3, 2014 13:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zanloy/f48f0f7b463cd6f9fd82 to your computer and use it in GitHub Desktop.
Save zanloy/f48f0f7b463cd6f9fd82 to your computer and use it in GitHub Desktop.
Systems Engineering MONitoring Script (SEMON)
<%# SEMON status report e-mail body. %>
<%# Reads these locals from the rendering binding: summary, count, big_disks, high_loads, proxy_statuses, errors. %>
<html>
<head>
<style>
ul {
margin: 0px;
list-style-type: none;
}
li {
margin: 0px;
}
</style>
</head>
<body>
<h3>Health status summary:</h3>
<% if summary %>
<p><%= summary %></p>
<% end %>
<h3>Systems with &gt;90% disk(s):</h3>
<% if count[:big_disks] > 0 %>
<% big_disks.each do |fqdn,disks| %>
<h4><%= fqdn %></h4>
<ul>
<% disks.each do |disk| %>
<li><%= disk %></li>
<% end %>
</ul>
<% end %>
<% else %>
<p>There are no systems with high disk utilization.</p>
<% end %>
<h3>Systems with &gt;4.0 load avg for past 1 minute:</h3>
<% if count[:high_loads] > 0 %>
<% high_loads.each do |fqdn,loadavg| %>
<p><%= fqdn %>: <%= loadavg %></p>
<% end %>
<% else %>
<p>There are no systems with high load averages.</p>
<% end %>
<%# An empty Hash is truthy in Ruby, so test for content rather than presence. %>
<% if proxy_statuses && !proxy_statuses.empty? %>
<h3>Proxy Statuses:</h3>
<% proxy_statuses.each do |fqdn,result| %>
<p>
<%= fqdn %>: <%= result[:status] %>.
<% if result[:timesince_str] %>
<%# timesince_str is "" when hours, minutes and seconds are all zero. %>
The last request was processed <%= result[:timesince_str].empty? ? 'just now' : "#{result[:timesince_str]}ago" %>.
<% end %>
</p>
<% end %>
<% end %>
<% if errors && !errors.empty? %>
<h3>Errors:</h3>
<% errors.each do |fqdn,error| %>
<%# Error values may be exception objects with arbitrary text, so escape them. %>
<p><%= fqdn %>: <%= ERB::Util.html_escape(error) %></p>
<% end %>
<% end %>
</body>
</html>
#!/usr/bin/env ruby
#stdlib
require 'erb'
require 'optparse'
require 'pp'
#ruby gems
require 'rubygems'
require 'net/ssh'
require 'mail'
# Collects disk, load and (for proxy hosts) HTTP response metrics from
# +server+ over SSH.
#
# server  - host name to connect to; its second dot-separated label is
#           stored as the environment (for example "prd").
# timeout - value passed to Net::SSH.start as the :timeout option (default 10).
#
# Returns a Hash. :disks, :big_disks and :environment are always present;
# :fqdn, :hostname, :load, :highload, :excluded and :pxy are added as the
# corresponding checks run. When a check fails, :error holds a message or the
# rescued exception; StandardError subclasses are captured there, not raised.
def gather_stats(server, timeout = 10)
  rtn = {}
  rtn[:disks] = []
  rtn[:big_disks] = []
  rtn[:environment] = server.split('.')[1]
  host_key_retried = false
  begin
    Net::SSH.start(server, 'zloy', {:timeout => timeout}) do |ssh|
      # Non-bang chomp/strip: the bang variants return nil when nothing changes.
      rtn[:fqdn] = ssh.exec!("hostname -f").to_s.chomp
      rtn[:hostname] = ssh.exec!("hostname").to_s.chomp
      #Gather metric #1... Disk Utilization!
      gather_disk_stats(ssh, rtn)
      #Gather metric #2... Load Averages!
      rtn[:load] = ssh.exec!("cat /proc/loadavg | cut -d' ' -f 1-3").to_s.chomp
      rtn[:highload] = true if rtn[:load].split[0].to_f > 4.0
      #Gather metric #3... Proxy Responses!
      if /(ext|svc|int)pxy/ =~ rtn[:hostname]
        # Assigned before it is filled so a partial result survives an error.
        rtn[:pxy] = {}
        gather_proxy_stats(ssh, rtn[:pxy])
      end
    end #Net::SSH.start
  rescue Net::SSH::AuthenticationFailed
    rtn[:error] = "SSH authentication failed"
  rescue Net::SSH::HostKeyMismatch => e
    # NOTE(review): this silently trusts a changed host key; confirm that is
    # acceptable. The retry is limited to one attempt so a persistent mismatch
    # cannot loop forever.
    if host_key_retried
      rtn[:error] = "SSH host key mismatch"
    else
      host_key_retried = true
      e.remember_host!
      retry
    end
  rescue Timeout::Error
    rtn[:error] = "SSH connection timed out"
  rescue StandardError => e
    rtn[:error] = e
    #raise
  end
  rtn
end

# Records every `df` line in rtn[:disks] and describes file systems that are
# more than 90% used in rtn[:big_disks].
def gather_disk_stats(ssh, rtn)
  # Mount points left out of the report on "prd" hosts.
  excluded = ['/data','/data1','/data2','/data3','/data4','/data5','/data6','/data7','/data8']
  ssh.exec!('df -hP | sed "1 d"').to_s.strip.each_line do |line|
    disk = line.chomp
    rtn[:disks] << disk
    disktab = disk.split
    next if disktab.length < 6 # not a complete df -P row
    # to_i stops at the trailing "%", so "96%" parses as 96.
    next unless disktab[4].to_i > 90
    if rtn[:environment] == "prd"
      rtn[:excluded] = excluded.join(",")
      rtn[:big_disks] << "#{disktab[5]} is #{disktab[4]} full and has #{disktab[3]} free." unless excluded.include?(disktab[5])
    else
      rtn[:big_disks] << disk
    end
  end
end

# Fills +pxy+ with HTTP status counts for the last 30 logged requests and the
# time elapsed since the most recent one.
def gather_proxy_stats(ssh, pxy)
  pxy[:cnt] = 30
  vhost_conf = ssh.exec!("sudo grep ServerName /etc/httpd/sites/*gov.conf").to_s
  # Take the token following "ServerName"; this also works when grep prefixes
  # each match with a file name.
  servername = vhost_conf[/ServerName\s+(\S+)/, 1]
  raise "Unable to determine ServerName from /etc/httpd/sites/*gov.conf" if servername.nil?
  resps = ssh.exec!("sudo tail -n #{pxy[:cnt]} /var/log/httpd/#{servername}_access.log | cut -d' ' -f 3 | tr \"\n\" ' '")
  # exec! may return nil or an empty string when the command prints nothing.
  if resps.nil? || resps.strip.empty?
    pxy[:state] = "good"
    pxy[:status] = "access.log file is empty"
    return
  end
  pxy[:status_200] = resps.scan(/200/).count
  pxy[:status_500] = resps.scan(/500/).count
  # Float math: integer division would truncate the ratio to 0 for anything
  # short of 30 errors out of 30.
  error_pct = pxy[:status_500] * 100.0 / pxy[:cnt]
  pxy[:state] = error_pct < 15 ? "good" : "bad"
  pxy[:status] = "Out of last #{pxy[:cnt]} requests: 200: #{pxy[:status_200]}, 500: #{pxy[:status_500]}"
  last_request = ssh.exec!("sudo tail -n 1 /var/log/httpd/#{servername}_access.log | cut -d' ' -f 1,2").to_s.strip
  return if last_request.empty?
  pxy[:lastrequest] = DateTime.strptime(last_request, "[%d/%b/%Y:%H:%M:%S %Z]")
  pxy[:timesince] = elapsed_time_since(pxy[:lastrequest])
  hours, minutes, seconds = pxy[:timesince]
  pxy[:timesince_str] = ""
  pxy[:timesince_str] += "#{hours}h " if hours > 0
  pxy[:timesince_str] += "#{minutes}m " if minutes > 0
  pxy[:timesince_str] += "#{seconds}s " if seconds > 0
end

# Returns [hours, minutes, seconds, leftover_day_fraction] elapsed between
# +moment+ (a DateTime) and now. Stands in for Date.day_fraction_to_time,
# which current Ruby no longer provides. A moment in the future counts as zero.
def elapsed_time_since(moment)
  elapsed_days = DateTime.now - moment
  elapsed_days = Rational(0, 1) if elapsed_days < 0
  total_seconds = (elapsed_days * 86400).to_i
  hours, remainder = total_seconds.divmod(3600)
  minutes, seconds = remainder.divmod(60)
  [hours, minutes, seconds, elapsed_days - Rational(total_seconds, 86400)]
end
# Expands an Ansible host pattern into a list of host names.
#
# string - pattern interpolated into `/usr/bin/ansible <string> --list-hosts`.
#
# Returns an Array with one whitespace-stripped String per non-blank output
# line. The "hosts (N):" header printed by newer Ansible releases is skipped.
def get_hosts(string)
  cmd = "/usr/bin/ansible #{string} --list-hosts"
  hosts = []
  `#{cmd}`.each_line do |line|
    # Non-bang strip: strip!/chomp! return nil when there is nothing to remove.
    host = line.strip
    next if host.empty?
    next if host =~ /\Ahosts \(\d+\):\z/
    hosts << host
  end
  hosts
end
# Runtime settings. Each command-line switch below overrides one default;
# :progress is only set when -p or -d is given.
options = {
  :email   => true,
  :debug   => false,
  :timeout => 10
}

cli = OptionParser.new
cli.banner = "Usage: semon.rb <environment/host>"
cli.on('-p', '--progress', 'Enable the progress bar.') do
  options[:progress] = true
end
cli.on('--timeout TIMEOUT', "Set timeout to TIMEOUT. (Default: #{options[:timeout]})") do |seconds|
  options[:timeout] = seconds.to_i
end
cli.on('-d', '--debug', 'Turn on script debugging output.') do
  # Debug mode implies the progress bar.
  options[:debug] = true
  options[:progress] = true
end
cli.on('-n', '--noemail', 'Disable email all together.') do
  options[:email] = false
end
# parse! removes the recognised switches from ARGV.
cli.parse!
# Resolve the host pattern given on the command line and gather stats from
# every matching host into results (host name => stats Hash).
if ARGV[0].nil? || ARGV[0].strip.empty?
  # Without a pattern there is nothing to expand, and the report subject
  # built later needs ARGV[0] as well.
  abort "Usage: semon.rb <environment/host>"
end
results = {}
hosts = get_hosts(ARGV[0])
puts "hosts = #{hosts.join(', ')}" if options[:debug]
if options[:progress]
  # Loaded lazily so the gem is only needed when the progress bar is requested.
  require 'ruby-progressbar'
  #progressbar = ProgressBar.create(:format => '%a %bᗧ%i %p%% %t', :progress_mark =>' ', :remainder_mark => '·', :total => hosts.length)
  progressbar = ProgressBar.create(:format => '%a 8%bD%i %p%% %t', :progress_mark =>'=', :remainder_mark => ' ', :total => hosts.length)
end
hosts.each do |host|
  progressbar.log "Processing: #{host}" if options[:progress] && options[:debug]
  # Pass the --timeout value through; it was parsed but never used before.
  results[host] = gather_stats(host, options[:timeout])
  progressbar.increment if options[:progress]
end
progressbar.finish if options[:progress]
# Aggregate the per-host results, compose the plain-English summary and render
# the HTML body. If anything raises a StandardError the body becomes the error
# text, and the error is re-raised only in debug mode.
begin
  #Build vars for erb template
  # The template reads these locals through the binding passed to ERB#result,
  # so their names must stay as they are.
  count = {:big_disks => 0, :high_loads => 0, :proxies => 0, :proxies_good => 0, :proxies_bad => 0, :proxies_processing => 0}
  big_disks = {}
  high_loads = {}
  proxy_statuses = {}
  errors = {}
  results.each do |fqdn,result|
    unless result[:big_disks].empty?
      count[:big_disks] += 1
      big_disks[fqdn] = result[:big_disks]
    end
    if result[:highload]
      count[:high_loads] += 1
      high_loads[fqdn] = result[:load]
    end
    if result.has_key?(:pxy)
      count[:proxies] += 1
      proxy_statuses[fqdn] = result[:pxy]
      if result[:pxy][:state] == "good"
        count[:proxies_good] += 1
      else
        count[:proxies_bad] += 1
      end
      if result[:pxy].has_key? :timesince
        # "Processing" means the last request was logged under five minutes ago
        # (timesince is [hours, minutes, seconds, ...]).
        count[:proxies_processing] += 1 if result[:pxy][:timesince][0] == 0 && result[:pxy][:timesince][1] < 5
      end
    end
    errors[fqdn] = result[:error] if result.has_key?(:error)
  end
  #Create a summary
  summary = case
  when count[:big_disks] > 1
    "There are #{count[:big_disks]} servers with high disk utilization. "
  when count[:big_disks] == 1
    "There is 1 server with high disk utilization. "
  when count[:big_disks] == 0
    "There are no servers with high disk utilization. "
  end
  summary += case
  when count[:high_loads] > 1
    "There are #{count[:high_loads]} servers with high load averages. "
  when count[:high_loads] == 1
    "There is 1 server with a high load average. "
  when count[:high_loads] == 0
    "There are no servers with high load averages at this time. "
  end
  if count[:proxies] > 0
    if count[:proxies_good] == count[:proxies] && count[:proxies_processing] == count[:proxies]
      summary += "All proxies look good and are handling requests. "
    else
      # Figure first part of proxies summary (whether the result codes are good).
      if count[:proxies_good] == count[:proxies]
        summary += case
        when count[:proxies] == 1
          "The proxy server is returning good result codes. "
        else
          "The proxy servers are returning good result codes. "
        end
      else # If not all results are good
        summary += case
        when count[:proxies_bad] == 1
          if count[:proxies] == 1
            "The proxy server has a high number of error codes. "
          else
            "One of the proxy servers has a high number of error codes. "
          end #if
        else
          "There are #{count[:proxies_bad]} proxy servers with a high number of error codes. "
        end #case
      end #if
      # Figure second part of proxies summary
      if count[:proxies_processing] == count[:proxies]
        summary += case
        when count[:proxies] == 1
          "The proxy server has processed requests recently. "
        else
          "The proxy servers have processed requests recently. "
        end
      else
        summary += case
        when (count[:proxies] - count[:proxies_processing]) == 1
          if count[:proxies] == 1
            "The proxy server has not processed any requests recently. "
          else
            "One of the proxy servers has not processed any requests recently. "
          end #if
        when count[:proxies_processing] == 0
          "None of the proxy servers have processed any requests recently. "
        else
          "There are #{count[:proxies] - count[:proxies_processing]} proxy servers that have not processed any requests recently. "
        end #case
      end #if
    end #if
  end #if
  #Build html body from erb
  # Prefer a template in the working directory (the previous behaviour) and
  # fall back to the one next to this script, so the report still renders when
  # the script is started from another directory.
  template_path = 'body.html.erb'
  unless File.exist?(template_path)
    template_path = File.join(File.dirname(__FILE__), 'body.html.erb')
  end
  # NOTE(review): the positional safe_level/trim_mode arguments are deprecated
  # on newer Ruby in favour of the trim_mode keyword; confirm the target Ruby
  # version before changing this call.
  renderer = ERB.new(File.read(template_path), 0, '>')
  # Pass the binding explicitly so the template always sees the locals above.
  body = renderer.result(binding)
rescue StandardError => e
  body = "Error: #{e}"
  raise if options[:debug]
end
# In debug mode, echo the rendered report to stdout.
if options[:debug]
  puts body
end

# Mail the report unless -n/--noemail was given. Debug runs are addressed to
# root; all other runs go to the engineering team.
if options[:email]
  recipient = if options[:debug]
    "Root <root@example.com>"
  else
    "Engineering Team <engineering@example.com>"
  end
  report_subject = "SEMon #{ARGV[0].capitalize} Status Report - #{Time.now.strftime("%Y-%m-%d %H:%M:%S")}"
  report_html = body

  Mail.deliver do
    from "Do_Not_Reply@example.com"
    to recipient
    subject report_subject
    html_part do
      content_type 'text/html; charset=UTF-8'
      body report_html
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment