Systems Engineering MONitoring Script (SEMON)
<%# SEMON report body. %>
<%# Reads these variables from the rendering binding: summary, count, %>
<%# big_disks, high_loads, proxy_statuses and errors. %>
<%# Interpolated values are HTML-escaped because host output and exception %>
<%# messages can contain characters such as < and >. %>
<html>
<head>
<style>
ul {
  margin: 0px;
  list-style-type: none;
}
li {
  margin: 0px;
}
</style>
</head>
<body>
<h3>Health status summary:</h3>
<% if summary %>
<p><%= ERB::Util.html_escape(summary) %></p>
<% end %>
<h3>Systems with &gt;90% disk(s):</h3>
<% if count[:big_disks] > 0 %>
<% big_disks.each do |fqdn,disks| %>
<h4><%= ERB::Util.html_escape(fqdn) %></h4>
<ul>
<% disks.each do |disk| %>
<li><%= ERB::Util.html_escape(disk) %></li>
<% end %>
</ul>
<% end %>
<% else %>
<p>There are no systems with high disk utilization.</p>
<% end %>
<h3>Systems with &gt;4.0 load avg for past 1 minute:</h3>
<% if count[:high_loads] > 0 %>
<% high_loads.each do |fqdn,loadavg| %>
<p><%= ERB::Util.html_escape(fqdn) %>: <%= ERB::Util.html_escape(loadavg) %></p>
<% end %>
<% else %>
<p>There are no systems with high load averages.</p>
<% end %>
<%# An empty Hash is truthy in Ruby, so test for content, not just presence. %>
<% if proxy_statuses && !proxy_statuses.empty? %>
<h3>Proxy Statuses:</h3>
<% proxy_statuses.each do |fqdn,result| %>
<p>
<%= ERB::Util.html_escape(fqdn) %>: <%= ERB::Util.html_escape(result[:status]) %>.
<%# Skip the sentence when the elapsed-time text is missing or blank. %>
<% if result[:timesince_str] && !result[:timesince_str].strip.empty? %>
The last request was processed <%= ERB::Util.html_escape(result[:timesince_str].strip) %> ago.
<% end %>
</p>
<% end %>
<% end %>
<% if errors && !errors.empty? %>
<h3>Errors:</h3>
<% errors.each do |fqdn,error| %>
<p><%= ERB::Util.html_escape(fqdn) %>: <%= ERB::Util.html_escape(error) %></p>
<% end %>
<% end %>
</body>
</html>
#!/usr/bin/env ruby | |
#stdlib
require 'date'
require 'erb'
require 'optparse'
require 'pp'
require 'shellwords'
require 'timeout'
#ruby gems
require 'rubygems'
require 'net/ssh'
require 'mail'
# Gathers health metrics from a single host over SSH.
#
# server  - host name to connect to; its second dot-separated label is
#           recorded as the environment (for example "prd").
# timeout - SSH connection timeout in seconds, handed to Net::SSH.start.
#
# Returns a Hash. :disks, :big_disks and :environment are always present.
# :fqdn, :hostname, :load, :highload, :excluded and :pxy are filled in as the
# corresponding checks run. Failures are never raised to the caller; they are
# reported under :error (a message String, or the exception object for
# unexpected errors).
def gather_stats(server, timeout = 10)
  rtn = {}
  rtn[:disks] = []
  rtn[:big_disks] = []
  rtn[:environment] = server.split('.')[1]
  host_key_retries = 0
  begin
    Net::SSH.start(server, 'zloy', {:timeout => timeout}) do |ssh|
      # Non-bang chomp on to_s: chomp!/strip! return nil when they change
      # nothing, and exec! may hand back nil when a command prints nothing.
      rtn[:fqdn] = ssh.exec!("hostname -f").to_s.chomp
      rtn[:hostname] = ssh.exec!("hostname").to_s.chomp
      semon_disk_stats(ssh, rtn)
      semon_load_stats(ssh, rtn)
      semon_proxy_stats(ssh, rtn) if /(ext|svc|int)pxy/ =~ rtn[:hostname]
    end
  rescue Net::SSH::AuthenticationFailed
    rtn[:error] = "SSH authentication failed"
  rescue Net::SSH::HostKeyMismatch => e
    # Accept the new key and reconnect once; a second mismatch is reported
    # instead of retrying forever.
    if host_key_retries < 1
      host_key_retries += 1
      e.remember_host!
      retry
    end
    rtn[:error] = "SSH host key mismatch"
  rescue Timeout::Error
    rtn[:error] = "SSH connection timed out"
  rescue StandardError => e
    rtn[:error] = e
  end
  rtn
end

# Metric #1: records every `df` line in rtn[:disks] and the file systems that
# are more than 90% used in rtn[:big_disks].
def semon_disk_stats(ssh, rtn)
  excluded = ['/data','/data1','/data2','/data3','/data4','/data5','/data6','/data7','/data8']
  ssh.exec!('df -hP | sed "1 d"').to_s.strip.each_line do |line|
    disk = line.chomp
    rtn[:disks] << disk
    disktab = disk.split
    # Expect six columns; column 5 is Use% and column 6 the mount point.
    next if disktab.length < 6
    # String#to_i stops at the trailing "%", so "96%" compares as 96.
    next unless disktab[4].to_i > 90
    if rtn[:environment] == "prd"
      rtn[:excluded] = excluded.join(",")
      next if excluded.include?(disktab[5])
      rtn[:big_disks] << "#{disktab[5]} is #{disktab[4]} full and has #{disktab[3]} free."
    else
      rtn[:big_disks] << disk
    end
  end
end

# Metric #2: stores the 1/5/15 minute load averages in rtn[:load] and sets
# rtn[:highload] when the 1 minute average is above 4.0.
def semon_load_stats(ssh, rtn)
  rtn[:load] = ssh.exec!("cat /proc/loadavg | cut -d' ' -f 1-3").to_s.chomp
  rtn[:highload] = true if rtn[:load].split[0].to_f > 4.0
end

# Metric #3: inspects the tail of the proxy's httpd access log and fills
# rtn[:pxy] with response counts, a "good"/"bad" state, a status line and the
# time elapsed since the last logged request.
def semon_proxy_stats(ssh, rtn)
  pxy = rtn[:pxy] = {}
  pxy[:cnt] = 30
  servername = ssh.exec!("sudo grep ServerName /etc/httpd/sites/*gov.conf").to_s.split[1]
  # Without a server name the log path below would be meaningless.
  raise "Unable to determine ServerName from /etc/httpd/sites/*gov.conf" if servername.nil?

  log = "/var/log/httpd/#{servername}_access.log"
  resps = ssh.exec!("sudo tail -n #{pxy[:cnt]} #{log} | cut -d' ' -f 3 | tr \"\n\" ' '").to_s
  # An empty String is truthy, so test the content rather than the object.
  if resps.strip.empty?
    pxy[:state] = "good"
    pxy[:status] = "access.log file is empty"
    return
  end

  pxy[:status_200] = resps.scan(/200/).count
  pxy[:status_500] = resps.scan(/500/).count
  # Float arithmetic: integer division would truncate the ratio to 0 unless
  # every sampled request had failed.
  error_pct = pxy[:status_500] * 100.0 / pxy[:cnt]
  pxy[:state] = error_pct < 15 ? "good" : "bad"
  pxy[:status] = "Out of last #{pxy[:cnt]} requests: 200: #{pxy[:status_200]}, 500: #{pxy[:status_500]}"

  last_request = ssh.exec!("sudo tail -n 1 #{log} | cut -d' ' -f 1,2").to_s.strip
  return if last_request.empty?

  pxy[:lastrequest] = DateTime.strptime(last_request, "[%d/%b/%Y:%H:%M:%S %Z]")
  pxy[:timesince] = semon_elapsed_parts(DateTime.now - pxy[:lastrequest])
  labelled = pxy[:timesince].first(3).zip(%w[h m s])
  text = labelled.map { |value, unit| "#{value}#{unit} " if value > 0 }.compact.join
  # A request logged within the current second would otherwise give "".
  pxy[:timesince_str] = text.empty? ? "0s " : text
end

# Splits a duration expressed in days into [hours, minutes, seconds,
# leftover day fraction]. This is the layout Date.day_fraction_to_time
# returned; that method is no longer provided by Ruby's date library.
# Negative durations (remote clock ahead of ours) are treated as zero.
def semon_elapsed_parts(days)
  days = 0 if days < 0
  total_seconds, fraction = days.divmod(Rational(1, 86400))
  hours, rest = total_seconds.divmod(3600)
  minutes, seconds = rest.divmod(60)
  [hours, minutes, seconds, fraction]
end
# Expands an Ansible host pattern into a list of host names by running
# `/usr/bin/ansible <pattern> --list-hosts`.
#
# string - host pattern (group, host name or pattern expression).
#
# Returns an Array of host name Strings with surrounding whitespace removed.
def get_hosts(string)
  # Escape the pattern: characters such as & and ! are meaningful in Ansible
  # patterns but would otherwise be interpreted by the shell.
  cmd = "/usr/bin/ansible #{Shellwords.escape(string.to_s)} --list-hosts"
  rtn = []
  `#{cmd}`.each_line do |line|
    # Non-bang strip: strip!/chomp! return nil when there is nothing to
    # remove, which would put nil into the list or raise on the chained call.
    host = line.strip
    next if host.empty?
    # Some Ansible versions print a "hosts (N):" header before the names.
    next if host =~ /\Ahosts \(\d+\):\z/
    rtn << host
  end
  rtn
end
# Command-line defaults; the flags below override them.
options = {:email => true, :debug => false, :timeout => 10}
parser = OptionParser.new do |opts|
  opts.banner = "Usage: semon.rb <environment/host>"
  opts.on('-p', '--progress', 'Enable the progress bar.') { options[:progress] = true }
  opts.on('--timeout TIMEOUT', "Set timeout to TIMEOUT. (Default: #{options[:timeout]})") { |timeout| options[:timeout] = timeout.to_i }
  opts.on('-d', '--debug', 'Turn on script debugging output.') { options[:debug] = true; options[:progress] = true }
  opts.on('-n', '--noemail', 'Disable email all together.') { options[:email] = false }
end
parser.parse!

# The host pattern is required: it is handed to get_hosts and later used in
# the mail subject. Print the usage text and stop when it is missing.
abort parser.to_s if ARGV.empty?

results = {}
hosts = get_hosts(ARGV[0])
puts "hosts = #{hosts.join(', ')}" if options[:debug]

if options[:progress]
  # Loaded only when requested so the gem is optional for plain runs.
  require 'ruby-progressbar'
  #progressbar = ProgressBar.create(:format => '%a %bᗧ%i %p%% %t', :progress_mark =>' ', :remainder_mark => '·', :total => hosts.length)
  progressbar = ProgressBar.create(:format => '%a 8%bD%i %p%% %t', :progress_mark =>'=', :remainder_mark => ' ', :total => hosts.length)
end

hosts.each do |host|
  progressbar.log "Processing: #{host}" if options[:progress] && options[:debug]
  # Pass the configured timeout through; it was parsed but never used before.
  results[host] = gather_stats(host, options[:timeout])
  progressbar.increment if options[:progress]
end
progressbar.finish if options[:progress]
begin
  # Aggregate the per-host results into the variables the ERB template reads:
  # count, big_disks, high_loads, proxy_statuses, errors and summary.
  count = {:big_disks => 0, :high_loads => 0, :proxies => 0, :proxies_good => 0, :proxies_bad => 0, :proxies_processing => 0}
  big_disks = {}
  high_loads = {}
  proxy_statuses = {}
  errors = {}
  results.each do |fqdn, result|
    unless result[:big_disks].empty?
      count[:big_disks] += 1
      big_disks[fqdn] = result[:big_disks]
    end
    if result[:highload]
      count[:high_loads] += 1
      high_loads[fqdn] = result[:load]
    end
    if result.has_key?(:pxy)
      pxy = result[:pxy]
      count[:proxies] += 1
      proxy_statuses[fqdn] = pxy
      if pxy[:state] == "good"
        count[:proxies_good] += 1
      else
        count[:proxies_bad] += 1
      end
      # "Processing" means the last logged request is under five minutes old
      # (timesince is [hours, minutes, seconds, ...]).
      if pxy.has_key?(:timesince) && pxy[:timesince][0] == 0 && pxy[:timesince][1] < 5
        count[:proxies_processing] += 1
      end
    end
    errors[fqdn] = result[:error] if result.has_key?(:error)
  end

  # Create a summary: one sentence per metric.
  summary = case count[:big_disks]
            when 0 then "There are no servers with high disk utilization. "
            when 1 then "There is 1 server with high disk utilization. "
            else "There are #{count[:big_disks]} servers with high disk utilization. "
            end
  summary += case count[:high_loads]
             when 0 then "There are no servers with high load averages at this time. "
             when 1 then "There is 1 server with a high load average. "
             else "There are #{count[:high_loads]} servers with high load averages. "
             end

  if count[:proxies] > 0
    single_proxy = count[:proxies] == 1
    all_good = count[:proxies_good] == count[:proxies]
    all_processing = count[:proxies_processing] == count[:proxies]
    idle_proxies = count[:proxies] - count[:proxies_processing]

    if all_good && all_processing
      summary += "All proxies look good and are handling requests. "
    else
      # First sentence: result codes.
      summary += if all_good
                   if single_proxy
                     "The proxy server is returning good result codes. "
                   else
                     "The proxy servers are returning good result codes. "
                   end
                 elsif count[:proxies_bad] == 1
                   if single_proxy
                     "The proxy server has a high number of error codes. "
                   else
                     "One of the proxy servers has a high number of error codes. "
                   end
                 else
                   "There are #{count[:proxies_bad]} proxy servers with a high number of error codes. "
                 end
      # Second sentence: recent activity.
      summary += if all_processing
                   if single_proxy
                     "The proxy server has processed requests recently. "
                   else
                     "The proxy servers have processed requests recently. "
                   end
                 elsif idle_proxies == 1
                   if single_proxy
                     "The proxy server has not processed any requests recently. "
                   else
                     "One of the proxy servers has not processed any requests recently. "
                   end
                 elsif count[:proxies_processing] == 0
                   "None of the proxy servers have processed any requests recently. "
                 else
                   "There are #{idle_proxies} proxy servers that have not processed any requests recently. "
                 end
    end
  end

  # Build html body from erb. The template is looked up next to this script
  # so the report also renders when the script is started from another
  # working directory (for example from cron).
  template_path = File.join(File.dirname(File.expand_path(__FILE__)), 'body.html.erb')
  renderer = ERB.new(File.read(template_path), 0, '>')
  # Render against this scope explicitly so the template sees the locals above.
  body = renderer.result(binding)
rescue StandardError => e
  # Best effort: the report still goes out, carrying the error text as its body.
  body = "Error: #{e}"
  raise if options[:debug]
end
# In debug mode echo the rendered report to stdout.
puts body if options[:debug]

if options[:email]
  # Debug runs are mailed to root rather than to the engineering list.
  recipient = options[:debug] ? "Root <root@example.com>" : "Engineering Team <engineering@example.com>"
  report_subject = "SEMon #{ARGV[0].capitalize} Status Report - #{Time.now.strftime("%Y-%m-%d %H:%M:%S")}"
  report_html = body

  Mail.deliver do
    from "Do_Not_Reply@example.com"
    to recipient
    subject report_subject
    html_part do
      content_type 'text/html; charset=UTF-8'
      body report_html
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment