Created
November 3, 2014 13:15
-
-
Save zanloy/f48f0f7b463cd6f9fd82 to your computer and use it in GitHub Desktop.
Systems Engineering MONitoring Script (SEMON)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<%# HTML body of the SEMON status mail. Reads these locals from the rendering binding: summary, count, big_disks, high_loads, proxy_statuses, errors. %>
<html>
  <head>
    <style>
      ul {
        margin: 0px;
        list-style-type: none;
      }
      li {
        margin: 0px;
      }
    </style>
  </head>
  <body>
    <h3>Health status summary:</h3>
    <% if summary %>
      <p><%= summary %></p>
    <% end %>
    <h3>Systems with &gt;90% disk(s):</h3>
    <% if count[:big_disks] > 0 %>
      <% big_disks.each do |fqdn, disks| %>
        <h4><%= ERB::Util.html_escape(fqdn) %></h4>
        <ul>
          <% disks.each do |disk| %>
            <li><%= ERB::Util.html_escape(disk) %></li>
          <% end %>
        </ul>
      <% end %>
    <% else %>
      <p>There are no systems with high disk utilization.</p>
    <% end %>
    <h3>Systems with &gt;4.0 load avg for past 1 minute:</h3>
    <% if count[:high_loads] > 0 %>
      <% high_loads.each do |fqdn, loadavg| %>
        <p><%= ERB::Util.html_escape(fqdn) %>: <%= ERB::Util.html_escape(loadavg) %></p>
      <% end %>
    <% else %>
      <p>There are no systems with high load averages.</p>
    <% end %>
    <%# An empty Hash is truthy in Ruby, so test for content rather than presence. %>
    <% if proxy_statuses && !proxy_statuses.empty? %>
      <h3>Proxy Statuses:</h3>
      <% proxy_statuses.each do |fqdn, result| %>
        <p>
          <%= ERB::Util.html_escape(fqdn) %>: <%= ERB::Util.html_escape(result[:status]) %>.
          <% if result[:timesince_str] %>
            <%# timesince_str carries its own trailing space; it is empty when the last request is under a second old. %>
            The last request was processed <%= result[:timesince_str].empty? ? 'just now' : "#{result[:timesince_str]}ago" %>.
          <% end %>
        </p>
      <% end %>
    <% end %>
    <% if errors && !errors.empty? %>
      <h3>Errors:</h3>
      <% errors.each do |fqdn, error| %>
        <%# error may be a String or an exception object; html_escape stringifies it. %>
        <p><%= ERB::Util.html_escape(fqdn) %>: <%= ERB::Util.html_escape(error) %></p>
      <% end %>
    <% end %>
  </body>
</html>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
#stdlib | |
require 'erb' | |
require 'optparse' | |
require 'pp' | |
#ruby gems | |
require 'rubygems' | |
require 'net/ssh' | |
require 'mail' | |
# Collects health metrics from +server+ over SSH.
#
# server  - host name to connect to; its second dot-separated label is
#           recorded as the environment (for example "prd").
# timeout - SSH connection timeout in seconds (default 10).
#
# Returns a Hash. :disks, :big_disks and :environment are always present.
# :fqdn, :hostname, :load, :highload, :excluded and :pxy are filled in as the
# corresponding checks run. If anything fails, :error holds a message or the
# exception, and whatever was gathered before the failure is kept.
def gather_stats(server, timeout = 10)
  rtn = {}
  rtn[:disks] = []
  rtn[:big_disks] = []
  rtn[:environment] = server.split('.')[1]
  host_key_retried = false
  begin
    # NOTE(review): the SSH login name is hard-coded.
    Net::SSH.start(server, 'zloy', {:timeout => timeout}) do |ssh|
      # String#strip is used instead of chomp!/strip!: the bang variants
      # return nil when there is nothing to remove.
      rtn[:fqdn] = ssh.exec!("hostname -f").to_s.strip
      rtn[:hostname] = ssh.exec!("hostname").to_s.strip
      #Gather metric #1... Disk Utilization!
      gather_disk_stats(ssh, rtn)
      #Gather metric #2... Load Averages!
      gather_load_stats(ssh, rtn)
      #Gather metric #3... Proxy Responses!
      gather_proxy_stats(ssh, rtn) if /(ext|svc|int)pxy/ =~ rtn[:hostname]
    end #Net::SSH.start
  rescue Net::SSH::AuthenticationFailed
    rtn[:error] = "SSH authentication failed"
  rescue Net::SSH::HostKeyMismatch => e
    # Accept the new host key once; a second mismatch is reported instead of
    # retrying forever.
    if host_key_retried
      rtn[:error] = e
    else
      host_key_retried = true
      e.remember_host!
      retry
    end
  rescue Timeout::Error => e
    rtn[:error] = "SSH connection timed out"
  rescue StandardError => e
    rtn[:error] = e
    #raise
  end
  return rtn
end

# Records every `df -hP` line in rtn[:disks] and every filesystem above 90%
# usage in rtn[:big_disks].
def gather_disk_stats(ssh, rtn)
  excluded = ['/data','/data1','/data2','/data3','/data4','/data5','/data6','/data7','/data8']
  ssh.exec!('df -hP | sed "1 d"').to_s.each_line do |line|
    disk = line.strip
    next if disk.empty?
    rtn[:disks] << disk
    disktab = disk.split
    # df -P columns: filesystem, size, used, avail, capacity, mount point.
    next if disktab.length < 6
    # "95%".to_i is 95; String#to_i stops at the first non-digit.
    next unless disktab[4].to_i > 90
    if rtn[:environment] == "prd"
      # On "prd" hosts the excluded mount points are never reported.
      rtn[:excluded] = excluded.join(",")
      unless excluded.include?(disktab[5])
        rtn[:big_disks] << "#{disktab[5]} is #{disktab[4]} full and has #{disktab[3]} free."
      end
    else
      rtn[:big_disks] << disk
    end
  end
end

# Stores the 1/5/15 minute load averages in rtn[:load] and sets
# rtn[:highload] when the 1 minute average exceeds 4.0.
def gather_load_stats(ssh, rtn)
  rtn[:load] = ssh.exec!("cat /proc/loadavg | cut -d' ' -f 1-3").to_s.strip
  rtn[:highload] = true if rtn[:load].split[0].to_f > 4.0
end

# Fills rtn[:pxy] with response-code counts from the tail of the proxy's
# access log and the time elapsed since its last logged request.
def gather_proxy_stats(ssh, rtn)
  pxy = rtn[:pxy] = {}
  pxy[:cnt] = 30
  # The regexp also copes with grep prefixing each match with the file name,
  # which it does when the glob matches more than one file.
  servername = ssh.exec!("sudo grep ServerName /etc/httpd/sites/*gov.conf").to_s[/ServerName\s+(\S+)/, 1]
  raise "no ServerName found in /etc/httpd/sites/*gov.conf" if servername.nil?
  resps = ssh.exec!("sudo tail -n #{pxy[:cnt]} /var/log/httpd/#{servername}_access.log | cut -d' ' -f 3 | tr \"\n\" ' '").to_s
  # exec! may return nil or an empty string when the command prints nothing.
  if resps.strip.empty?
    pxy[:state] = "good"
    pxy[:status] = "access.log file is empty"
    return
  end
  pxy[:status_200] = resps.scan(/200/).count
  pxy[:status_500] = resps.scan(/500/).count
  # Float arithmetic: integer division would truncate the ratio to 0 for
  # anything short of 100% errors and always report "good".
  error_pct = pxy[:status_500] * 100.0 / pxy[:cnt]
  pxy[:state] = error_pct < 15 ? "good" : "bad"
  pxy[:status] = "Out of last #{pxy[:cnt]} requests: 200: #{pxy[:status_200]}, 500: #{pxy[:status_500]}"
  last_request = ssh.exec!("sudo tail -n 1 /var/log/httpd/#{servername}_access.log | cut -d' ' -f 1,2").to_s.strip
  return if last_request.empty?
  pxy[:lastrequest] = DateTime.strptime(last_request, "[%d/%b/%Y:%H:%M:%S %Z]")
  # DateTime subtraction yields days as a Rational; convert to whole seconds.
  # This replaces Date.day_fraction_to_time, which current Ruby no longer provides.
  elapsed = ((DateTime.now - pxy[:lastrequest]) * 86400).to_i
  elapsed = 0 if elapsed < 0 # clock skew between hosts
  hours, remainder = elapsed.divmod(3600)
  minutes, seconds = remainder.divmod(60)
  # Indexable as [hours, minutes, seconds], like the previous value.
  pxy[:timesince] = [hours, minutes, seconds]
  timesince_str = ""
  timesince_str += "#{hours}h " if hours > 0
  timesince_str += "#{minutes}m " if minutes > 0
  timesince_str += "#{seconds}s " if seconds > 0
  # Never leave the string empty, so the report does not read "processed ago."
  timesince_str = "0s " if timesince_str.empty?
  pxy[:timesince_str] = timesince_str
end
# Lists the hosts an Ansible host pattern expands to.
#
# string - Ansible host pattern (a group, a host name, or an expression
#          such as "web:&prd").
#
# Runs `/usr/bin/ansible <pattern> --list-hosts` and returns an Array with
# one whitespace-stripped host name per output line.
def get_hosts(string)
  # Argument-vector form: no shell is involved, so pattern characters such
  # as "&", "!" and "*" reach ansible untouched.
  output = IO.popen(['/usr/bin/ansible', string.to_s, '--list-hosts']) { |io| io.read }
  rtn = []
  output.to_s.each_line do |line|
    # String#strip always returns a String; chomp!/strip! return nil when
    # they have nothing to remove.
    host = line.strip
    next if host.empty?
    # Newer Ansible releases print a "hosts (N):" header before the list.
    next if host =~ /\Ahosts \(\d+\):\z/
    rtn << host
  end
  return rtn
end
# Runtime settings and their defaults: mail the report, no debug output,
# 10 second SSH timeout. :progress is only set when requested.
options = {:email => true, :debug => false, :timeout => 10}
option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: semon.rb <environment/host>"
  opts.on('-p', '--progress', 'Enable the progress bar.') { options[:progress] = true }
  # The Integer acceptor makes optparse reject non-numeric values with
  # OptionParser::InvalidArgument instead of silently turning them into 0.
  opts.on('--timeout TIMEOUT', Integer, "Set timeout to TIMEOUT. (Default: #{options[:timeout]})") { |timeout| options[:timeout] = timeout }
  # Debugging implies the progress bar.
  opts.on('-d', '--debug', 'Turn on script debugging output.') { options[:debug] = true; options[:progress] = true }
  opts.on('-n', '--noemail', 'Disable email all together.') { options[:email] = false }
end
# parse! removes the recognised switches from ARGV, leaving positional arguments.
option_parser.parse!
# The first positional argument is the Ansible host pattern to report on.
# Stop early when it is missing instead of running ansible without a pattern
# and failing later while building the mail subject.
abort "Usage: semon.rb <environment/host>" if ARGV[0].nil? || ARGV[0].strip.empty?
results = {}
hosts = get_hosts(ARGV[0])
puts "hosts = #{hosts.join(', ')}" if options[:debug]
progressbar = nil
if options[:progress]
  # Loaded lazily so the gem is only needed when the progress bar is requested.
  require 'ruby-progressbar'
  #progressbar = ProgressBar.create(:format => '%a %bᗧ%i %p%% %t', :progress_mark =>' ', :remainder_mark => '·', :total => hosts.length)
  progressbar = ProgressBar.create(:format => '%a 8%bD%i %p%% %t', :progress_mark =>'=', :remainder_mark => ' ', :total => hosts.length)
end
# Gather stats host by host, keyed by the name ansible reported.
hosts.each do |host|
  progressbar.log "Processing: #{host}" if progressbar && options[:debug]
  # Forward --timeout; without this the option was parsed but never used.
  results[host] = gather_stats(host, options[:timeout])
  progressbar.increment if progressbar
end
progressbar.finish if progressbar
# Aggregates the per-host results, writes a plain-English summary and renders
# the HTML mail body. Any StandardError is turned into an "Error: ..." body
# (and re-raised in debug mode).
begin
  #Build vars for erb template
  # The template reads count, big_disks, high_loads, proxy_statuses, errors
  # and summary by name through the binding passed to ERB#result below.
  count = {:big_disks => 0, :high_loads => 0, :proxies => 0, :proxies_good => 0, :proxies_bad => 0, :proxies_processing => 0}
  big_disks = {}
  high_loads = {}
  proxy_statuses = {}
  errors = {}
  results.each do |fqdn,result|
    unless result[:big_disks].empty?
      count[:big_disks] += 1
      big_disks[fqdn] = result[:big_disks]
    end
    if result[:highload]
      count[:high_loads] += 1
      high_loads[fqdn] = result[:load]
    end
    if result.has_key?(:pxy)
      pxy = result[:pxy]
      count[:proxies] += 1
      proxy_statuses[fqdn] = pxy
      # Anything that is not explicitly "good" (including a missing state) counts as bad.
      if pxy[:state] == "good"
        count[:proxies_good] += 1
      else
        count[:proxies_bad] += 1
      end
      # "Processing" means the last logged request is less than 5 minutes old;
      # :timesince is indexed as [hours, minutes, ...].
      if pxy.has_key?(:timesince) && pxy[:timesince][0] == 0 && pxy[:timesince][1] < 5
        count[:proxies_processing] += 1
      end
    end
    errors[fqdn] = result[:error] if result.has_key?(:error)
  end

  #Create a summary
  summary = case
    when count[:big_disks] > 1
      "There are #{count[:big_disks]} servers with high disk utilization. "
    when count[:big_disks] == 1
      "There is 1 server with high disk utilization. "
    when count[:big_disks] == 0
      "There are no servers with high disk utilization. "
  end
  summary += case
    when count[:high_loads] > 1
      "There are #{count[:high_loads]} servers with high load averages. "
    when count[:high_loads] == 1
      "There is 1 server with a high load average. "
    when count[:high_loads] == 0
      "There are no servers with high load averages at this time. "
  end
  if count[:proxies] > 0
    proxies_idle = count[:proxies] - count[:proxies_processing]
    if count[:proxies_good] == count[:proxies] && proxies_idle == 0
      summary += "All proxies look good and are handling requests. "
    else
      # First part of the proxies summary: are the result codes good?
      if count[:proxies_good] == count[:proxies]
        summary += if count[:proxies] == 1
          "The proxy server is returning good result codes. "
        else
          "The proxy servers are returning good result codes. "
        end
      elsif count[:proxies_bad] == 1
        summary += if count[:proxies] == 1
          "The proxy server has a high number of error codes. "
        else
          "One of the proxy servers has a high number of error codes. "
        end
      else
        summary += "There are #{count[:proxies_bad]} proxy servers with a high number of error codes. "
      end
      # Second part of the proxies summary: have requests been processed recently?
      if proxies_idle == 0
        summary += if count[:proxies] == 1
          "The proxy server has processed requests recently. "
        else
          "The proxy servers have processed requests recently. "
        end
      elsif proxies_idle == 1
        summary += if count[:proxies] == 1
          "The proxy server has not processed any requests recently. "
        else
          "One of the proxy servers has not processed any requests recently. "
        end
      elsif count[:proxies_processing] == 0
        summary += "None of the proxy servers have processed any requests recently. "
      else
        summary += "There are #{proxies_idle} proxy servers that have not processed any requests recently. "
      end
    end
  end

  #Build html body from erb
  template = File.read('body.html.erb')
  # Positional safe_level/trim_mode arguments to ERB.new are deprecated since
  # Ruby 2.6 in favour of keywords; use the keyword form whenever the
  # installed ERB accepts it and fall back to the legacy call otherwise.
  renderer = if ERB.instance_method(:initialize).parameters.include?([:key, :trim_mode])
    ERB.new(template, :trim_mode => '>')
  else
    ERB.new(template, 0, '>')
  end
  # Pass the binding explicitly so the template sees the locals built above.
  body = renderer.result(binding)
rescue StandardError => e
  body = "Error: #{e}"
  raise if options[:debug]
end
# In debug mode, echo the rendered report to stdout.
puts body if options[:debug]

if options[:email]
  # Debug runs are mailed to root instead of the engineering list.
  recipient = if options[:debug]
    "Root <root@example.com>"
  else
    "Engineering Team <engineering@example.com>"
  end
  # Separate name so the call to the Mail DSL's body method below is unambiguous.
  report_html = body
  Mail.deliver do
    from "Do_Not_Reply@example.com"
    to recipient
    subject "SEMon #{ARGV[0].capitalize} Status Report - #{Time.now.strftime("%Y-%m-%d %H:%M:%S")}"
    html_part do
      content_type 'text/html; charset=UTF-8'
      body report_html
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment