Create a gist now

Instantly share code, notes, and snippets.

Monit configs collection in use within WPD SaltStack configs

Monit configuration on WebPlatform infrastructure

The Monit project has two components: Monit and M/Monit. Monit is open source and can be used standalone, with its own built-in web UI on each machine, while M/Monit can manage every machine that runs Monit from one place.

M/Monit can control the Monit configuration of each node and creates a global cluster health graph, all in "one place".

If management doesn't want to pay for an M/Monit licence, it's possible to shim a dashboard with a set of well-thought-out NGINX proxy configurations. While this would allow controlling every node, there would be no overall network graphs.

Screenshots

Monit

Here are a few screenshots of monit client HTTP server reports.

M/Monit

These are reports generated by the M/Monit server, which reads updates from a set of Monit servers. Each Monit client can send updates through its local configuration (a `set mmonit http://foo:bar@mmonit:8080/collector` statement).

# FxA auth server: HTTP probe against localhost:9000, restarting the
# service when the probe fails.
check host fxa-auth-server with address "localhost"
  start program = "/usr/sbin/service fxa-auth-server start"
  stop program = "/usr/sbin/service fxa-auth-server stop"
  # Request / over HTTP and allow 10 seconds for the answer.
  if failed port 9000 protocol HTTP request / with timeout 10 seconds
    then restart
  # Declared dependencies on the nginx and fxa-content-server checks.
  depends on nginx
  depends on fxa-content-server
# FxA content server: HTTP probe against localhost:3030, restarting the
# service when the probe fails.
check host fxa-content-server with address "localhost"
  start program = "/usr/sbin/service fxa-content-server start"
  stop program = "/usr/sbin/service fxa-content-server stop"
  # Request / over HTTP and allow 10 seconds for the answer.
  if failed port 3030 protocol HTTP request / with timeout 10 seconds
    then restart
  # Declared dependency on the nginx check.
  depends on nginx
# FxA OAuth server: HTTP probe against localhost:9010, restarting the
# service when the probe fails.
check host fxa-oauth-server with address "localhost"
  start program = "/usr/sbin/service fxa-oauth-server start"
  stop program = "/usr/sbin/service fxa-oauth-server stop"
  # Request / over HTTP; this probe is given 15 seconds to answer.
  if failed port 9010 protocol HTTP request / with timeout 15 seconds
    then restart
  # Declared dependency on the nginx check.
  depends on nginx
# FxA profile server: HTTP probe against localhost:8081, restarting the
# service when the probe fails.
check host fxa-profile-server with address "localhost"
  start program = "/usr/sbin/service fxa-profile-server start"
  stop program = "/usr/sbin/service fxa-profile-server stop"
  # Request / over HTTP and allow 10 seconds for the answer.
  if failed port 8081 protocol HTTP request / with timeout 10 seconds
    then restart
  # Declared dependency on the nginx check.
  depends on nginx
# Script-based check for the FxA profile server.
#
# The script is managed by Salt; see:
#   - salt/fxa/checks.sls
#   - salt/fxa/files/profile-check.sh
check program fxa-profile-server-check
  with path "/srv/webplatform/auth/profile-check.sh"
  with timeout 20 seconds
  # Uses the same service commands as the fxa-profile-server host check.
  start program = "/usr/sbin/service fxa-profile-server start"
  stop program = "/usr/sbin/service fxa-profile-server stop"
  # Any non-zero exit status from the script triggers a restart.
  if status != 0 then restart
  depends on fxa-profile-server
# ref: http://mmonit.com/wiki/Monit/MonitorApacheStatus
#
# Apache 2: pidfile-based process check with two network probes on port 80,
# one using the apache-status protocol on 127.0.0.1 and one plain HTTP probe
# on the address rendered from ip4_interfaces[0] (127.0.0.1 when unset).
check process apache2 with pidfile "/var/run/apache2/apache2.pid"
  group www
  start = "/usr/sbin/service apache2 start"
  stop = "/usr/sbin/service apache2 stop"
  # Restart when any of the apache-status limits below is crossed.
  if failed host 127.0.0.1 port 80 protocol apache-status
      loglimit > 80% or
      dnslimit > 25% or
      waitlimit < 20%
    then restart
  if failed host {{ ip4_interfaces[0]|default('127.0.0.1') }} port 80 protocol HTTP
    then restart
  if not exist for 3 cycles then restart
  # Restart-rate rules: time out at 5 restarts, alert at 3, both per 5 cycles.
  if 5 restarts within 5 cycles then timeout
  if 3 restarts within 5 cycles then alert
# Ref: https://github.com/elasticsearch/cookbook-elasticsearch/blob/master/templates/default/elasticsearch.monitrc.conf.erb
# TODO: improve the alert e-mail format; the reference above has a good example.
#
# Elasticsearch: pidfile-based process check with resource alerts.
check process elasticsearch with pidfile "/var/run/elasticsearch.pid"
  group elasticsearch
  # The start command is given 60 seconds to complete.
  start = "/usr/sbin/service elasticsearch start" with timeout 60 seconds
  stop = "/usr/sbin/service elasticsearch stop"
  # Resource thresholds only raise alerts; they never restart the process.
  if cpu > 90% for 5 cycles then alert
  if totalmem > 90% for 15 cycles then alert
  if loadavg(15min) greater than 10 for 50 cycles then alert
  if not exist for 3 cycles then restart
  # Restart-rate rules: time out at 5 restarts, alert at 3, both per 5 cycles.
  if 5 restarts within 5 cycles then timeout
  if 3 restarts within 5 cycles then alert
# Elasticsearch reachability: fetch the root URL of the HTTP API and alert
# when the request fails. Host comes from ip4_interfaces[0] (default 0.0.0.0)
# and port from elastic_port (default 9200).
check host elasticsearch_connection
  with address {{ ip4_interfaces[0]|default('0.0.0.0') }}
  group elasticsearch
  if failed url http://{{ ip4_interfaces[0]|default('0.0.0.0') }}:{{ elastic_port|default(9200) }}/
      with timeout 15 seconds
    then alert
# Elasticsearch cluster health: fetch /_cluster/health and test the response
# body against 'green'; alert when the URL test fails. Host comes from
# ip4_interfaces[0] (default 0.0.0.0) and port from elastic_port (default 9200).
check host elasticsearch_cluster_health
  with address {{ ip4_interfaces[0]|default('0.0.0.0') }}
  group elasticsearch
  if failed url http://{{ ip4_interfaces[0]|default('0.0.0.0') }}:{{ elastic_port|default(9200) }}/_cluster/health
      and content == 'green'
      with timeout 60 seconds
    then alert
# Exim 4 mail server: pidfile-based process check with an SMTP probe on port 25.
check process exim4 with pidfile "/var/run/exim4/exim.pid"
  group mail
  start = "/usr/sbin/service exim4 start"
  stop = "/usr/sbin/service exim4 stop"
  if failed port 25 protocol SMTP
    then restart
  if not exist for 3 cycles
    then restart
  # Restart-rate rules: alert at 3 restarts, time out at 5, both per 5 cycles.
  if 3 restarts within 5 cycles then alert
  if 5 restarts within 5 cycles then timeout
# gdnsd DNS server: pidfile-based process check with a DNS probe on port 53.
check process gdnsd with pidfile "/var/run/gdnsd/gdnsd.pid"
  start = "/usr/sbin/service gdnsd start"
  stop = "/usr/sbin/service gdnsd stop"
  if failed port 53 protocol DNS
    then restart
  if not exist for 3 cycles
    then restart
  # Restart-rate rules: time out at 5 restarts, alert at 3, both per 5 cycles.
  if 5 restarts within 5 cycles then timeout
  if 3 restarts within 5 cycles then alert
# Ref: http://www.alphadevx.com/a/392-Monitoring-Memcache-with-Monit
#
# Memcached: pidfile-based process check with a MEMCACHE protocol probe on
# 127.0.0.1:11211.
check process memcached with pidfile "/var/run/memcached.pid"
  group keystore
  start = "/usr/sbin/service memcached start"
  stop = "/usr/sbin/service memcached stop"
  if failed host 127.0.0.1 port 11211 protocol MEMCACHE
    then restart
  if not exist for 3 cycles
    then restart
  # Restart-rate rules: alert at 3 restarts, time out at 5, both per 5 cycles.
  if 3 restarts within 5 cycles then alert
  if 5 restarts within 5 cycles then timeout
# Managed by Salt Stack. Please DO NOT TOUCH, or ALL CHANGES WILL BE LOST!
#
# This file should contain only what is common to EVERY node.
#
# Ref:
#   - http://mmonit.com/monit/documentation/monit.html
#
# System-wide resource thresholds for this node; every rule only alerts.
check system {{ nodename }}
  # Load average
  if loadavg (1min) > 4 then alert
  if loadavg (5min) > 2 then alert
  # Memory and swap
  if memory usage > 75% then alert
  if swap usage > 25% then alert
  # CPU, split by user / system / I/O wait time
  if cpu usage (user) > 70% then alert
  if cpu usage (system) > 30% then alert
  if cpu usage (wait) > 20% then alert
# Salt minion: pidfile-based process check; restarted when missing for
# 3 cycles, with a timeout after 5 restarts within 5 cycles.
check process salt-minion with pidfile "/var/run/salt-minion.pid"
  group salt
  start = "/usr/sbin/service salt-minion start"
  stop = "/usr/sbin/service salt-minion stop"
  if not exist for 3 cycles
    then restart
  if 5 restarts within 5 cycles
    then timeout
# Monit's embedded HTTP interface.
# The port comes from monit_port (default 2812).
#
# NOTE(review): the listener is bound to localhost, yet 10.10.10.0/24 is
# also allowed below. Presumably remote clients can only reach it through a
# local proxy or tunnel; confirm that this is the intent.
set httpd port {{ monit_port|default(2812) }} and
  use address localhost               # listen on the localhost address only
  allow localhost                     # allow localhost to connect to the server
  allow 10.10.10.0/24
  # Require user 'admin' with the password defined in Salt Stack. The
  # password is quoted so that whitespace, '#' or reserved words in it
  # cannot break parsing of this file.
  allow admin:"{{ monit_pw }}"
  allow @monit                        # users of group 'monit' may connect (rw)
  allow @users readonly               # users of group 'users' may connect read-only
# Outgoing mail: relay through the "mail" host of the configured TLD,
# using the sslauto option.
set mailserver mail.{{ tld }} using sslauto

# Sender address used in alert e-mails (monit@ followed by the node name).
set mail-format {
  from: monit@{{ nodename }}
}

# Global alert recipient; pid and ppid events are excluded.
set alert hostmaster@{{ tld }} not on { pid ppid }
# To print the MySQL configuration defaults, run:
#
#   /usr/sbin/mysqld --print-defaults
#
# MySQL: pidfile-based process check with a MYSQL protocol probe on port 3306
# of the address rendered from ip4_interfaces[0] (127.0.0.1 when unset).
check process mysql with pidfile "/var/run/mysqld/mysqld.pid"
  group database
  start = "/usr/sbin/service mysql start"
  stop = "/usr/sbin/service mysql stop"
  if failed host {{ ip4_interfaces[0]|default('127.0.0.1') }} port 3306 protocol MYSQL
    then restart
  if not exist for 3 cycles
    then restart
  # Restart-rate rules: time out at 5 restarts, alert at 3, both per 5 cycles.
  if 5 restarts within 5 cycles then timeout
  if 3 restarts within 5 cycles then alert
# NGINX: pidfile-based process check with an HTTP probe on port 80 of the
# address rendered from ip4_interfaces[0] (127.0.0.1 when unset).
check process nginx with pidfile /var/run/nginx.pid
  # Member of both the generic web group and the nginx-specific group.
  group www
  group nginx
  start program = "/usr/sbin/service nginx start"
  stop program = "/usr/sbin/service nginx stop"
  if failed host {{ ip4_interfaces[0]|default('127.0.0.1') }} port 80 protocol HTTP
    then restart
  # Written as "within", the form used by the other service checks and by
  # the Monit manual (this line previously read "with 5 cycles").
  if 5 restarts within 5 cycles then timeout
  # Depends on the nginx_bin and nginx_rc file checks.
  depend nginx_bin
  depend nginx_rc
# File check on the nginx binary; the test rules are pulled in from the
# shared rootbin template.
check file nginx_bin
  with path /usr/sbin/nginx
  group nginx
  include /etc/monit/templates/rootbin
# File check on the nginx init script; the test rules are pulled in from the
# shared rootbin template.
check file nginx_rc
  with path /etc/init.d/nginx
  group nginx
  include /etc/monit/templates/rootbin
{#
# Expected variables and values:
# - hypothesis_host: '127.0.0.1'
# - hypothesis_port: 8000
# - elastic_host: '10.10.10.2'
# - elastic_port: 9002
#}
# Hypothesis: process check that finds the process by matching the pattern
# "notes-server", with a TCP probe on the configured hypothesis port.
check process hypothesis matching "notes-server"
  start program = "/usr/sbin/service hypothesis start"
  stop program = "/usr/sbin/service hypothesis stop"
  if failed port {{ hypothesis_port }} type TCP with timeout 10 seconds
    then restart
  # Written as "within", the form used by the other service checks and by
  # the Monit manual (this line previously read "with 5 cycles").
  if 5 restarts within 5 cycles then timeout
# Elasticsearch host used by Hypothesis: HTTP request for /_aliases on the
# configured elastic host and port; a failure only raises an alert.
check host elasticsearch-remote
  with address "{{ elastic_host }}"
  if failed port {{ elastic_port }} protocol HTTP request "/_aliases" with timeout 10 seconds
    then alert
# Hypothesis availability: HTTP request for /ruok on the configured
# hypothesis host and port; the hypothesis service is restarted on failure.
check host hypothesis-available
  with address "{{ hypothesis_host }}"
  start program = "/usr/sbin/service hypothesis start"
  stop program = "/usr/sbin/service hypothesis stop"
  if failed port {{ hypothesis_port }} protocol HTTP request "/ruok" with timeout 10 seconds
    then restart
# Nutcracker (twemproxy)
#
# Ref:
#   - https://github.com/twitter/twemproxy
#
# To inspect its stats, query the stats port (22222):
#
#   curl http://localhost:22222 | python -m json.tool
#
# Process check that finds the process by matching the pattern "nutcracker",
# with a TCP probe on the stats port.
check process nutcracker matching "nutcracker"
  group keystore
  start = "/usr/sbin/service nutcracker start"
  stop = "/usr/sbin/service nutcracker stop"
  if failed host 127.0.0.1 port 22222 type TCP with timeout 2 seconds
    then restart
  if not exist for 3 cycles
    then restart
  # Restart-rate rules: alert at 3 restarts, time out at 5, both per 5 cycles.
  if 3 restarts within 5 cycles then alert
  if 5 restarts within 5 cycles then timeout
# Ref: http://tobias.is/blog/to-boldly-monitor-what-no-one-has-monitored-before/
#
# PHP5-FPM: pidfile-based process check with a TCP probe on the FPM port
# (fpm_port, default 9000).
check process php5-fpm with pidfile "/var/run/php5-fpm.pid"
  group php5-fpm
  # The start command is given 60 seconds to complete.
  start = "/usr/sbin/service php5-fpm start" with timeout 60 seconds
  stop = "/usr/sbin/service php5-fpm stop"
  # The host is rendered from ip4_interfaces[0], the variable used by the
  # other service templates; this line previously read the singular
  # "ip4_interface". The 0.0.0.0 fallback is unchanged.
  # NOTE(review): confirm that FPM listens on that address and not only on
  # loopback or a UNIX socket.
  if failed host {{ ip4_interfaces[0]|default('0.0.0.0') }} port {{ fpm_port|default(9000) }} type TCP
    then restart
  if not exist for 3 cycles then restart
  # Restart-rate rules: time out at 5 restarts, alert at 3, both per 5 cycles.
  if 5 restarts within 5 cycles then timeout
  if 3 restarts within 5 cycles then alert
# Salt master: pidfile-based process check; restarted when missing for
# 3 cycles.
check process salt-master with pidfile "/var/run/salt-master.pid"
  group salt
  start = "/usr/sbin/service salt-master start"
  stop = "/usr/sbin/service salt-master stop"
  if not exist for 3 cycles
    then restart
  # Restart-rate rules: alert at 3 restarts, time out at 5, both per 5 cycles.
  if 3 restarts within 5 cycles then alert
  if 5 restarts within 5 cycles then timeout
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment