Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Monit configs collection in use within WPD SaltStack configs

Monit configuration on WebPlatform infrastructure

The monit project has two components; Monit, and M/Monit. While monit is open-source and can be used stand alone along with an independent web ui, M/Monit can manage from one place every machine that runs monit.

M/Monit can control monit configuration and creates a global cluster health graph, all in "one place".

If management does not want to pay for an M/Monit licence, it is possible to shim a dashboard with a set of well-thought-out NGINX proxy configurations. While that would allow controlling every node, there would be no overall network graphs.

Screenshots

Monit

Here are a few screenshots of monit client HTTP server reports.

M/Monit

Those are reports generated from an M/Monit server that reads updates from a set of monit servers. Each monit client can send updates through its local configuration (set mmonit http://foo:bar@mmonit:8080/collector).

# fxa-auth-server: HTTP probe against localhost port 9000.
check host fxa-auth-server with address "localhost"
  start program = "/usr/sbin/service fxa-auth-server start"
  stop program = "/usr/sbin/service fxa-auth-server stop"
  # Restart when the HTTP request for / fails (10 second timeout).
  if failed port 9000 protocol HTTP request / with timeout 10 seconds then restart
  # This check depends on the nginx and fxa-content-server checks.
  depends on nginx
  depends on fxa-content-server
# fxa-content-server: HTTP probe against localhost port 3030.
check host fxa-content-server with address "localhost"
  start program = "/usr/sbin/service fxa-content-server start"
  stop program = "/usr/sbin/service fxa-content-server stop"
  # Restart when the HTTP request for / fails (10 second timeout).
  if failed port 3030 protocol HTTP request / with timeout 10 seconds then restart
  depends on nginx
# fxa-oauth-server: HTTP probe against localhost port 9010.
check host fxa-oauth-server with address "localhost"
  start program = "/usr/sbin/service fxa-oauth-server start"
  stop program = "/usr/sbin/service fxa-oauth-server stop"
  # Restart when the HTTP request for / fails (15 second timeout).
  if failed port 9010 protocol HTTP request / with timeout 15 seconds then restart
  depends on nginx
# fxa-profile-server: HTTP probe against localhost port 8081.
check host fxa-profile-server with address "localhost"
  start program = "/usr/sbin/service fxa-profile-server start"
  stop program = "/usr/sbin/service fxa-profile-server stop"
  # Restart when the HTTP request for / fails (10 second timeout).
  if failed port 8081 protocol HTTP request / with timeout 10 seconds then restart
  depends on nginx
# Script-based check for fxa-profile-server.
# See salt/fxa/checks.sls and salt/fxa/files/profile-check.sh.
check program fxa-profile-server-check
  with path "/srv/webplatform/auth/profile-check.sh"
  with timeout 20 seconds
  start program = "/usr/sbin/service fxa-profile-server start"
  stop program = "/usr/sbin/service fxa-profile-server stop"
  # Any non-zero exit status from the script triggers a restart.
  if status != 0 then restart
  depends on fxa-profile-server
# Apache HTTP server.
# Reference: http://mmonit.com/wiki/Monit/MonitorApacheStatus
check process apache2 with pidfile "/var/run/apache2/apache2.pid"
  group www
  start program = "/usr/sbin/service apache2 start"
  stop program = "/usr/sbin/service apache2 stop"
  # apache-status probe on loopback: restart when any limit below trips.
  if failed host 127.0.0.1 port 80 protocol apache-status
       loglimit > 80% or
       dnslimit > 25% or
       waitlimit < 20%
    then restart
  # Plain HTTP probe on the first IPv4 address (loopback when undefined).
  if failed host {{ ip4_interfaces[0]|default('127.0.0.1') }} port 80 protocol HTTP
    then restart
  # Restart once the process has been missing for 3 cycles.
  if not exist for 3 cycles then restart
  # Restart-count rules: timeout at 5 restarts, alert at 3, both within 5 cycles.
  if 5 restarts within 5 cycles then timeout
  if 3 restarts within 5 cycles then alert
# Elasticsearch process.
# Reference: https://github.com/elasticsearch/cookbook-elasticsearch/blob/master/templates/default/elasticsearch.monitrc.conf.erb
# TODO: improve the alert email; the template referenced above has a good format example.
check process elasticsearch with pidfile "/var/run/elasticsearch.pid"
  group elasticsearch
  # Starting is given up to 60 seconds.
  start program = "/usr/sbin/service elasticsearch start" with timeout 60 seconds
  stop program = "/usr/sbin/service elasticsearch stop"
  # Resource thresholds only alert; they never restart the service.
  if cpu > 90% for 5 cycles then alert
  if totalmem > 90% for 15 cycles then alert
  if loadavg(15min) greater than 10 for 50 cycles then alert
  # Restart once the process has been missing for 3 cycles.
  if not exist for 3 cycles then restart
  # Restart-count rules: timeout at 5 restarts, alert at 3, both within 5 cycles.
  if 5 restarts within 5 cycles then timeout
  if 3 restarts within 5 cycles then alert
# Alert when the Elasticsearch root URL cannot be fetched (15 second timeout).
# NOTE(review): the address falls back to 0.0.0.0 when ip4_interfaces[0] is
# undefined - confirm that is a usable connect target on the deployed hosts.
check host elasticsearch_connection with address {{ ip4_interfaces[0]|default('0.0.0.0') }}
  if failed url http://{{ ip4_interfaces[0]|default('0.0.0.0') }}:{{ elastic_port|default(9200) }}/
     with timeout 15 seconds
    then alert
  group elasticsearch
# Alert unless /_cluster/health answers (60 second timeout) with a body
# matching 'green'.
# NOTE(review): the address falls back to 0.0.0.0 when ip4_interfaces[0] is
# undefined - confirm that is a usable connect target on the deployed hosts.
check host elasticsearch_cluster_health with address {{ ip4_interfaces[0]|default('0.0.0.0') }}
  if failed url http://{{ ip4_interfaces[0]|default('0.0.0.0') }}:{{ elastic_port|default(9200) }}/_cluster/health
     and content == 'green'
     with timeout 60 seconds
    then alert
  group elasticsearch
# Exim4 mail server.
check process exim4 with pidfile "/var/run/exim4/exim.pid"
  group mail
  start program = "/usr/sbin/service exim4 start"
  stop program = "/usr/sbin/service exim4 stop"
  # Restart when the SMTP probe on port 25 fails.
  if failed port 25 protocol SMTP then restart
  # Restart once the process has been missing for 3 cycles.
  if not exist for 3 cycles then restart
  # Restart-count rules: alert at 3 restarts, timeout at 5, both within 5 cycles.
  if 3 restarts within 5 cycles then alert
  if 5 restarts within 5 cycles then timeout
# gdnsd DNS server.
check process gdnsd with pidfile "/var/run/gdnsd/gdnsd.pid"
  start program = "/usr/sbin/service gdnsd start"
  stop program = "/usr/sbin/service gdnsd stop"
  # Restart when the DNS probe on port 53 fails.
  if failed port 53 protocol DNS then restart
  # Restart once the process has been missing for 3 cycles.
  if not exist for 3 cycles then restart
  # Restart-count rules: timeout at 5 restarts, alert at 3, both within 5 cycles.
  if 5 restarts within 5 cycles then timeout
  if 3 restarts within 5 cycles then alert
# Memcached.
# Reference: http://www.alphadevx.com/a/392-Monitoring-Memcache-with-Monit
check process memcached with pidfile "/var/run/memcached.pid"
  group keystore
  start program = "/usr/sbin/service memcached start"
  stop program = "/usr/sbin/service memcached stop"
  # Restart when the MEMCACHE protocol probe on loopback port 11211 fails.
  if failed host 127.0.0.1 port 11211 protocol MEMCACHE then restart
  # Restart once the process has been missing for 3 cycles.
  if not exist for 3 cycles then restart
  # Restart-count rules: alert at 3 restarts, timeout at 5, both within 5 cycles.
  if 3 restarts within 5 cycles then alert
  if 5 restarts within 5 cycles then timeout
# Managed by Salt Stack. DO NOT edit by hand: ALL CHANGES WILL BE LOST!
#
# This file should contain only what is common to EVERY node.
#
# Reference:
#   - http://mmonit.com/monit/documentation/monit.html
#
# System-wide resource thresholds; every rule below only raises an alert.
check system {{ nodename }}
  # Load average
  if loadavg (1min) > 4 then alert
  if loadavg (5min) > 2 then alert
  # Memory and swap
  if memory usage > 75% then alert
  if swap usage > 25% then alert
  # CPU, split by user / system / wait time
  if cpu usage (user) > 70% then alert
  if cpu usage (system) > 30% then alert
  if cpu usage (wait) > 20% then alert
# Salt minion daemon.
check process salt-minion with pidfile "/var/run/salt-minion.pid"
  group salt
  start program = "/usr/sbin/service salt-minion start"
  stop program = "/usr/sbin/service salt-minion stop"
  # Restart once the process has been missing for 3 cycles.
  if not exist for 3 cycles then restart
  # Timeout after 5 restarts within 5 cycles.
  if 5 restarts within 5 cycles then timeout
# Embedded monit HTTP server; the port comes from monit_port (2812 when unset).
set httpd port {{ monit_port|default(2812) }} and
  # Listen on localhost only.
  use address localhost
  # Clients on localhost may connect.
  allow localhost
  # NOTE(review): the server is bound to localhost above, so it is unclear
  # when a client from this subnet could reach it - confirm this is needed.
  allow 10.10.10.0/24
  # User 'admin', with the password defined in Salt Stack.
  allow admin:{{ monit_pw }}
  # Members of group 'monit' may connect (read-write).
  allow @monit
  # Members of group 'users' may connect read-only.
  allow @users readonly
# Outgoing mail for alerts goes through mail.<tld> with the sslauto option.
set mailserver mail.{{ tld }} using sslauto

# Alert mails are sent from monit@<nodename>.
set mail-format { from: monit@{{ nodename }} }

# Alerts go to hostmaster@<tld>, except for pid and ppid events.
set alert hostmaster@{{ tld }} not on { pid ppid }
# MySQL server.
#
# To print the server's configuration defaults, run:
#
#   /usr/sbin/mysqld --print-defaults
#
check process mysql with pidfile "/var/run/mysqld/mysqld.pid"
  group database
  start program = "/usr/sbin/service mysql start"
  stop program = "/usr/sbin/service mysql stop"
  # MYSQL protocol probe on the first IPv4 address (loopback when undefined).
  if failed host {{ ip4_interfaces[0]|default('127.0.0.1') }} port 3306 protocol MYSQL
    then restart
  # Restart once the process has been missing for 3 cycles.
  if not exist for 3 cycles then restart
  # Restart-count rules: timeout at 5 restarts, alert at 3, both within 5 cycles.
  if 5 restarts within 5 cycles then timeout
  if 3 restarts within 5 cycles then alert
# NGINX web server; member of both the www and nginx groups.
check process nginx with pidfile /var/run/nginx.pid
  group www
  group nginx
  start program = "/usr/sbin/service nginx start"
  stop program = "/usr/sbin/service nginx stop"
  # HTTP probe on the first IPv4 address (loopback when undefined).
  if failed host {{ ip4_interfaces[0]|default('127.0.0.1') }} port 80 protocol HTTP
    then restart
  # Timeout after 5 restarts within 5 cycles.
  if 5 restarts within 5 cycles then timeout
  # This check depends on the nginx_bin and nginx_rc file checks.
  depends on nginx_bin
  depends on nginx_rc
# File check on /usr/sbin/nginx; the tests themselves come from the
# included rootbin template.
check file nginx_bin with path /usr/sbin/nginx
  group nginx
  include /etc/monit/templates/rootbin
# File check on /etc/init.d/nginx; the tests themselves come from the
# included rootbin template.
check file nginx_rc with path /etc/init.d/nginx
  group nginx
  include /etc/monit/templates/rootbin
{#
 # Template variables this file expects, with example values:
 #   - hypothesis_host: '127.0.0.1'
 #   - hypothesis_port: 8000
 #   - elastic_host: '10.10.10.2'
 #   - elastic_port: 9002
 #
 # NOTE(review): 9002 may be a typo for 9200, the default used for
 # elastic_port by the Elasticsearch checks - confirm.
 #}
# The hypothesis process is found by matching "notes-server", not by pidfile.
check process hypothesis matching "notes-server"
  start program = "/usr/sbin/service hypothesis start"
  stop program = "/usr/sbin/service hypothesis stop"
  # Restart when a plain TCP connection to the service port fails (10 second timeout).
  if failed port {{ hypothesis_port }} type TCP with timeout 10 seconds then restart
  # Timeout after 5 restarts within 5 cycles.
  if 5 restarts within 5 cycles then timeout
# Elasticsearch on elastic_host: alert (no restart) when the HTTP request
# for "/_aliases" fails (10 second timeout).
check host elasticsearch-remote with address "{{ elastic_host }}"
  if failed port {{ elastic_port }} protocol HTTP request "/_aliases" with timeout 10 seconds
    then alert
# HTTP availability probe for hypothesis: restart the service when the
# request for "/ruok" fails (10 second timeout).
check host hypothesis-available with address "{{ hypothesis_host }}"
  start program = "/usr/sbin/service hypothesis start"
  stop program = "/usr/sbin/service hypothesis stop"
  if failed port {{ hypothesis_port }} protocol HTTP request "/ruok" with timeout 10 seconds
    then restart
# nutcracker (twemproxy).
#
# Reference:
#   - https://github.com/twitter/twemproxy
#
# Statistics are served on port 22222; to inspect them:
#
#   curl http://localhost:22222 | python -m json.tool
#
check process nutcracker matching "nutcracker"
  group keystore
  start program = "/usr/sbin/service nutcracker start"
  stop program = "/usr/sbin/service nutcracker stop"
  # Restart when a TCP connection to the stats port fails (2 second timeout).
  if failed host 127.0.0.1 port 22222 type TCP with timeout 2 seconds then restart
  # Restart once the process has been missing for 3 cycles.
  if not exist for 3 cycles then restart
  # Restart-count rules: alert at 3 restarts, timeout at 5, both within 5 cycles.
  if 3 restarts within 5 cycles then alert
  if 5 restarts within 5 cycles then timeout
# PHP5 FPM.
# Reference: http://tobias.is/blog/to-boldly-monitor-what-no-one-has-monitored-before/
check process php5-fpm with pidfile "/var/run/php5-fpm.pid"
  group php5-fpm
  # Starting is given up to 60 seconds.
  start program = "/usr/sbin/service php5-fpm start" with timeout 60 seconds
  stop program = "/usr/sbin/service php5-fpm stop"
  # Plain TCP probe on the FPM port (9000 when fpm_port is undefined).
  # NOTE(review): the host variable is the singular ip4_interface, unlike the
  # ip4_interfaces[0] used by the other checks - confirm this is intended.
  if failed host {{ ip4_interface|default('0.0.0.0') }} port {{ fpm_port|default(9000) }} type TCP
    then restart
  # Restart once the process has been missing for 3 cycles.
  if not exist for 3 cycles then restart
  # Restart-count rules: timeout at 5 restarts, alert at 3, both within 5 cycles.
  if 5 restarts within 5 cycles then timeout
  if 3 restarts within 5 cycles then alert
# Salt master daemon.
check process salt-master with pidfile "/var/run/salt-master.pid"
  group salt
  start program = "/usr/sbin/service salt-master start"
  stop program = "/usr/sbin/service salt-master stop"
  # Restart once the process has been missing for 3 cycles.
  if not exist for 3 cycles then restart
  # Restart-count rules: alert at 3 restarts, timeout at 5, both within 5 cycles.
  if 3 restarts within 5 cycles then alert
  if 5 restarts within 5 cycles then timeout
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment