WebPlatformDocs/00-monit-configuration.md

## 00-monit-configuration.md

      
    Raw
  

              00-monit-configuration.md
            
          
    Monit configuration on WebPlatform infrastructure

The Monit project has two components: Monit and M/Monit. Monit is open source and can be used standalone, along with its own independent web UI, whereas M/Monit can manage every machine that runs Monit from one place.
M/Monit can control the Monit configuration and create a global cluster health graph, all in "one place".
If management does not want to pay for an M/Monit licence, it is possible to shim a dashboard with a set of well-thought-out NGINX proxy configurations. While that would allow controlling every node, there would be no overall network graphs.
Screenshots

Monit

Here are a few screenshots of monit client HTTP server reports.

https://docs.webplatform.org/wiki/File:monit-server-status-screenshot-service-detail-view.png
https://docs.webplatform.org/wiki/File:monit-server-status-screenshot-app-VM-type.png

M/Monit

These are reports generated by the M/Monit server, which reads updates from a set of Monit servers. Each Monit client can send updates through its local configuration (`set mmonit http://foo:bar@mmonit:8080/collector`).

https://docs.webplatform.org/wiki/File:monit_dashboard_201502_home.png
https://docs.webplatform.org/wiki/File:monit_dashboard_201502_stats.png
https://docs.webplatform.org/wiki/File:monit_dashboard_201502_vm_detail.png


## accounts.conf
# fxa-auth-server: HTTP health probe on localhost, port 9000.
check host fxa-auth-server with address "localhost"
  start program = "/usr/sbin/service fxa-auth-server start"
  stop program = "/usr/sbin/service fxa-auth-server stop"
  # Restart when GET / on port 9000 fails or does not answer within 10 seconds.
  if failed port 9000 protocol HTTP request / with timeout 10 seconds then restart
  # Dependencies belong to the whole check, not to the test above.
  depends on nginx
  depends on fxa-content-server

# fxa-content-server: HTTP health probe on localhost, port 3030.
check host fxa-content-server with address "localhost"
  start program = "/usr/sbin/service fxa-content-server start"
  stop program = "/usr/sbin/service fxa-content-server stop"
  # Restart when GET / on port 3030 fails or does not answer within 10 seconds.
  if failed port 3030 protocol HTTP request / with timeout 10 seconds then restart
  # Dependency of the whole check, not of the test above.
  depends on nginx

# fxa-oauth-server: HTTP health probe on localhost, port 9010.
check host fxa-oauth-server with address "localhost"
  start program = "/usr/sbin/service fxa-oauth-server start"
  stop program = "/usr/sbin/service fxa-oauth-server stop"
  # Restart when GET / on port 9010 fails or does not answer within 15 seconds.
  if failed port 9010 protocol HTTP request / with timeout 15 seconds then restart
  # Dependency of the whole check, not of the test above.
  depends on nginx

# fxa-profile-server: HTTP health probe on localhost, port 8081.
check host fxa-profile-server with address "localhost"
  start program = "/usr/sbin/service fxa-profile-server start"
  stop program = "/usr/sbin/service fxa-profile-server stop"
  # Restart when GET / on port 8081 fails or does not answer within 10 seconds.
  if failed port 8081 protocol HTTP request / with timeout 10 seconds then restart
  # Dependency of the whole check, not of the test above.
  depends on nginx

# Script-based check of the profile server; the script may run for up to 20 seconds.
# See in salt/fxa/checks.sls
#        salt/fxa/files/profile-check.sh
check program fxa-profile-server-check with path "/srv/webplatform/auth/profile-check.sh"
  with timeout 20 seconds
  start program = "/usr/sbin/service fxa-profile-server start"
  stop program = "/usr/sbin/service fxa-profile-server stop"
  # Any non-zero exit status from the script restarts fxa-profile-server.
  if status != 0 then restart
  # Dependency of the whole check, not of the test above.
  depends on fxa-profile-server

## apache.conf
# Apache HTTP server, supervised through its pidfile.
# ref: http://mmonit.com/wiki/Monit/MonitorApacheStatus
check process apache2 with pidfile "/var/run/apache2/apache2.pid"
  group www
  start program = "/usr/sbin/service apache2 start"
  stop program = "/usr/sbin/service apache2 stop"
  # apache-status test against the loopback address; the thresholds are explained at the ref above.
  if failed host 127.0.0.1 port 80 protocol apache-status
    loglimit > 80% or
    dnslimit > 25% or
    waitlimit < 20%
    then restart
  # Plain HTTP probe on the node's first IPv4 interface (templated; falls back to 127.0.0.1).
  if failed host {{ ip4_interfaces[0]|default('127.0.0.1') }} port 80 protocol HTTP then restart
  # Restart once the process has been missing for 3 consecutive cycles.
  if not exist for 3 cycles then restart
  # Restart limits: alert at 3 restarts in 5 cycles, stop monitoring (timeout) at 5.
  if 5 restarts within 5 cycles then timeout
  if 3 restarts within 5 cycles then alert

## elasticsearch.conf
# Elasticsearch daemon, supervised through its pidfile.
# Ref: https://github.com/elasticsearch/cookbook-elasticsearch/blob/master/templates/default/elasticsearch.monitrc.conf.erb
# TODO: improve the error email; the reference above has a good format example.
check process elasticsearch with pidfile "/var/run/elasticsearch.pid"
  group elasticsearch
  # The start command is given up to 60 seconds to complete.
  start program = "/usr/sbin/service elasticsearch start" with timeout 60 seconds
  stop program = "/usr/sbin/service elasticsearch stop"
  # Resource thresholds only raise alerts; none of them restarts the service.
  if cpu > 90% for 5 cycles then alert
  if totalmem > 90% for 15 cycles then alert
  if loadavg(15min) greater than 10 for 50 cycles then alert
  # Restart once the process has been missing for 3 consecutive cycles.
  if not exist for 3 cycles then restart
  # Restart limits: alert at 3 restarts in 5 cycles, stop monitoring (timeout) at 5.
  if 5 restarts within 5 cycles then timeout
  if 3 restarts within 5 cycles then alert

# Reachability probe: alert when the Elasticsearch HTTP root URL fails or takes over 15 seconds.
# Host and port are templated (first IPv4 interface, else 0.0.0.0; elastic_port, else 9200).
check host elasticsearch_connection with address {{ ip4_interfaces[0]|default('0.0.0.0') }}
  group elasticsearch
  if failed url http://{{ ip4_interfaces[0]|default('0.0.0.0') }}:{{ elastic_port|default(9200) }}/
    with timeout 15 seconds
    then alert

# Cluster health probe: alert when /_cluster/health cannot be fetched within 60 seconds
# or when the response content does not match 'green'.
# Host and port are templated (first IPv4 interface, else 0.0.0.0; elastic_port, else 9200).
check host elasticsearch_cluster_health with address {{ ip4_interfaces[0]|default('0.0.0.0') }}
  group elasticsearch
  if failed url http://{{ ip4_interfaces[0]|default('0.0.0.0') }}:{{ elastic_port|default(9200) }}/_cluster/health
    and content == 'green'
    with timeout 60 seconds
    then alert

## exim4.conf
# Exim4 mail transfer agent, supervised through its pidfile.
check process exim4 with pidfile "/var/run/exim4/exim.pid"
  group mail
  start program = "/usr/sbin/service exim4 start"
  stop program = "/usr/sbin/service exim4 stop"
  # Restart when the SMTP protocol test on port 25 fails.
  if failed port 25 protocol SMTP then restart
  # Restart once the process has been missing for 3 consecutive cycles.
  if not exist for 3 cycles then restart
  # Restart limits: alert at 3 restarts in 5 cycles, stop monitoring (timeout) at 5.
  if 3 restarts within 5 cycles then alert
  if 5 restarts within 5 cycles then timeout

## gdnsd.conf
# gdnsd DNS server, supervised through its pidfile.
check process gdnsd with pidfile "/var/run/gdnsd/gdnsd.pid"
  start program = "/usr/sbin/service gdnsd start"
  stop program = "/usr/sbin/service gdnsd stop"
  # Restart when the DNS protocol test on port 53 fails.
  if failed port 53 protocol DNS then restart
  # Restart once the process has been missing for 3 consecutive cycles.
  if not exist for 3 cycles then restart
  # Restart limits: alert at 3 restarts in 5 cycles, stop monitoring (timeout) at 5.
  if 5 restarts within 5 cycles then timeout
  if 3 restarts within 5 cycles then alert


## memcached.conf
# memcached, supervised through its pidfile.
# Reference: http://www.alphadevx.com/a/392-Monitoring-Memcache-with-Monit
check process memcached with pidfile "/var/run/memcached.pid"
  group keystore
  start program = "/usr/sbin/service memcached start"
  stop program = "/usr/sbin/service memcached stop"
  # Restart when the MEMCACHE protocol test on 127.0.0.1:11211 fails.
  if failed host 127.0.0.1 port 11211 protocol MEMCACHE then restart
  # Restart once the process has been missing for 3 consecutive cycles.
  if not exist for 3 cycles then restart
  # Restart limits: alert at 3 restarts in 5 cycles, stop monitoring (timeout) at 5.
  if 3 restarts within 5 cycles then alert
  if 5 restarts within 5 cycles then timeout


## monit.conf
# Managed by Salt Stack, please DO NOT TOUCH, or ALL CHANGES WILL be LOST!

#
# This file should contain only what is common to EVERY node
#
#
# Ref:
#   - http://mmonit.com/monit/documentation/monit.html
#


# Host-wide resource thresholds; the check is named after the templated node name.
# Every rule below only raises an alert.
check system {{ nodename }}
  # Load average over 1 and 5 minutes.
  if loadavg (1min) > 4 then alert
  if loadavg (5min) > 2 then alert
  # Memory and swap usage.
  if memory usage > 75% then alert
  if swap usage > 25% then alert
  # CPU time split into user, system and I/O wait.
  if cpu usage (user) > 70% then alert
  if cpu usage (system) > 30% then alert
  if cpu usage (wait) > 20% then alert

# Salt minion daemon, supervised through its pidfile.
check process salt-minion with pidfile "/var/run/salt-minion.pid"
  group salt
  start program = "/usr/sbin/service salt-minion start"
  stop program = "/usr/sbin/service salt-minion stop"
  # Restart once the process has been missing for 3 consecutive cycles.
  if not exist for 3 cycles then restart
  # Stop monitoring (timeout) after 5 restarts within 5 cycles.
  if 5 restarts within 5 cycles then timeout

# Monit's embedded HTTP interface; the port is templated and defaults to 2812.
set httpd port {{ monit_port|default(2812) }} and
  use address localhost      # bind the listener to localhost only
  allow localhost            # accept connections originating from localhost
  # NOTE(review): with the listener bound to localhost, this subnet rule presumably
  # never matches a direct connection - confirm whether it is still needed.
  allow 10.10.10.0/24
  allow admin:{{ monit_pw }} # user 'admin'; the password is defined in Salt Stack
  allow @monit               # members of group 'monit' may connect (read-write)
  allow @users readonly      # members of group 'users' may connect read-only

# Outgoing mail: relay through the "mail" host of the templated top-level domain, using sslauto.
set mailserver mail.{{ tld }}
  using sslauto

# Alert mails are sent from monit@<node name>.
set mail-format { from: monit@{{ nodename }} }
# Global alert recipient; pid and ppid events are excluded.
set alert hostmaster@{{ tld }} not on { pid ppid }

## mysql.conf
# MySQL server, supervised through its pidfile.
#
# To print the configuration defaults, run:
#
#       /usr/sbin/mysqld --print-defaults
#
check process mysql with pidfile "/var/run/mysqld/mysqld.pid"
  group database
  start program = "/usr/sbin/service mysql start"
  stop program = "/usr/sbin/service mysql stop"
  # MYSQL protocol test on port 3306 of the node's first IPv4 interface (templated; falls back to 127.0.0.1).
  if failed host {{ ip4_interfaces[0]|default('127.0.0.1') }} port 3306 protocol MYSQL then restart
  # Restart once the process has been missing for 3 consecutive cycles.
  if not exist for 3 cycles then restart
  # Restart limits: alert at 3 restarts in 5 cycles, stop monitoring (timeout) at 5.
  if 5 restarts within 5 cycles then timeout
  if 3 restarts within 5 cycles then alert


## nginx.conf
# nginx web server, supervised through its pidfile; member of groups www and nginx.
check process nginx with pidfile /var/run/nginx.pid
  group www
  group nginx
  start program = "/usr/sbin/service nginx start"
  stop program = "/usr/sbin/service nginx stop"
  # HTTP probe on port 80 of the node's first IPv4 interface (templated; falls back to 127.0.0.1).
  if failed host {{ ip4_interfaces[0]|default('127.0.0.1') }} port 80 protocol HTTP then restart
  # Stop monitoring (timeout) after 5 restarts within 5 cycles; spelled "within"
  # to match the restart limits used by the other stanzas.
  if 5 restarts within 5 cycles then timeout
  # Tied to the file checks on the nginx binary and init script.
  depend nginx_bin
  depend nginx_rc

# File check on the nginx binary; nginx depends on this check.
check file nginx_bin with path /usr/sbin/nginx
  group nginx
  # The actual tests come from the shared template file.
  # NOTE(review): presumably the distribution's root-owned-binary template - confirm its contents.
  include /etc/monit/templates/rootbin

# File check on the nginx init script; nginx depends on this check.
check file nginx_rc with path /etc/init.d/nginx
  group nginx
  # The actual tests come from the shared template file.
  # NOTE(review): presumably the distribution's root-owned-binary template - confirm its contents.
  include /etc/monit/templates/rootbin

## notes-server.conf
{#
 # Expected variables and values:
 #   - hypothesis_host: '127.0.0.1'
 #   - hypothesis_port: 8000
 #   - elastic_host: '10.10.10.2'
 #   - elastic_port: 9002
 #}
# Hypothesis notes server; the process is found by matching "notes-server" rather than through a pidfile.
check process hypothesis matching "notes-server"
  start program = "/usr/sbin/service hypothesis start"
  stop program = "/usr/sbin/service hypothesis stop"
  # Restart when a TCP connection to the templated hypothesis_port fails or takes over 10 seconds.
  if failed port {{ hypothesis_port }} type TCP with timeout 10 seconds then restart
  # Stop monitoring (timeout) after 5 restarts within 5 cycles; spelled "within"
  # to match the restart limits used by the other stanzas.
  if 5 restarts within 5 cycles then timeout
# Remote Elasticsearch used by the notes server (templated elastic_host / elastic_port).
# Alert only: GET /_aliases must succeed within 10 seconds.
check host elasticsearch-remote with address "{{ elastic_host }}"
  if failed port {{ elastic_port }} protocol HTTP request "/_aliases" with timeout 10 seconds then alert

# HTTP-level availability probe of the notes server (templated hypothesis_host / hypothesis_port).
check host hypothesis-available with address "{{ hypothesis_host }}"
  start program = "/usr/sbin/service hypothesis start"
  stop program = "/usr/sbin/service hypothesis stop"
  # Restart the hypothesis service when GET /ruok fails or takes over 10 seconds.
  if failed port {{ hypothesis_port }} protocol HTTP request "/ruok" with timeout 10 seconds then restart


## nutcracker.conf
# nutcracker (twemproxy); the process is found by name matching.
#
# Ref:
#   - https://github.com/twitter/twemproxy
#
# To inspect its statistics, query the stats port 22222:
#
#    curl http://localhost:22222 | python -m json.tool
#
check process nutcracker matching "nutcracker"
  group keystore
  start program = "/usr/sbin/service nutcracker start"
  stop program = "/usr/sbin/service nutcracker stop"
  # Restart when a TCP connection to 127.0.0.1:22222 fails or takes over 2 seconds.
  if failed host 127.0.0.1 port 22222 type TCP with timeout 2 seconds then restart
  # Restart once the process has been missing for 3 consecutive cycles.
  if not exist for 3 cycles then restart
  # Restart limits: alert at 3 restarts in 5 cycles, stop monitoring (timeout) at 5.
  if 3 restarts within 5 cycles then alert
  if 5 restarts within 5 cycles then timeout


## php-fpm.conf
# PHP-FPM, supervised through its pidfile.
# Reference: http://tobias.is/blog/to-boldly-monitor-what-no-one-has-monitored-before/
check process php5-fpm with pidfile "/var/run/php5-fpm.pid"
  group php5-fpm
  # The start command is given up to 60 seconds to complete.
  start program = "/usr/sbin/service php5-fpm start" with timeout 60 seconds
  stop program = "/usr/sbin/service php5-fpm stop"
  # TCP connect test on the templated address and FPM port (defaults: 0.0.0.0 and 9000).
  # NOTE(review): the other templates read ip4_interfaces[0]; ip4_interface (singular)
  # may be a typo that always falls back to the default - confirm which variable Salt passes.
  if failed host {{ ip4_interface|default('0.0.0.0') }} port {{ fpm_port|default(9000) }} type TCP then restart
  # Restart once the process has been missing for 3 consecutive cycles.
  if not exist for 3 cycles then restart
  # Restart limits: alert at 3 restarts in 5 cycles, stop monitoring (timeout) at 5.
  if 5 restarts within 5 cycles then timeout
  if 3 restarts within 5 cycles then alert


## salt-master.conf
# Salt master daemon, supervised through its pidfile.
check process salt-master with pidfile "/var/run/salt-master.pid"
  group salt
  start program = "/usr/sbin/service salt-master start"
  stop program = "/usr/sbin/service salt-master stop"
  # Restart once the process has been missing for 3 consecutive cycles.
  if not exist for 3 cycles then restart
  # Restart limits: alert at 3 restarts in 5 cycles, stop monitoring (timeout) at 5.
  if 3 restarts within 5 cycles then alert
  if 5 restarts within 5 cycles then timeout
	check host fxa-auth-server with address "localhost"
	start program = "/usr/sbin/service fxa-auth-server start"
	stop program = "/usr/sbin/service fxa-auth-server stop"
	if failed port 9000 protocol HTTP
	request /
	with timeout 10 seconds
	then restart
	depends on nginx
	depends on fxa-content-server

	check host fxa-content-server with address "localhost"
	start program = "/usr/sbin/service fxa-content-server start"
	stop program = "/usr/sbin/service fxa-content-server stop"
	if failed port 3030 protocol HTTP
	request /
	with timeout 10 seconds
	then restart
	depends on nginx

	check host fxa-oauth-server with address "localhost"
	start program = "/usr/sbin/service fxa-oauth-server start"
	stop program = "/usr/sbin/service fxa-oauth-server stop"
	if failed port 9010 protocol HTTP
	request /
	with timeout 15 seconds
	then restart
	depends on nginx

	check host fxa-profile-server with address "localhost"
	start program = "/usr/sbin/service fxa-profile-server start"
	stop program = "/usr/sbin/service fxa-profile-server stop"
	if failed port 8081 protocol HTTP
	request /
	with timeout 10 seconds
	then restart
	depends on nginx

	# See in salt/fxa/checks.sls
	# salt/fxa/files/profile-check.sh
	check program fxa-profile-server-check with path "/srv/webplatform/auth/profile-check.sh"
	with timeout 20 seconds
	start program = "/usr/sbin/service fxa-profile-server start"
	stop program = "/usr/sbin/service fxa-profile-server stop"
	if status != 0
	then restart
	depends on fxa-profile-server
	# ref: http://mmonit.com/wiki/Monit/MonitorApacheStatus
	check process apache2
	with pidfile "/var/run/apache2/apache2.pid"
	group www
	start = "/usr/sbin/service apache2 start"
	stop = "/usr/sbin/service apache2 stop"
	if failed host 127.0.0.1 port 80
	protocol apache-status loglimit > 80% or
	dnslimit > 25% or
	waitlimit < 20%
	then restart
	if failed host {{ ip4_interfaces[0]|default('127.0.0.1') }} port 80
	protocol HTTP then restart
	if not exist for 3 cycles then restart
	if 5 restarts within 5 cycles then timeout
	if 3 restarts within 5 cycles then alert
	# Ref: https://github.com/elasticsearch/cookbook-elasticsearch/blob/master/templates/default/elasticsearch.monitrc.conf.erb
	# #TODO Improve error email, good format example see ^
	check process elasticsearch
	with pidfile "/var/run/elasticsearch.pid"
	group elasticsearch
	start = "/usr/sbin/service elasticsearch start" with timeout 60 seconds
	stop = "/usr/sbin/service elasticsearch stop"
	if cpu > 90% for 5 cycles then alert
	if totalmem > 90% for 15 cycles then alert
	if loadavg(15min) greater than 10 for 50 cycles then alert
	if not exist for 3 cycles then restart
	if 5 restarts within 5 cycles then timeout
	if 3 restarts within 5 cycles then alert

	check host elasticsearch_connection with address {{ ip4_interfaces[0]|default('0.0.0.0') }}
	if failed url http://{{ ip4_interfaces[0]|default('0.0.0.0') }}:{{ elastic_port|default(9200) }}/
	with timeout 15 seconds
	then alert
	group elasticsearch

	check host elasticsearch_cluster_health with address {{ ip4_interfaces[0]|default('0.0.0.0') }}
	if failed url http://{{ ip4_interfaces[0]|default('0.0.0.0') }}:{{ elastic_port|default(9200) }}/_cluster/health
	and content == 'green'
	with timeout 60 seconds
	then alert
	group elasticsearch
	check process exim4
	with pidfile "/var/run/exim4/exim.pid"
	group mail
	start = "/usr/sbin/service exim4 start"
	stop = "/usr/sbin/service exim4 stop"
	if failed port 25 protocol SMTP then restart
	if not exist for 3 cycles then restart
	if 3 restarts within 5 cycles then alert
	if 5 restarts within 5 cycles then timeout
	check process gdnsd
	with pidfile "/var/run/gdnsd/gdnsd.pid"
	start = "/usr/sbin/service gdnsd start"
	stop = "/usr/sbin/service gdnsd stop"
	if failed port 53 protocol DNS then restart
	if not exist for 3 cycles then restart
	if 5 restarts within 5 cycles then timeout
	if 3 restarts within 5 cycles then alert
	# http://www.alphadevx.com/a/392-Monitoring-Memcache-with-Monit
	check process memcached
	with pidfile "/var/run/memcached.pid"
	group keystore
	start = "/usr/sbin/service memcached start"
	stop = "/usr/sbin/service memcached stop"
	if failed host 127.0.0.1 port 11211 protocol MEMCACHE then restart
	if not exist for 3 cycles then restart
	if 3 restarts within 5 cycles then alert
	if 5 restarts within 5 cycles then timeout
	# Managed by Salt Stack, please DO NOT TOUCH, or ALL CHANGES WILL be LOST!

	#
	# This file should contain only what’s common for EVERY nodes
	#
	#
	# Ref:
	# - http://mmonit.com/monit/documentation/monit.html
	#



	check system {{ nodename }}
	if loadavg (1min) > 4 then alert
	if loadavg (5min) > 2 then alert
	if memory usage > 75% then alert
	if swap usage > 25% then alert
	if cpu usage (user) > 70% then alert
	if cpu usage (system) > 30% then alert
	if cpu usage (wait) > 20% then alert

	check process salt-minion
	with pidfile "/var/run/salt-minion.pid"
	group salt
	start = "/usr/sbin/service salt-minion start"
	stop = "/usr/sbin/service salt-minion stop"
	if not exist for 3 cycles then restart
	if 5 restarts within 5 cycles then timeout

	set httpd port {{ monit_port|default(2812) }} and
	use address localhost # only accept connection from localhost
	allow localhost # allow localhost to connect to the server and
	allow 10.10.10.0/24
	allow admin:{{ monit_pw }} # require user 'admin' with password that is defined in Salt Stack
	allow @monit # allow users of group 'monit' to connect (rw)
	allow @users readonly # allow users of group 'users' to connect readonly

	set mailserver mail.{{ tld }}
	using sslauto

	set mail-format { from: monit@{{ nodename }} }
	set alert hostmaster@{{ tld }} not on { pid ppid }
	# Get config defaults
	#
	# Run:
	#
	# /usr/sbin/mysqld --print-defaults
	#
	check process mysql
	with pidfile "/var/run/mysqld/mysqld.pid"
	group database
	start = "/usr/sbin/service mysql start"
	stop = "/usr/sbin/service mysql stop"
	if failed host {{ ip4_interfaces[0]|default('127.0.0.1') }} port 3306
	protocol MYSQL then restart
	if not exist for 3 cycles then restart
	if 5 restarts within 5 cycles then timeout
	if 3 restarts within 5 cycles then alert
	check process nginx
	with pidfile /var/run/nginx.pid
	group www
	group nginx
	start program = "/usr/sbin/service nginx start"
	stop program = "/usr/sbin/service nginx stop"
	if failed host {{ ip4_interfaces[0]|default('127.0.0.1') }} port 80
	protocol HTTP then restart
	if 5 restarts with 5 cycles then timeout
	depend nginx_bin
	depend nginx_rc

	check file nginx_bin with path /usr/sbin/nginx
	group nginx
	include /etc/monit/templates/rootbin

	check file nginx_rc with path /etc/init.d/nginx
	group nginx
	include /etc/monit/templates/rootbin
	{#
	# Expected variables and values:
	# - hypothesis_host: '127.0.0.1'
	# - hypothesis_port: 8000
	# - elastic_host: '10.10.10.2'
	# - elastic_port: 9002
	#}
	check process hypothesis
	matching "notes-server"
	start program = "/usr/sbin/service hypothesis start"
	stop program = "/usr/sbin/service hypothesis stop"
	if failed port {{ hypothesis_port }}
	type TCP with timeout 10 seconds
	then restart
	if 5 restarts with 5 cycles then timeout

	check host elasticsearch-remote with address "{{ elastic_host }}"
	if failed port {{ elastic_port }} protocol HTTP
	request "/_aliases"
	with timeout 10 seconds
	then alert

	check host hypothesis-available with address "{{ hypothesis_host }}"
	start program = "/usr/sbin/service hypothesis start"
	stop program = "/usr/sbin/service hypothesis stop"
	if failed port {{ hypothesis_port }}
	protocol HTTP request "/ruok" with timeout 10 seconds
	then restart