Skip to content

Instantly share code, notes, and snippets.

@Dnile
Created January 18, 2016 20:33
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save Dnile/35954815fec1baf14417 to your computer and use it in GitHub Desktop.
Save Dnile/35954815fec1baf14417 to your computer and use it in GitHub Desktop.
# HighMem: host is running out of memory.
# Used memory is computed as 100 - (free + buffers + cached) / total * 100.
# The alert fires when that stays above 80% (i.e. less than 20% of memory
# is still available) for 1 minute. The description text is kept in sync
# with the 80% threshold in the expression.
ALERT HighMem
  IF 100 - (node_memory_MemFree + node_memory_Buffers + node_memory_Cached) / node_memory_MemTotal * 100 > 80
  FOR 1m
  WITH {
    severity="page"
  }
  SUMMARY "Instance {{$labels.host}} has high memory consumption"
  DESCRIPTION "{{$labels.host}} of job {{$labels.job}} has less than 20% of memory available for more than 1 minute."
# HighLoad: pages when the 1-minute load average (node_load1) stays above 2
# for 1 minute on series whose `host` label matches the regex "prom".
ALERT HighLoad
  IF node_load1{host=~"prom"} > 2
  FOR 1m
  WITH { severity="page" }
  SUMMARY "Instance {{$labels.host}} {{$labels.instance}} has high load"
  DESCRIPTION "{{$labels.host}} of job {{$labels.job}} has high load for more than 1 minutes."
# HighCPU: pages when a host's user-mode CPU usage stays above 50% for 1 minute.
# The per-core user-mode rate is averaged over each host's own cores
# (avg by (host, mode)), so every host is normalised by its own core count.
# Dividing by a single fleet-wide count of distinct `cpu` label values would
# under-report hosts that have fewer cores than the largest host.
# NOTE(review): the aggregation keeps only the `host` and `mode` labels, so
# {{$labels.instance}} and {{$labels.job}} render empty in the texts below —
# confirm whether they should be added to the `by` clause.
ALERT HighCPU
  IF avg by (host, mode)(irate(node_cpu{mode='user'}[5m])) * 100 > 50
  FOR 1m
  WITH {
    severity="page"
  }
  SUMMARY "Instance {{$labels.host}} {{$labels.instance}} has high cpu"
  DESCRIPTION "{{$labels.host}} of job {{$labels.job}} has been consuming > 50% cpu for more than 1 minute."
# MysqlSlaveLag: seconds-behind-master (SBM) alert.
# Pages when mysql_slave_status_seconds_behind_master stays above 1
# for 1 minute.
ALERT MysqlSlaveLag
  IF mysql_slave_status_seconds_behind_master > 1
  FOR 1m
  WITH { severity="page" }
  SUMMARY "Instance {{$labels.host}} {{$labels.instance}} more than 1 seconds behind master"
  DESCRIPTION "{{$labels.host}} of job {{$labels.job}} is currently experiencing replication lag"
# ELBHighLatency: pages when the average of aws_elb_latency_average for the
# "production-mobile-lb" load balancer, multiplied by 1000, stays above 120
# for 1 minute.
# NOTE(review): the x1000 presumably converts seconds to milliseconds to match
# the "120ms" wording below — confirm the metric's unit.
ALERT ELBHighLatency
  IF avg(aws_elb_latency_average{load_balancer_name="production-mobile-lb"}) * 1000 > 120
  FOR 1m
  WITH { severity="page" }
  SUMMARY "latency > 120ms on ELB"
  DESCRIPTION "Latency is higher than 120ms for more than 1 minutes"
# LowDiskSpace: pages when a filesystem's used space,
# 100 - free / size * 100, stays above 70% (less than 30% free) for 1 minute.
# Only series from job "node_exporter" are considered; xfs filesystems are
# excluded by the fstype matcher.
ALERT LowDiskSpace
  IF 100 - node_filesystem_free{job="node_exporter", fstype!="xfs"} / node_filesystem_size{job="node_exporter", fstype!="xfs"} * 100 > 70
  FOR 1m
  WITH { severity="page" }
  SUMMARY "disk space on {{$labels.host}} {{$labels.instance}} lower than 30%"
  DESCRIPTION "disk space on {{$labels.host}} {{$labels.instance}} lower than 30%"
# ELBUnHealthyHosts: pages when the summed
# aws_elb_un_healthy_host_count_average for the "production-lb" load balancer
# stays above 0 for 1 minute.
ALERT ELBUnHealthyHosts
  IF sum(aws_elb_un_healthy_host_count_average{load_balancer_name="production-lb"}) > 0
  FOR 1m
  WITH { severity="page" }
  SUMMARY "elb has >0 unhealthy hosts"
  DESCRIPTION "elb has >0 unhealthy hosts"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment