Create a gist now

Instantly share code, notes, and snippets.

# Fires once the `up` series of a target has been 0 for five consecutive
# minutes. The instance and job labels of the offending series are
# interpolated into the annotations.
ALERT instance_down
  IF up == 0
  FOR 5m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary     = "Instance {{ $labels.instance }} down",
    description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.",
  }
# Per-instance CPU utilisation alert.
#
# The expression takes the per-second rate of the idle-mode CPU counter over
# a 5m window, averages it per instance, and converts it to a busy percentage
# (100 * (1 - idle fraction)). The alert fires when that percentage is above
# THRESHOLD_CPU.
#
# NOTE(review): THRESHOLD_CPU is not a PromQL literal; it is presumably a
# placeholder substituted before Prometheus loads this file -- confirm.
#
# rate() is used instead of irate(): irate() only considers the last two
# samples in the window, so a single spiky scrape pair could fire this alert
# (there is no FOR clause to damp it). rate() averages over the whole window.
ALERT cpu_threshold_exceeded
  IF (100 * (1 - avg by(instance)(rate(node_cpu{job='node',mode='idle'}[5m])))) > THRESHOLD_CPU
  ANNOTATIONS {
    summary     = "Instance {{ $labels.instance }} CPU usage is dangerously high",
    description = "This device's cpu usage has exceeded the threshold with a value of {{ $value }}.",
  }
# Low-available-memory alert.
#
# Sums MemFree + Cached + Buffers for job 'node', divides by 1000000, and
# fires when the result is BELOW THRESHOLD_MEM. {{ $value }} is therefore the
# amount of memory still available, not the amount in use.
#
# NOTE(review): the /1000000 scaling yields decimal megabytes only if the
# node_memory_* series are in bytes -- confirm against the exporter in use.
# NOTE(review): THRESHOLD_MEM is presumably a placeholder substituted before
# Prometheus loads this file -- confirm.
ALERT mem_threshold_exceeded
  IF (node_memory_MemFree{job='node'} + node_memory_Cached{job='node'} + node_memory_Buffers{job='node'}) / 1000000 < THRESHOLD_MEM
  ANNOTATIONS {
    summary     = "Instance {{ $labels.instance }} memory usage is dangerously high",
    description = "This device's available memory (free + cached + buffers) has dropped below the threshold; only {{ $value }} MB remain.",
  }
# Low-free-space alert for the root filesystem.
#
# Computes available / size * 100 for mountpoint '/' and fires when that
# percentage is BELOW THRESHOLD_FS. {{ $value }} is therefore the percentage
# of space still available, not the percentage used.
#
# Both sides of the division carry the same mountpoint selector. One-to-one
# vector matching already dropped non-root size series, so results are
# unchanged; the selector just makes the intent explicit.
#
# NOTE(review): THRESHOLD_FS is presumably a placeholder substituted before
# Prometheus loads this file -- confirm.
ALERT filesystem_threshold_exceeded
  IF node_filesystem_avail{job='node',mountpoint='/'} / node_filesystem_size{job='node',mountpoint='/'} * 100 < THRESHOLD_FS
  ANNOTATIONS {
    summary     = "Instance {{ $labels.instance }} filesystem usage is dangerously high",
    description = "This device's root filesystem free space has dropped below the threshold; only {{ $value }}% is available.",
  }
# High 1-minute load average alert.
#
# node_load1 is compared directly against the limit of 2. It is a gauge (it
# goes up and down), so it must not be wrapped in rate(): rate() is meant for
# counters and treats every decrease as a counter reset, so the previous
# expression never reflected the actual load average.
#
# The condition must hold for 10s before the alert fires.
ALERT node_high_loadaverage
  IF node_load1 > 2
  FOR 10s
  LABELS { severity = "warning" }
  ANNOTATIONS {
    summary     = "High load average on {{$labels.instance}}",
    description = "{{$labels.instance}} has a high load average above 10s (current value: {{$value}})"
  }
# Fires once node_httpd_count has been 0 for five consecutive minutes.
# The instance, service and job labels of the matching series are
# interpolated into the annotations.
ALERT httpd_down
  IF node_httpd_count == 0
  FOR 5m
  LABELS {
    severity = "fatal"
  }
  ANNOTATIONS {
    summary     = "Httpd down on {{$labels.instance}} for {{$labels.service}}",
    description = "Httpd on {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.",
  }
# Fires when, for a (group, service) pair, the average httpd process count
# has been above the average configured max-clients value for five minutes.
#
# Both sides are aggregated `by (group, service)`, so the resulting series
# carry only those two labels. The annotations therefore reference `group`
# and `service`; an `instance` label does not exist on the result and would
# render as an empty string.
ALERT httpd_high
  IF avg(node_httpd_count) by (group, service) > avg(node_httpd_max_clients) by (group, service)
  FOR 5m
  LABELS { severity = "warning" }
  ANNOTATIONS {
    summary     = "High httpd count in group {{$labels.group}} for {{$labels.service}}",
    description = "Group {{$labels.group}} has a high average httpd process count above 5m (current value: {{$value}})",
  }
# Fires once node_mysqld_Threads_running has been 0 for five consecutive
# minutes. The instance and job labels of the matching series are
# interpolated into the annotations.
ALERT mysqld_down
  IF node_mysqld_Threads_running == 0
  FOR 5m
  LABELS {
    severity = "fatal"
  }
  ANNOTATIONS {
    summary     = "mysqld down on {{$labels.instance}}",
    description = "mysqld on {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.",
  }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment