Prometheus alert rules for node exporter
groups:
  - name: node_exporter_alerts
    rules:
      - alert: NodeDown
        expr: up{job="monitoring-pi"} == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          title: Node {{ $labels.instance }} is down
          description: "Failed to scrape {{ $labels.job }} on {{ $labels.instance }} for more than 2 minutes. Node seems down."
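      # The job="monitoring-pi" selector matches this gist's scrape job; point it
      # at your own job name from prometheus.yml, or use plain up == 0 to cover
      # every scrape target.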
      - alert: HostOutOfMemory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of memory (instance {{ $labels.instance }})
          description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}"
      - alert: HostMemoryUnderMemoryPressure
        expr: rate(node_vmstat_pgmajfault[1m]) > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host memory under memory pressure (instance {{ $labels.instance }})
          description: "The node is under heavy memory pressure. High rate of major page faults.\n VALUE = {{ $value }}"
      - alert: HostUnusualNetworkThroughputIn
        expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput in (instance {{ $labels.instance }})
          description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}"
      - alert: HostUnusualNetworkThroughputOut
        expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput out (instance {{ $labels.instance }})
          description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}"
      - alert: HostUnusualDiskReadRate
        expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read rate (instance {{ $labels.instance }})
          description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
      - alert: HostUnusualDiskWriteRate
        expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk write rate (instance {{ $labels.instance }})
          description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
      # Please add ignored mountpoints in node_exporter parameters like
      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
      # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
      - alert: HostOutOfDiskSpace
        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of disk space (instance {{ $labels.instance }})
          description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}"
      # Please add ignored mountpoints in node_exporter parameters like
      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
      # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
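      # predict_linear() below fits a linear trend to the last 1h of samples and
      # extrapolates it 24h (24 * 3600 s) ahead; a predicted value below zero
      # means the filesystem is on track to be full within a day.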
      - alert: HostDiskWillFillIn24Hours
        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
          description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}"
      - alert: HostOutOfInodes
        expr: node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of inodes (instance {{ $labels.instance }})
          description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}"
      - alert: HostInodesWillFillIn24Hours
        expr: node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
          description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}"
      - alert: HostUnusualDiskReadLatency
        expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read latency (instance {{ $labels.instance }})
          description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}"
      - alert: HostUnusualDiskWriteLatency
        expr: rate(node_disk_write_time_seconds_total{device!~"mmcblk.+"}[1m]) / rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk write latency (instance {{ $labels.instance }})
          description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}"
      - alert: HostHighCpuLoad
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host high CPU load (instance {{ $labels.instance }})
          description: "CPU load is > 80%\n VALUE = {{ $value }}"
      - alert: HostCpuStealNoisyNeighbor
        expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
          description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}"
      # 1000 context switches is an arbitrary number.
      # The alert threshold depends on the nature of the application.
      # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
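      # The denominator counts the CPUs of each instance, so the threshold is
      # context switches per second per core.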
      - alert: HostContextSwitching
        expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host context switching (instance {{ $labels.instance }})
          description: "Context switching is growing on node (> 1000 / s)\n VALUE = {{ $value }}"
      - alert: HostSwapIsFillingUp
        expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host swap is filling up (instance {{ $labels.instance }})
          description: "Swap is filling up (> 80%)\n VALUE = {{ $value }}"
      - alert: HostSystemdServiceCrashed
        expr: node_systemd_unit_state{state="failed"} == 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host systemd service crashed (instance {{ $labels.instance }})
          description: "systemd service crashed\n VALUE = {{ $value }}"
      - alert: HostPhysicalComponentTooHot
        expr: node_hwmon_temp_celsius > 75
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host physical component too hot (instance {{ $labels.instance }})
          description: "Physical hardware component too hot\n VALUE = {{ $value }}"
      - alert: HostNodeOvertemperatureAlarm
        expr: node_hwmon_temp_crit_alarm_celsius == 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Host node overtemperature alarm (instance {{ $labels.instance }})
          description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}"
      - alert: HostRaidArrayGotInactive
        expr: node_md_state{state="inactive"} > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Host RAID array got inactive (instance {{ $labels.instance }})
          description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}"
      - alert: HostRaidDiskFailure
        expr: node_md_disks{state="failed"} > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host RAID disk failure (instance {{ $labels.instance }})
          description: "At least one device in the RAID array on {{ $labels.instance }} has failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\n VALUE = {{ $value }}"
      - alert: HostKernelVersionDeviations
        expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
        for: 6h
        labels:
          severity: warning
        annotations:
          summary: Host kernel version deviations (instance {{ $labels.instance }})
          description: "Different kernel versions are running\n VALUE = {{ $value }}"
      - alert: HostOomKillDetected
        expr: increase(node_vmstat_oom_kill[1m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host OOM kill detected (instance {{ $labels.instance }})
          description: "OOM kill detected\n VALUE = {{ $value }}"
      - alert: HostEdacCorrectableErrorsDetected
        expr: increase(node_edac_correctable_errors_total[1m]) > 0
        for: 0m
        labels:
          severity: info
        annotations:
          summary: Host EDAC correctable errors detected (instance {{ $labels.instance }})
          description: "Instance has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n VALUE = {{ $value }}"
      - alert: HostEdacUncorrectableErrorsDetected
        expr: node_edac_uncorrectable_errors_total > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host EDAC uncorrectable errors detected (instance {{ $labels.instance }})
          description: "Instance has {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC.\n VALUE = {{ $value }}"
      - alert: HostNetworkReceiveErrors
        expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host network receive errors (instance {{ $labels.instance }}:{{ $labels.device }})
          description: "Interface {{ $labels.device }} has a receive error rate above 1% over the last two minutes.\n VALUE = {{ $value }}"
      - alert: HostNetworkTransmitErrors
        expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host network transmit errors (instance {{ $labels.instance }}:{{ $labels.device }})
          description: "Interface {{ $labels.device }} has a transmit error rate above 1% over the last two minutes.\n VALUE = {{ $value }}"
      - alert: HostNetworkInterfaceSaturated
        expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Host network interface saturated (instance {{ $labels.instance }}:{{ $labels.device }})
          description: "The network interface is approaching saturation (> 80% of link speed).\n VALUE = {{ $value }}"
      - alert: HostConntrackLimit
        expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host conntrack limit (instance {{ $labels.instance }})
          description: "The number of conntrack entries is approaching the limit.\n VALUE = {{ $value }}"
      - alert: HostClockSkew
        expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host clock skew (instance {{ $labels.instance }})
          description: "Clock skew detected. Clock is out of sync.\n VALUE = {{ $value }}"
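      # node_timex_maxerror_seconds is pinned at 16 s by the kernel while the
      # clock is unsynchronised, hence the >= 16 test below.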
      - alert: HostClockNotSynchronising
        expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host clock not synchronising (instance {{ $labels.instance }})
          description: "Clock not synchronising.\n VALUE = {{ $value }}"
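Before loading the file, promtool (shipped with Prometheus) can lint it and unit-test individual alerts. A minimal sketch, assuming the rules above are saved as node_exporter_alerts.yml and referenced from prometheus.yml under rule_files; the test file name and the pi:9100 instance are placeholders:

promtool check rules node_exporter_alerts.yml

# node_exporter_alerts_test.yml
rule_files:
  - node_exporter_alerts.yml
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # up == 0 for five consecutive minutes simulates a dead target
      - series: 'up{job="monitoring-pi", instance="pi:9100"}'
        values: '0 0 0 0 0'
    alert_rule_test:
      # after 4m the "for: 2m" clause has been satisfied, so the alert fires
      - eval_time: 4m
        alertname: NodeDown
        exp_alerts:
          - exp_labels:
              severity: warning
              instance: pi:9100
              job: monitoring-pi
            exp_annotations:
              title: Node pi:9100 is down
              description: Failed to scrape monitoring-pi on pi:9100 for more than 2 minutes. Node seems down.

Run it with: promtool test rules node_exporter_alerts_test.yml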
thanks a lot
rules1m.yaml
🛠️ CPU Load Average 1m > 2
Apache Server Load > 1.40
📡 Heavy traffic OUT > 600MB/s
Apache Response Time
Heavy traffic IN > 600MB/s
🔥 CPU Temperature Router +65º
⚡ NF Conntrack > 20k
MySQL Too many connections > 60%
MySQL Slow Queries
MySQL QPS > 400
apiVersion: 1
groups:
- orgId: 1
name: rules30s
folder: rules
interval: 30s
rules:
- uid: ce0xw3r63atq8c
title: "\U0001F6E0️ CPU Load Average 1m > 2"
condition: C
data:
- refId: A
relativeTimeRange:
from: 60
to: 0
datasourceUid: de05xuoi6cav4b
model:
datasource:
type: prometheus
uid: de05xuoi6cav4b
disableTextWrap: false
editorMode: builder
expr: node_load1
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
- refId: C
relativeTimeRange:
from: 60
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 2
type: gt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
noDataState: NoData
execErrState: Error
for: 1m
annotations: {}
labels: {}
isPaused: false
notification_settings:
receiver: grafana-default-email
- uid: be0xx1u22eccge
title: Apache Server Load > 1.40
condition: C
data:
- refId: A
relativeTimeRange:
from: 60
to: 0
datasourceUid: de05xuoi6cav4b
model:
datasource:
type: prometheus
uid: de05xuoi6cav4b
disableTextWrap: false
editorMode: builder
expr: apache_load{interval="1min"}
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
- refId: C
relativeTimeRange:
from: 60
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 1.4
type: gt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
noDataState: NoData
execErrState: Error
for: 1m
annotations: {}
labels: {}
isPaused: false
notification_settings:
receiver: grafana-default-email
- uid: be0yeu47va1a8e
title: "\U0001F4E1 Mucho tráfico OUT > 600MB/s"
condition: C
data:
- refId: A
relativeTimeRange:
from: 60
to: 0
datasourceUid: de05xuoi6cav4b
model:
datasource:
type: prometheus
uid: de05xuoi6cav4b
editorMode: code
expr: sum by (instance) (rate(node_network_transmit_bytes_total[1m])) / 1024 / 1024
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
- refId: C
relativeTimeRange:
from: 60
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 60
type: gt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
noDataState: NoData
execErrState: Error
for: 1m
isPaused: false
notification_settings:
receiver: grafana-default-email
- uid: ae0yfkh2q8ohsf
title: Apache Response Time
condition: C
data:
- refId: A
relativeTimeRange:
from: 60
to: 0
datasourceUid: de05xuoi6cav4b
model:
datasource:
type: prometheus
uid: de05xuoi6cav4b
editorMode: code
exemplar: true
expr: sum(rate(apache_duration_ms_total{instance=~"localhost:9117"}[1m])) / sum(rate(apache_accesses_total{instance=~"localhost:9117"}[1m]))
format: time_series
instant: true
interval: ""
intervalFactor: 1
intervalMs: 15000
legendFormat: Time
maxDataPoints: 43200
range: false
refId: A
step: 240
- refId: C
relativeTimeRange:
from: 60
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 0.09
type: gt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
dashboardUid: Jc1gx9hVk
panelId: 1
noDataState: NoData
execErrState: Error
for: 1m
annotations:
__dashboardUid__: Jc1gx9hVk
__panelId__: "1"
isPaused: false
notification_settings:
receiver: grafana-default-email
- uid: fe0zcct15wp34f
title: Heavy traffic IN > 600MB/s
condition: C
data:
- refId: A
relativeTimeRange:
from: 60
to: 0
datasourceUid: de05xuoi6cav4b
model:
datasource:
type: prometheus
uid: de05xuoi6cav4b
editorMode: code
expr: sum by (instance) (rate(node_network_receive_bytes_total[1m])) / 1024 / 1024
instant: true
intervalMs: 10000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
- refId: C
relativeTimeRange:
from: 60
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 60
type: gt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
noDataState: NoData
execErrState: Error
for: 1m
isPaused: false
notification_settings:
receiver: grafana-default-email
- uid: ae0zgn59v0ruoe
title: "\U0001F525 CPU Temperature Router +65º"
condition: B
data:
- refId: CPU
relativeTimeRange:
from: 60
to: 0
datasourceUid: adzvde9z38cg0a
model:
datasource:
type: influxdb
uid: adzvde9z38cg0a
groupBy:
- params:
- $__interval
type: time
- params:
- "null"
type: fill
intervalMs: 10000
maxDataPoints: 43200
orderByTime: ASC
policy: default
query: "from(bucket: \"RedesZoneBUCKET\")\r\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\r\n |> filter(fn: (r) =>\r\n r._measurement == \"router_asus\" and\r\n r._field == \"temp_CPU\" \r\n )\r\n// |> drop(columns: [\"container_version\", \"engine_host\", \"host\", \"server_version\"])\r\n //|> aggregateWindow(every: 5m, fn: mean)\r\n |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)\r\n |> yield(name: \"mean\")"
refId: CPU
resultFormat: time_series
select:
- - params:
- value
type: field
- params: []
type: mean
tags: []
- refId: A
relativeTimeRange:
from: 60
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 0
- 0
type: gt
operator:
type: and
query:
params: []
reducer:
params: []
type: avg
type: query
datasource:
name: Expression
type: __expr__
uid: __expr__
expression: CPU
intervalMs: 1000
maxDataPoints: 43200
reducer: last
refId: A
type: reduce
- refId: B
relativeTimeRange:
from: 60
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 65
- 0
type: gt
operator:
type: and
query:
params: []
reducer:
params: []
type: avg
type: query
datasource:
name: Expression
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: B
type: threshold
dashboardUid: jY_JZIlGz
panelId: 37
noDataState: NoData
execErrState: Error
for: 1m
annotations:
__dashboardUid__: jY_JZIlGz
__panelId__: "37"
labels: {}
isPaused: false
notification_settings:
receiver: grafana-default-email
- uid: ce0zkl6b8piwwf
title: ⚡NF Conntrack > 20k
condition: C
data:
- refId: A
relativeTimeRange:
from: 60
to: 0
datasourceUid: de05xuoi6cav4b
model:
datasource:
type: prometheus
uid: de05xuoi6cav4b
expr: node_nf_conntrack_entries{instance="localhost:9100",job="node_exporter"}
format: time_series
interval: ""
intervalFactor: 1
intervalMs: 15000
legendFormat: NF conntrack entries
maxDataPoints: 43200
refId: A
step: 240
- refId: B
relativeTimeRange:
from: 60
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params: []
type: gt
operator:
type: and
query:
params:
- B
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
reducer: last
refId: B
type: reduce
- refId: C
relativeTimeRange:
from: 60
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 20000
type: gt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: B
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
dashboardUid: publicok
panelId: 61
noDataState: NoData
execErrState: Error
for: 1m
annotations:
__dashboardUid__: publicok
__panelId__: "61"
labels: {}
isPaused: false
notification_settings:
receiver: grafana-default-email
- uid: be11xgopyuvb4d
title: MySQL Too many connections > 60%
condition: C
data:
- refId: A
relativeTimeRange:
from: 60
to: 0
datasourceUid: de05xuoi6cav4b
model:
datasource:
type: prometheus
uid: de05xuoi6cav4b
editorMode: code
expr: max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100
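# threads_connected expressed as a percentage of max_connections (0-100),
# using the peak over the last minute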
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
- refId: C
relativeTimeRange:
from: 60
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 60
type: gt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
noDataState: NoData
execErrState: Error
for: 1m
isPaused: false
notification_settings:
receiver: grafana-default-email
- uid: fe1235gx41czke
title: MySQL Slow Queries
condition: C
data:
- refId: A
relativeTimeRange:
from: 60
to: 0
datasourceUid: de05xuoi6cav4b
model:
datasource:
type: prometheus
uid: de05xuoi6cav4b
editorMode: code
expr: increase(mysql_global_status_slow_queries[1m])
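# increase() turns the ever-growing slow-query counter into the number of
# new slow queries during the last minute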
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
- refId: C
relativeTimeRange:
from: 60
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 50
type: gt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: B
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
- refId: B
relativeTimeRange:
from: 60
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 0
- 0
type: gt
operator:
type: and
query:
params: []
reducer:
params: []
type: avg
type: query
datasource:
name: Expression
type: __expr__
uid: __expr__
expression: A
hide: false
intervalMs: 1000
maxDataPoints: 43200
reducer: last
refId: B
type: reduce
noDataState: NoData
execErrState: Error
for: 1m
isPaused: false
notification_settings:
receiver: grafana-default-email
- uid: de128sontu29sc
title: MySQL QPS > 400
condition: C
data:
- refId: A
relativeTimeRange:
from: 60
to: 0
datasourceUid: de05xuoi6cav4b
model:
datasource:
type: prometheus
uid: de05xuoi6cav4b
editorMode: code
expr: rate(mysql_global_status_questions[1m])
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
- refId: C
relativeTimeRange:
from: 60
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 400
type: gt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
noDataState: NoData
execErrState: Error
for: 1m
annotations: {}
labels: {}
isPaused: false
notification_settings:
receiver: grafana-default-email
- uid: ce12wohr1m51cd
title: Cloudflare HTTP 5xx error rate
condition: C
data:
- refId: A
relativeTimeRange:
from: 60
to: 0
datasourceUid: de05xuoi6cav4b
model:
datasource:
type: prometheus
uid: de05xuoi6cav4b
disableTextWrap: false
editorMode: code
expr: increase(cloudflare_zone_requests_status{status=~"^5.."}[5m])
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 15000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
- refId: C
relativeTimeRange:
from: 60
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 10
type: gt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
noDataState: NoData
execErrState: Error
for: 1m
annotations: {}
labels: {}
isPaused: true
notification_settings:
receiver: grafana-default-email
- uid: be12xj80cqmf4c
title: Rate limiting rules 429 Cloudflare
condition: C
data:
- refId: A
relativeTimeRange:
from: 60
to: 0
datasourceUid: de05xuoi6cav4b
model:
datasource:
type: prometheus
uid: de05xuoi6cav4b
disableTextWrap: false
editorMode: code
expr: increase(cloudflare_zone_requests_status{status="429"}[2m])
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 60000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
- refId: C
relativeTimeRange:
from: 60
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 15
type: gt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
noDataState: NoData
execErrState: Error
for: 1m
isPaused: true
notification_settings:
receiver: grafana-default-email
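Grafana can pick this file up at startup through file-based alert provisioning. A minimal sketch, assuming a standard Linux package install (the path and service name may differ on your system, and the datasource UIDs in the file must match datasources that already exist in your Grafana):

sudo cp rules1m.yaml /etc/grafana/provisioning/alerting/rules1m.yaml
sudo systemctl restart grafana-server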
Apologies, I don't see how this relates to the gist. I think Grafana has some way of sharing/publishing dashboards; maybe this belongs there.
Thank you so much!